{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Guidance acceleration\n", "\n", "When multiple generation or LLM-directed control flow statements are used in a single Guidance program, then we can significantly improve inference performance by maintaining a session state with the LLM inferencer and so reusing the Key/Value caches as we progress through the program. This is much faster than letting the model generate all of the structural tokens itself (for example if the structure was demonstrated using a one-shot example), and also faster than simply calling the model again without any state at each point in the Guidance program.\n", "\n", "We call this \"guidance acceleration\" and it is currently supported by local models such as `guidance.models.LlamaCpp` or `guidance.models.Transformers` as demonstrated below." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import time\n", "import torch\n", "import guidance\n", "from guidance import models, gen" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this.\n" ] } ], "source": [ "# Define a trivial string we can extend with small models easily\n", "prefix = \"Repeat this. Repeat this. \"*5 + \"Repeat this. 
Repeat this.\"\n", "print(prefix)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the model we will test" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "560ce990c6484163aa19a2f37788dfeb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model = 'mistralai/Mistral-7B-v0.1'\n", "device = 'cuda'\n", "llm_gpt2_large_gpu = guidance.models.Transformers(model, device=device) # run on an A100 for the numbers below" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run with normal token acceleration" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this." ], "text/plain": [ "
Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this." ], "text/plain": [ "
Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this. Repeat this." ], "text/plain": [ "