Construct Multi-Turn Prompts

Llama - Multi-Prompt Input

Follow-Up Question


Remember that the model, as set up here, does not remember your previous prompts.

The second prompt below will not yield the expected outcome: because the model does not retain your previous prompts, it has no idea who “he” refers to.

from utils import llama, llama_chat

prompt = """
Help me write a birthday card for my dear friend Andrew.
Here are details about my friend:
He likes long walks on the beach and reading in the bookstore.
His hobbies include reading research papers and speaking at conferences.
His favorite color is light blue.
He likes pandas.
"""
response = llama(prompt)
print(response)

prompt_2 = """
Oh, he also likes teaching. Can you rewrite it to include that?
"""
response_2 = llama(prompt_2)
print(response_2)

So how can we carry on a multi-prompt interaction with the model like we would with a chatbot?

Look at the example below.

We have to provide the prior prompts and responses as part of the context of each new turn in the conversation.

TAGS

Note:

  1. We have to include [INST] … [/INST] tags at the beginning and end of each prompt
  2. We have to enclose each previous prompt and its response in their own <s> … </s> tags
  3. We must always have complete prompt/response pairs inside each set of <s> … </s> tags
  4. We must put the last prompt in its own <s>[INST] … [/INST] tags, with no response and no closing </s>, to inform the model that this is the prompt we want it to evaluate (see the template sketch just below)
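Putting those rules together, a multi-turn prompt follows this general pattern (the curly-brace placeholders stand in for your own text):

<s>[INST] {first prompt} [/INST] {first response} </s><s>[INST] {second prompt} [/INST] {second response} </s><s>[INST] {new prompt} [/INST]

Here is a concrete two-turn example:
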
prompt_1 = """
    What are fun activities I can do this weekend?
"""
response_1 = llama(prompt_1)

prompt_2 = """
Which of these would be good for my health?
"""

chat_prompt = f"""
<s>[INST] {prompt_1} [/INST]
{response_1}
</s>
<s>[INST] {prompt_2} [/INST]
"""
print(chat_prompt)

Then we ask for the response, setting add_inst=False since the chat prompt already contains its own [INST] tags, and print it:

response_2 = llama(chat_prompt,
                   add_inst=False,
                   verbose=True)

print(response_2)

Chat Helper Function

Instead of typing the tags in manually, we can use the llama_chat helper function, which inserts the needed tags for us. I’ve included the utils.py file at the end of this page so you can see the code for these functions.

from utils import llama_chat

prompt_1 = """
    What are fun activities I can do this weekend?
"""
response_1 = llama(prompt_1)

prompt_2 = """
Which of these would be good for my health?
"""

prompts = [prompt_1, prompt_2]
responses = [response_1]

# Pass prompts and responses to the llama_chat function.
# Set verbose=True to view the full prompt that is passed to the model.
response_2 = llama_chat(prompts,responses,verbose=True)

print(response_2)

Here is a sample that adds a third turn to the conversation:

# replace prompt_3 with your own question!
prompt_3 = "Which of these activities would be fun with friends?"
prompts = [prompt_1, prompt_2, prompt_3]
responses = [response_1, response_2]

response_3 = llama_chat(prompts, responses, verbose=True)

print(response_3)
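
If you want to keep the conversation going beyond a fixed number of turns, you can wrap the same pattern in a loop, appending each new prompt and response to the lists before the next call. This is just a minimal sketch built on the llama_chat helper above:

# start from the three turns above
prompts = [prompt_1, prompt_2, prompt_3]
responses = [response_1, response_2, response_3]

while True:
    next_prompt = input("You: ")  # press Enter on an empty line to stop
    if not next_prompt.strip():
        break
    prompts.append(next_prompt)
    next_response = llama_chat(prompts, responses)
    print(f"Llama: {next_response}")
    responses.append(next_response)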

utils.py


Here is the utils.py file that we use to define the llama() and llama_chat() functions.

import os
import warnings
import requests
import json
import time
from dotenv import load_dotenv, find_dotenv

# Initialize global variables: the Together API key (and optionally the API base URL)
# are read from a .env file.
_ = load_dotenv(find_dotenv())
# warnings.filterwarnings('ignore')

url = f"{os.getenv('DLAI_TOGETHER_API_BASE', 'https://api.together.xyz')}/inference"
headers = {
    "Authorization": f"Bearer {os.getenv('TOGETHER_API_KEY')}",
    "Content-Type": "application/json"
}


def llama(prompt, 
          add_inst=True, 
          model="togethercomputer/llama-2-7b-chat", 
          temperature=0.0, 
          max_tokens=1024,
          verbose=False,
          url=url,
          headers=headers,
          base=2, # number of seconds to wait
          max_tries=3):
    
    if add_inst:
        prompt = f"[INST]{prompt}[/INST]"

    if verbose:
        print(f"Prompt:\n{prompt}\n")
        print(f"model: {model}")

    data = {
            "model": model,
            "prompt": prompt,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

    # Allow multiple attempts to call the API in case of downtime.
    # Return the last response object to the user after max_tries failed attempts.
    wait_seconds = [base**i for i in range(max_tries)]

    response = None
    for num_tries in range(max_tries):
        try:
            response = requests.post(url, headers=headers, json=data)
            return response.json()['output']['choices'][0]['text']
        except Exception as e:
            # If the POST itself failed, response is still None; otherwise only retry on 500s.
            if response is not None and response.status_code != 500:
                return response.json()

            print(f"error message: {e}")
            print(f"response object: {response}")
            print(f"num_tries {num_tries}")
            print(f"Waiting {wait_seconds[num_tries]} seconds before automatically trying again.")
            time.sleep(wait_seconds[num_tries])
 
    print(f"Tried {max_tries} times to make API call to get a valid response object")
    print("Returning provided response")
    return response


def llama_chat(prompts, 
               responses,
               model="togethercomputer/llama-2-7b-chat", 
               temperature=0.0, 
               max_tokens=1024,
               verbose=False,
               url=url,
               headers=headers,
               base=2,
               max_tries=3
              ):

    prompt = get_prompt_chat(prompts,responses)

    # Allow multiple attempts to call the API in case of downtime.
    # Return the last response to the user after max_tries failed attempts.
    wait_seconds = [base**i for i in range(max_tries)]

    response = None
    for num_tries in range(max_tries):
        try:
            response = llama(prompt=prompt,
                             add_inst=False,
                             model=model,
                             temperature=temperature,
                             max_tokens=max_tokens,
                             verbose=verbose,
                             url=url,
                             headers=headers
                            )
            return response
        except Exception as e:
            # llama() returns plain text rather than a requests.Response,
            # so there is no status code to check here; just wait and retry.
            print(f"error message: {e}")
            print(f"num_tries {num_tries}")
            print(f"Waiting {wait_seconds[num_tries]} seconds before automatically trying again.")
            time.sleep(wait_seconds[num_tries])
 
    print(f"Tried {max_tries} times to make API call to get a valid response object")
    print("Returning provided response")
    return response


def get_prompt_chat(prompts, responses):
    # Build the Llama 2 chat format: each completed turn is wrapped in
    # <s>[INST] prompt [/INST] response </s>, and the final prompt is left
    # open so the model knows that is the one to respond to.
    prompt_chat = f"<s>[INST] {prompts[0]} [/INST]"
    for n, response in enumerate(responses):
        prompt = prompts[n + 1]
        prompt_chat += f"\n{response}\n </s><s>[INST] \n{prompt}\n [/INST]"

    return prompt_chat
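
For reference, calling get_prompt_chat directly with short sample strings (the response string below is just an illustrative placeholder, not real model output) shows the structure it builds:

prompts = ["What are fun activities I can do this weekend?",
           "Which of these would be good for my health?"]
responses = ["You could go for a hike, visit a museum, or try a new recipe."]

print(get_prompt_chat(prompts, responses))
# <s>[INST] What are fun activities I can do this weekend? [/INST]
# You could go for a hike, visit a museum, or try a new recipe.
#  </s><s>[INST] 
# Which of these would be good for my health?
#  [/INST]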