Add response generation for examples from blog post (databrickslabs#27)

matthayes · web-flow · commit af91832c6630 · 2023-03-27T22:44:13.000-07:00
This adds response generation code and runs it over the examples from the [blog post](https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html). A few of the sample outputs: **Instruction:** Write a love letter to Edgar Allan Poe. > My love for you is as eternal and enduring as the starlit nights, as deep as the ocean depths, as strong as the wind, and as sweet as the summer breeze. My heart has been yours since the first day I saw you, and it will remain yours forever. I can see you now and my heart fills with longing for our embrace, as I long to be with you forever. **Instruction:** Explain to me the difference between nuclear fission and fusion. > Nuclear fission is the splitting of a nucleus, which is a heavy atom, such as uranium, into two separate nuclei, while nuclear fusion is the joining of two nuclei to form a single nucleus, such as hydrogen and helium. Fission occurs when the nucleus absorbs a neutron, leaving a remnant of the nucleus. Fusion occurs when the two nuclei come together and form a single nucleus. Fission is a nuclear reaction and fusion is a nuclear reaction. **Instruction:** Write a tweet announcing Dolly, a large language model from Databricks. > Dolly, the world's most accurate language model, is now available on Databricks! 🎉 🎉 🎉 **Instruction:** I'm selling my Nikon D-750, write a short blurb for my ad. > I'm selling my Nikon D-750, an amazing camera. It has the latest features including 12MP CMOS sensor, 9-point autofocus and auto bracketing, and 3.0x optical zoom. It also has a cool Image Stabilizer system and a dual SD card slot. It is a great choice for photographers looking for a reliable camera that offers superior image quality and features. **Instruction:** Give me a list of 5 science fiction books I should read next. > 1. The Hitchhiker's Guide to the Galaxy by Douglas Adams > 2. Ender’s Game by Orson Scott Card > 3. 1984 by George Orwell > 4. Star Wars by George Lucas > 5. 
The Hitchhiker's Guide to the Galaxy by Douglas Adams
diff --git a/train_dolly.py b/train_dolly.py
@@ -114,3 +114,24 @@
 # MAGIC     --lr 1e-5
 
 # COMMAND ----------
+
+from training.generate import generate_response, load_model_tokenizer_for_generate
+
+model, tokenizer = load_model_tokenizer_for_generate(local_output_dir)
+
+# COMMAND ----------
+
+# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html
+instructions = [
+    "Write a love letter to Edgar Allan Poe.",
+    "Write a tweet announcing Dolly, a large language model from Databricks.",
+    "I'm selling my Nikon D-750, write a short blurb for my ad.",
+    "Explain to me the difference between nuclear fission and fusion.",
+    "Give me a list of 5 science fiction books I should read next.",
+]
+
+# Generate a response for each instruction above and print it; a falsy response (None or empty) is skipped.
+for instruction in instructions:
+    response = generate_response(instruction, model=model, tokenizer=tokenizer)
+    if response:
+        print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n")
diff --git a/training/generate.py b/training/generate.py
@@ -0,0 +1,101 @@
+import logging
+import re
+from typing import Optional, Tuple
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+
+logger = logging.getLogger(__name__)
+
+# Prompt the model has been trained on: INTRO, the instruction section, then an open "### Response:" header.
+INTRO = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+INSTRUCTION_FORMAT = """{intro}
+
+### Instruction:
+{instruction}
+
+### Response:
+"""
+
+
+def load_model_tokenizer_for_generate(
+    pretrained_model_name_or_path: str,
+) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+    """Loads the model and tokenizer so that it can be used for generating responses.
+
+    Args:
+        pretrained_model_name_or_path (str): name or path for model
+
+    Returns:
+        Tuple[PreTrainedModel, PreTrainedTokenizer]: model and tokenizer
+    """
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, padding_side="left")
+    model = AutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path, device_map="auto", trust_remote_code=True
+    )
+    return model, tokenizer
+
+
+def generate_response(
+    instruction: str,
+    *,
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    do_sample: bool = True,
+    max_new_tokens: int = 128,
+    top_p: float = 0.92,
+    top_k: int = 0,
+    **kwargs,
+) -> str:
+    """Given an instruction, uses the model and tokenizer to generate a response.  This formats the instruction in
+    the instruction format that the model was fine-tuned on.
+
+    Args:
+        instruction (str): instruction to generate response for
+        model (PreTrainedModel): model to use
+        tokenizer (PreTrainedTokenizer): tokenizer to use
+        do_sample (bool, optional): Whether or not to use sampling. Defaults to True.
+        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.
+        top_p (float, optional): If set to float < 1, only the smallest set of most probable tokens with probabilities
+            that add up to top_p or higher are kept for generation. Defaults to 0.92.
+        top_k (int, optional): The number of highest probability vocabulary tokens to keep for top-k-filtering.
+            Defaults to 0.
+
+    Returns:
+        str: the generated response
+    """
+    input_ids = tokenizer(
+        INSTRUCTION_FORMAT.format(intro=INTRO, instruction=instruction), return_tensors="pt"
+    ).input_ids.to("cuda")
+
+    gen_tokens = model.generate(
+        input_ids,
+        pad_token_id=tokenizer.pad_token_id,
+        do_sample=do_sample,
+        max_new_tokens=max_new_tokens,
+        top_p=top_p,
+        top_k=top_k,
+        **kwargs,
+    )
+    decoded = tokenizer.batch_decode(gen_tokens)[0]
+
+    # The response appears after "### Response:".  The model has been trained to append "### End" at the end.
+    m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", decoded, flags=re.DOTALL)
+
+    response = None
+    if m:
+        response = m.group(1).strip()
+    else:
+        # The model might not generate the "### End" sequence before reaching the max tokens.  In this case, return
+        # everything after "### Response:".
+        m = re.search(r"#+\s*Response:\s*(.+)", decoded, flags=re.DOTALL)
+        if m:
+            response = m.group(1).strip()
+        else:
+            logger.warn(f"Failed to find response in:\n{decoded}")
+
+    return response