Skip to content

Commit af91832

Browse files
authored
Add response generation for examples from blog post (databrickslabs#27)
This adds response generation code and runs it over the examples from the [blog post](https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html). A few of the sample outputs: **Instruction:** Write a love letter to Edgar Allan Poe. > My love for you is as eternal and enduring as the starlit nights, as deep as the ocean depths, as strong as the wind, and as sweet as the summer breeze. My heart has been yours since the first day I saw you, and it will remain yours forever. I can see you now and my heart fills with longing for our embrace, as I long to be with you forever. **Instruction:** Explain to me the difference between nuclear fission and fusion. > Nuclear fission is the splitting of a nucleus, which is a heavy atom, such as uranium, into two separate nuclei, while nuclear fusion is the joining of two nuclei to form a single nucleus, such as hydrogen and helium. Fission occurs when the nucleus absorbs a neutron, leaving a remnant of the nucleus. Fusion occurs when the two nuclei come together and form a single nucleus. Fission is a nuclear reaction and fusion is a nuclear reaction. **Instruction:** Write a tweet announcing Dolly, a large language model from Databricks. > Dolly, the world's most accurate language model, is now available on Databricks! 🎉 🎉 🎉 **Instruction:** I'm selling my Nikon D-750, write a short blurb for my ad. > I'm selling my Nikon D-750, an amazing camera. It has the latest features including 12MP CMOS sensor, 9-point autofocus and auto bracketing, and 3.0x optical zoom. It also has a cool Image Stabilizer system and a dual SD card slot. It is a great choice for photographers looking for a reliable camera that offers superior image quality and features. **Instruction:** Give me a list of 5 science fiction books I should read next. > 1. The Hitchhiker's Guide to the Galaxy by Douglas Adams > 2. Ender’s Game by Orson Scott Card > 3. 1984 by George Orwell > 4. Star Wars by George Lucas > 5. 
The Hitchhiker's Guide to the Galaxy by Douglas Adams
1 parent f0de3ce commit af91832

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

train_dolly.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,24 @@
114114
# MAGIC --lr 1e-5
115115

116116
# COMMAND ----------
117+
118+
from training.generate import generate_response, load_model_tokenizer_for_generate
119+
120+
model, tokenizer = load_model_tokenizer_for_generate(local_output_dir)

# COMMAND ----------

# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html
instructions = [
    "Write a love letter to Edgar Allan Poe.",
    "Write a tweet announcing Dolly, a large language model from Databricks.",
    "I'm selling my Nikon D-750, write a short blurb for my ad.",
    "Explain to me the difference between nuclear fission and fusion.",
    "Give me a list of 5 science fiction books I should read next.",
]

# Run every example instruction through the loaded model and print each
# instruction together with its generated response.
for instruction in instructions:
    response = generate_response(instruction, model=model, tokenizer=tokenizer)
    # generate_response may yield nothing usable (None or empty); skip those.
    if not response:
        continue
    print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n")

training/generate.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import logging
import re
from typing import Optional, Tuple

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
11+
12+
logger = logging.getLogger(__name__)
13+
14+
# The format of the instruction the model has been trained on.
15+
INTRO = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
16+
INSTRUCTION_FORMAT = """{intro}
17+
18+
### Instruction:
19+
{instruction}
20+
21+
### Response:
22+
"""
23+
24+
25+
def load_model_tokenizer_for_generate(
    pretrained_model_name_or_path: str,
) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load a causal language model and its tokenizer for response generation.

    The tokenizer is created with ``padding_side="left"``; the model is created
    with ``device_map="auto"`` and ``trust_remote_code=True``.

    Args:
        pretrained_model_name_or_path (str): model name or path, passed
            unchanged to both ``from_pretrained`` calls

    Returns:
        Tuple[PreTrainedModel, PreTrainedTokenizer]: ``(model, tokenizer)``,
            in that order
    """
    source = pretrained_model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(source, padding_side="left")

    # NOTE(review): trust_remote_code=True presumably allows code shipped with
    # the checkpoint to run -- only point this at trusted model sources.
    model_options = {"device_map": "auto", "trust_remote_code": True}
    model = AutoModelForCausalLM.from_pretrained(source, **model_options)

    return model, tokenizer
41+
42+
43+
def generate_response(
    instruction: str,
    *,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    do_sample: bool = True,
    max_new_tokens: int = 128,
    top_p: float = 0.92,
    top_k: int = 0,
    **kwargs,
) -> Optional[str]:
    """Given an instruction, uses the model and tokenizer to generate a response.  This formats the instruction in
    the instruction format that the model was fine-tuned on.

    Args:
        instruction (str): instruction to generate response for
        model (PreTrainedModel): model to use
        tokenizer (PreTrainedTokenizer): tokenizer to use
        do_sample (bool, optional): Whether or not to use sampling. Defaults to True.
        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.
        top_p (float, optional): If set to float < 1, only the smallest set of most probable tokens with probabilities
            that add up to top_p or higher are kept for generation. Defaults to 0.92.
        top_k (int, optional): The number of highest probability vocabulary tokens to keep for top-k-filtering.
            Defaults to 0.
        **kwargs: any additional keyword arguments are forwarded unchanged to ``model.generate``.

    Returns:
        Optional[str]: the generated response with surrounding whitespace stripped, or None if no
            "### Response:" section could be found in the decoded output (a warning is logged in that case).
    """
    prompt = INSTRUCTION_FORMAT.format(intro=INTRO, instruction=instruction)

    # Put the prompt on the model's own device rather than assuming CUDA: the model may have been loaded
    # onto a CPU or a different accelerator.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    gen_tokens = model.generate(
        input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )

    # Only a single prompt is submitted, so only the first decoded sequence is relevant.  The decoded text
    # is searched for the "### Response:" marker that the prompt template ends with.
    decoded = tokenizer.batch_decode(gen_tokens)[0]

    # The response appears after "### Response:".  The model has been trained to append "### End" at the end.
    m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", decoded, flags=re.DOTALL)
    if m:
        return m.group(1).strip()

    # The model might not generate the "### End" sequence before reaching the max tokens.  In this case, return
    # everything after "### Response:".
    m = re.search(r"#+\s*Response:\s*(.+)", decoded, flags=re.DOTALL)
    if m:
        return m.group(1).strip()

    # logger.warn is a deprecated alias; use logger.warning with lazy formatting (same rendered message).
    logger.warning("Failed to find response in:\n%s", decoded)
    return None

0 commit comments

Comments
 (0)