Skip to content

Commit 8398e3a

Browse files
authored
Add special tokens for keys in prompt (databrickslabs#39)
This configures the tokenizer so that strings `### Instruction:`, `### Response:`, and `### End` are all represented by a single token ID each. This simplifies the logic for finding the response. `generate` is also now configured to stop generation at `### End`, making generation faster. The default `max_new_tokens` has been doubled to 256. The notebook now has widgets `local_training_root` and `dbfs_output_root` for configuring where data is stored locally and in DBFS. By default, if `local_training_root` is not provided, it now uses `/local_disk0` if it exists and otherwise defaults to a subdirectory of the home directory, as before.
1 parent e8c5175 commit 8398e3a

File tree

4 files changed

+120
-49
lines changed

4 files changed

+120
-49
lines changed

train_dolly.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@
5959
from training.trainer import load_training_dataset, load_tokenizer
6060

6161
dbutils.widgets.text("num_gpus", "", "num_gpus")
62+
dbutils.widgets.text("local_training_root", "", "local_training_root")
63+
dbutils.widgets.text("dbfs_output_root", "", "dbfs_output_root")
6264

6365
# COMMAND ----------
6466

@@ -75,12 +77,27 @@
7577
root_path = os.getcwd()
7678
deepspeed_config = os.path.join(root_path, "config/ds_z3_bf16_config.json")
7779

78-
local_training_root = os.path.join(os.path.expanduser('~'), "dolly_training")
80+
dolly_training_dir_name = "dolly_training"
81+
82+
# Use the local training root path if it was provided. Otherwise try to find a sensible default.
83+
local_training_root = dbutils.widgets.get("local_training_root")
84+
if not local_training_root:
85+
# Use preferred path when working in a Databricks cluster if it exists.
86+
if os.path.exists("/local_disk0"):
87+
local_training_root = os.path.join("/local_disk0", dolly_training_dir_name)
88+
# Otherwise use the home directory.
89+
else:
90+
local_training_root = os.path.join(os.path.expanduser('~'), dolly_training_dir_name)
91+
92+
dbfs_output_root = dbutils.widgets.get("dbfs_output_root")
93+
if not dbfs_output_root:
94+
dbfs_output_root = f"/dbfs/{dolly_training_dir_name}"
7995

8096
os.makedirs(local_training_root, exist_ok=True)
97+
os.makedirs(dbfs_output_root, exist_ok=True)
8198

8299
local_output_dir = os.path.join(local_training_root, checkpoint_dir_name)
83-
dbfs_output_dir = os.path.join("/dbfs/dolly_training", checkpoint_dir_name)
100+
dbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name)
84101

85102
num_gpus_flag = ""
86103
num_gpus = dbutils.widgets.get("num_gpus")

training/consts.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
DEFAULT_TRAINING_DATASET = "tatsu-lab/alpaca"
2+
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6B"
3+
RESPONSE_KEY = "### Response:"
4+
END_KEY = "### End"
5+
INSTRUCTION_KEY = "### Instruction:"
6+
RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
7+
DEFAULT_SEED = 42
8+
9+
# The format of the instruction the model has been trained on.
10+
PROMPT_FORMAT = """%s
11+
12+
%s
13+
{instruction}
14+
15+
%s""" % (
16+
"Below is an instruction that describes a task. Write a response that appropriately completes the request.",
17+
INSTRUCTION_KEY,
18+
RESPONSE_KEY_NL,
19+
)

training/generate.py

Lines changed: 56 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,17 @@
11
import logging
2-
import re
32
from typing import Tuple
43

4+
import numpy as np
55
from transformers import (
66
AutoModelForCausalLM,
77
AutoTokenizer,
88
PreTrainedModel,
99
PreTrainedTokenizer,
1010
)
1111

12-
logger = logging.getLogger(__name__)
13-
14-
# The format of the instruction the model has been trained on.
15-
INTRO = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
16-
INSTRUCTION_FORMAT = """{intro}
17-
18-
### Instruction:
19-
{instruction}
12+
from .consts import END_KEY, PROMPT_FORMAT, RESPONSE_KEY
2013

21-
### Response:
22-
"""
14+
logger = logging.getLogger(__name__)
2315

2416

2517
def load_model_tokenizer_for_generate(
@@ -40,13 +32,35 @@ def load_model_tokenizer_for_generate(
4032
return model, tokenizer
4133

4234

35+
def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
36+
"""Gets the token ID for a given string that has been added to the tokenizer as a special token.
37+
38+
When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
39+
treated specially and converted to a single, new token. This retrieves the token ID each of these keys map to.
40+
41+
Args:
42+
tokenizer (PreTrainedTokenizer): the tokenizer
43+
key (str): the key to convert to a single token
44+
45+
Raises:
46+
RuntimeError: if more than one ID was generated
47+
48+
Returns:
49+
int: the token ID for the given key
50+
"""
51+
token_ids = tokenizer.encode(key)
52+
if len(token_ids) > 1:
53+
raise RuntimeError(f"Expected only a single token for '{key}' but found {token_ids}")
54+
return token_ids[0]
55+
56+
4357
def generate_response(
4458
instruction: str,
4559
*,
4660
model: PreTrainedModel,
4761
tokenizer: PreTrainedTokenizer,
4862
do_sample: bool = True,
49-
max_new_tokens: int = 128,
63+
max_new_tokens: int = 256,
5064
top_p: float = 0.92,
5165
top_k: int = 0,
5266
**kwargs,
@@ -68,34 +82,45 @@ def generate_response(
6882
Returns:
6983
str: the generated response
7084
"""
71-
input_ids = tokenizer(
72-
INSTRUCTION_FORMAT.format(intro=INTRO, instruction=instruction), return_tensors="pt"
73-
).input_ids.to("cuda")
85+
input_ids = tokenizer(PROMPT_FORMAT.format(instruction=instruction), return_tensors="pt").input_ids.to("cuda")
86+
87+
response_key_token_id = get_special_token_id(tokenizer, RESPONSE_KEY)
88+
end_key_token_id = get_special_token_id(tokenizer, END_KEY)
7489

7590
gen_tokens = model.generate(
7691
input_ids,
7792
pad_token_id=tokenizer.pad_token_id,
93+
# Ensure generation stops once it generates "### End"
94+
eos_token_id=end_key_token_id,
7895
do_sample=do_sample,
7996
max_new_tokens=max_new_tokens,
8097
top_p=top_p,
8198
top_k=top_k,
8299
**kwargs,
83-
)
84-
decoded = tokenizer.batch_decode(gen_tokens)[0]
100+
)[0].cpu()
85101

86-
# The response appears after "### Response:". The model has been trained to append "### End" at the end.
87-
m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", decoded, flags=re.DOTALL)
102+
# The response will be set to this variable if we can identify it.
103+
decoded = None
88104

89-
response = None
90-
if m:
91-
response = m.group(1).strip()
105+
# Find where "### Response:" is first found in the generated tokens. Considering this is part of the prompt,
106+
# we should definitely find it. We will return the tokens found after this token.
107+
response_pos = None
108+
response_positions = np.where(gen_tokens == response_key_token_id)[0]
109+
if len(response_positions) == 0:
110+
logger.warn(f"Could not find response key {response_key_token_id} in: {gen_tokens}")
92111
else:
93-
# The model might not generate the "### End" sequence before reaching the max tokens. In this case, return
94-
# everything after "### Response:".
95-
m = re.search(r"#+\s*Response:\s*(.+)", decoded, flags=re.DOTALL)
96-
if m:
97-
response = m.group(1).strip()
98-
else:
99-
logger.warn(f"Failed to find response in:\n{decoded}")
100-
101-
return response
112+
response_pos = response_positions[0]
113+
114+
if response_pos:
115+
# Next find where "### End" is located. The model has been trained to end its responses with this sequence
116+
# (or actually, the token ID it maps to, since it is a special token). We may not find this token, as the
117+
# response could be truncated. If we don't find it then just return everything to the end. Note that
118+
even though we set eos_token_id, we still see this token at the end.
119+
end_pos = None
120+
end_positions = np.where(gen_tokens == end_key_token_id)[0]
121+
if len(end_positions) > 0:
122+
end_pos = end_positions[0]
123+
124+
decoded = tokenizer.decode(gen_tokens[response_pos + 1 : end_pos]).strip()
125+
126+
return decoded

training/trainer.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,26 @@
2929
set_seed,
3030
)
3131

32-
logger = logging.getLogger(__name__)
32+
from .consts import (
33+
DEFAULT_INPUT_MODEL,
34+
DEFAULT_SEED,
35+
DEFAULT_TRAINING_DATASET,
36+
END_KEY,
37+
INSTRUCTION_KEY,
38+
RESPONSE_KEY,
39+
RESPONSE_KEY_NL,
40+
)
3341

34-
DEFAULT_TRAINING_DATASET = "tatsu-lab/alpaca"
35-
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6B"
36-
RESPONSE_KEY = "### Response:\n"
37-
DEFAULT_SEED = 42
38-
MAX_LENGTH = 1024
42+
logger = logging.getLogger(__name__)
3943

4044

4145
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
4246
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
4347
batch = super().torch_call(examples)
4448

45-
response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
49+
# The prompt ends with the response key plus a newline. We encode this and then try to find it in the
50+
# sequence of tokens.
51+
response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
4652

4753
labels = batch["labels"].clone()
4854

@@ -67,7 +73,7 @@ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> D
6773
return batch
6874

6975

70-
def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH) -> dict:
76+
def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:
7177
return tokenizer(
7278
batch["text"],
7379
max_length=max_length,
@@ -81,10 +87,10 @@ def load_training_dataset(training_data_id: str = DEFAULT_TRAINING_DATASET, spli
8187
logger.info("Found %d rows", dataset.num_rows)
8288

8389
# Remove empty responses
84-
dataset = dataset.filter(lambda rec: not rec["text"].strip().endswith("### Response:"))
90+
dataset = dataset.filter(lambda rec: not rec["text"].strip().endswith(RESPONSE_KEY))
8591

8692
def _func(rec):
87-
rec["text"] += "\n\n### End"
93+
rec["text"] += f"\n\n{END_KEY}"
8894
return rec
8995

9096
dataset = dataset.map(_func)
@@ -114,15 +120,18 @@ def get_model_tokenizer(
114120
) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
115121
tokenizer = load_tokenizer(pretrained_model_name_or_path)
116122
model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)
123+
tokenizer.add_special_tokens({"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY]})
124+
model.resize_token_embeddings(len(tokenizer))
125+
117126
return model, tokenizer
118127

119128

120-
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH, seed=DEFAULT_SEED) -> Dataset:
129+
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=DEFAULT_SEED) -> Dataset:
121130
"""Loads the training dataset and tokenizes it so it is ready for training.
122131
123132
Args:
124133
tokenizer (AutoTokenizer): Tokenizer tied to the model.
125-
max_length (int, optional): Maximum number of tokens to emit from tokenizer. Defaults to MAX_INPUT_LENGTH.
134+
max_length (int): Maximum number of tokens to emit from tokenizer.
126135
127136
Returns:
128137
Dataset: HuggingFace dataset
@@ -164,7 +173,10 @@ def train(
164173

165174
model, tokenizer = get_model_tokenizer(gradient_checkpointing=gradient_checkpointing)
166175

167-
processed_dataset = preprocess_dataset(tokenizer=tokenizer, seed=seed)
176+
# Use the same max length that the model supports
177+
max_length: int = model.config.n_positions
178+
179+
processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed)
168180

169181
split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)
170182

@@ -225,9 +237,7 @@ def train(
225237

226238

227239
@click.command()
228-
@click.option(
229-
"--local-output-dir", type=str, help="Write directly to this local path", required=True
230-
)
240+
@click.option("--local-output-dir", type=str, help="Write directly to this local path", required=True)
231241
@click.option("--dbfs-output-dir", type=str, help="Sync data to this path on DBFS")
232242
@click.option("--epochs", type=int, default=3, help="Number of epochs to train for.")
233243
@click.option("--per-device-train-batch-size", type=int, default=8, help="Batch size to use for training.")

0 commit comments

Comments
 (0)