Skip to content

Commit e950ab3

Browse files
authored
Improve tokenization to work with other tokenizers (databrickslabs#40)
This addresses databrickslabs#4. The tokenizer used by bloom appears to combine the newline after `## Response:` with the following character, which does not happen with GPT-J 6b. This results in the tokens for `### Response:\n` being different when appearing in the text compared to when it is tokenized in isolation. My solution here is to change the key to `### Response:\n` so that this becomes a single token. The other fix is to try getting a different config setting for the max length, or fall back to 1024 if none can be found. I've tested this on [bloomz-7b1-mt](https://huggingface.co/bigscience/bloomz-7b1-mt) and it produces similar generation quality. It also still trains successfully using GPT-J 6B as the base model.
1 parent 8398e3a commit e950ab3

File tree

3 files changed

+18
-15
lines changed

3 files changed

+18
-15
lines changed

training/consts.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
DEFAULT_TRAINING_DATASET = "tatsu-lab/alpaca"
22
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6B"
3-
RESPONSE_KEY = "### Response:"
43
END_KEY = "### End"
54
INSTRUCTION_KEY = "### Instruction:"
6-
RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
5+
RESPONSE_KEY_NL = f"### Response:\n"
76
DEFAULT_SEED = 42
87

98
# The format of the instruction the model has been trained on.

training/generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
PreTrainedTokenizer,
1010
)
1111

12-
from .consts import END_KEY, PROMPT_FORMAT, RESPONSE_KEY
12+
from .consts import END_KEY, PROMPT_FORMAT, RESPONSE_KEY_NL
1313

1414
logger = logging.getLogger(__name__)
1515

@@ -84,7 +84,7 @@ def generate_response(
8484
"""
8585
input_ids = tokenizer(PROMPT_FORMAT.format(instruction=instruction), return_tensors="pt").input_ids.to("cuda")
8686

87-
response_key_token_id = get_special_token_id(tokenizer, RESPONSE_KEY)
87+
response_key_token_id = get_special_token_id(tokenizer, RESPONSE_KEY_NL)
8888
end_key_token_id = get_special_token_id(tokenizer, END_KEY)
8989

9090
gen_tokens = model.generate(

training/trainer.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
DEFAULT_TRAINING_DATASET,
3636
END_KEY,
3737
INSTRUCTION_KEY,
38-
RESPONSE_KEY,
3938
RESPONSE_KEY_NL,
4039
)
4140

@@ -47,7 +46,7 @@ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> D
4746
batch = super().torch_call(examples)
4847

4948
# The prompt ends with the response key plus a newline. We encode this and then try to find it in the
50-
# sequence of tokens.
49+
# sequence of tokens. This should just be a single token.
5150
response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
5251

5352
labels = batch["labels"].clone()
@@ -56,14 +55,15 @@ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> D
5655

5756
response_token_ids_start_idx = None
5857
for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
59-
if np.array_equal(response_token_ids, batch["labels"][i, idx : idx + len(response_token_ids)]):
60-
response_token_ids_start_idx = idx
61-
break
58+
response_token_ids_start_idx = idx
59+
break
6260

6361
if response_token_ids_start_idx is None:
64-
raise RuntimeError("Could not find response key token IDs")
62+
raise RuntimeError(
63+
f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
64+
)
6565

66-
response_token_ids_end_idx = response_token_ids_start_idx + len(response_token_ids)
66+
response_token_ids_end_idx = response_token_ids_start_idx + 1
6767

6868
# Make pytorch loss function ignore all tokens up through the end of the response key
6969
labels[i, :response_token_ids_end_idx] = -100
@@ -87,7 +87,8 @@ def load_training_dataset(training_data_id: str = DEFAULT_TRAINING_DATASET, spli
8787
logger.info("Found %d rows", dataset.num_rows)
8888

8989
# Remove empty responses
90-
dataset = dataset.filter(lambda rec: not rec["text"].strip().endswith(RESPONSE_KEY))
90+
response_key_stripped = RESPONSE_KEY_NL.strip()
91+
dataset = dataset.filter(lambda rec: not rec["text"].strip().endswith(response_key_stripped))
9192

9293
def _func(rec):
9394
rec["text"] += f"\n\n{END_KEY}"
@@ -102,6 +103,7 @@ def load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL) ->
102103
logger.info(f"Loading tokenizer for {pretrained_model_name_or_path}")
103104
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
104105
tokenizer.pad_token = tokenizer.eos_token
106+
tokenizer.add_special_tokens({"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]})
105107
return tokenizer
106108

107109

@@ -120,7 +122,6 @@ def get_model_tokenizer(
120122
) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
121123
tokenizer = load_tokenizer(pretrained_model_name_or_path)
122124
model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)
123-
tokenizer.add_special_tokens({"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY]})
124125
model.resize_token_embeddings(len(tokenizer))
125126

126127
return model, tokenizer
@@ -173,8 +174,11 @@ def train(
173174

174175
model, tokenizer = get_model_tokenizer(gradient_checkpointing=gradient_checkpointing)
175176

176-
# Use the same max length that the model supports
177-
max_length: int = model.config.n_positions
177+
# Use the same max length that the model supports. Try a couple different keys in case a different
178+
# model is used. The default model uses n_positions. If no config settings can be found just default
179+
# to 1024 as this is probably supported by most models.
180+
conf = model.config
181+
max_length: int = getattr(conf, "n_positions", getattr(conf, "seq_length", 1024))
178182

179183
processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed)
180184

0 commit comments

Comments
 (0)