Skip to content

Commit ea46513

Browse files
committed
Remove empty responses, append end tokens to training dataset
1 parent 03bf385 commit ea46513

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

training/trainer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,16 @@ def load_training_dataset(training_data_id: str = DEFAULT_TRAINING_DATASET, spli
7979
logger.info(f"Loading {training_data_id} dataset")
8080
dataset: Dataset = load_dataset(training_data_id)[split]
8181
logger.info("Found %d rows", dataset.num_rows)
82+
83+
# Remove records with an empty response (text that ends at the "### Response:" marker)
84+
dataset = dataset.filter(lambda rec: not rec["text"].strip().endswith("### Response:"))
85+
86+
def _func(rec):
87+
rec["text"] += "\n\n### End"
88+
return rec
89+
90+
dataset = dataset.map(_func)
91+
8292
return dataset
8393

8494

0 commit comments

Comments
 (0)