Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 2bc3203

Browse files
authored
Merge pull request #576 from microsoft/daden/bugfix
Daden/bugfix improvement and bug fix based on the bug bash feedback
2 parents 806d5fb + 2748b98 commit 2bc3203

9 files changed

+147
-304
lines changed

examples/text_summarization/abstractive_summarization_bertsumabs_cnndm.ipynb

Lines changed: 61 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
"metadata": {},
4747
"source": [
4848
"## Before you start\n",
49-
"Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Finetuning costs around 1.5 hours and inferecing costs around 3.5 hour. Better performance can be achieved by increasing the MAX_STEPS.\n",
49+
"\n",
50+
"It's recommended to run this notebook on GPU machines as it's very computationally intensive. Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Fine-tuning takes around 1.5 hours and inference takes around 3.5 hours. Better performance can be achieved by increasing the MAX_STEPS.\n",
5051
"\n",
5152
"* **ROUGE Evaluation**: To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](./summarization_evaluation.ipynb) for setup.\n",
5253
"\n",
@@ -92,11 +93,18 @@
9293
"if nlp_path not in sys.path:\n",
9394
" sys.path.insert(0, nlp_path)\n",
9495
"\n",
95-
"from utils_nlp.models.transformers.abstractive_summarization_bertsum import BertSumAbs, BertSumAbsProcessor\n",
96+
"from utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n",
97+
" BertSumAbs,\n",
98+
" BertSumAbsProcessor,\n",
99+
")\n",
96100
"\n",
97101
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
98102
"from utils_nlp.eval import compute_rouge_python\n",
99103
"\n",
104+
"from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
105+
"import nltk\n",
106+
"from nltk import tokenize\n",
107+
"\n",
100108
"import pandas as pd\n",
101109
"import pprint\n",
102110
"import scrapbook as sb"
@@ -139,8 +147,8 @@
139147
"outputs": [],
140148
"source": [
141149
"train_dataset, test_dataset = CNNDMSummarizationDataset(\n",
142-
" top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
143-
" )"
150+
" top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
151+
")"
144152
]
145153
},
146154
{
@@ -190,36 +198,41 @@
190198
"MAX_SOURCE_SEQ_LENGTH = 640\n",
191199
"MAX_TARGET_SEQ_LENGTH = 140\n",
192200
"\n",
193-
"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md. \n",
201+
"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md.\n",
194202
"FP16 = False\n",
195203
"if FP16:\n",
196-
" FP16_OPT_LEVEL=\"O2\"\n",
197-
" \n",
204+
" FP16_OPT_LEVEL = \"O2\"\n",
205+
"\n",
198206
"# fine-tuning parameters\n",
199207
"# batch size, unit is the number of tokens\n",
200-
"BATCH_SIZE_PER_GPU = 3\n",
208+
"BATCH_SIZE_PER_GPU = 1\n",
201209
"\n",
202210
"\n",
203211
"# GPU used for training\n",
204212
"NUM_GPUS = torch.cuda.device_count()\n",
213+
"if NUM_GPUS > 0:\n",
214+
" BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
215+
"else:\n",
216+
" BATCH_SIZE = 1\n",
217+
"\n",
205218
"\n",
206219
"# Learning rate\n",
207-
"LEARNING_RATE_BERT=5e-4/2.0\n",
208-
"LEARNING_RATE_DEC=0.05/2.0\n",
220+
"LEARNING_RATE_BERT = 5e-4 / 2.0\n",
221+
"LEARNING_RATE_DEC = 0.05 / 2.0\n",
209222
"\n",
210223
"\n",
211224
"# How often the statistics reports show up in training, unit is step.\n",
212-
"REPORT_EVERY=10\n",
213-
"SAVE_EVERY=500\n",
225+
"REPORT_EVERY = 10\n",
226+
"SAVE_EVERY = 500\n",
214227
"\n",
215228
"# total number of steps for training\n",
216-
"MAX_STEPS=1e3\n",
217-
" \n",
229+
"MAX_STEPS = 1e3\n",
230+
"\n",
218231
"if not QUICK_RUN:\n",
219-
" MAX_STEPS=5e3\n",
232+
" MAX_STEPS = 5e3\n",
220233
"\n",
221-
"WARMUP_STEPS_BERT=2000\n",
222-
"WARMUP_STEPS_DEC=1000 \n"
234+
"WARMUP_STEPS_BERT = 2000\n",
235+
"WARMUP_STEPS_DEC = 1000"
223236
]
224237
},
225238
{
@@ -253,21 +266,20 @@
253266
},
254267
"outputs": [],
255268
"source": [
256-
"\n",
257269
"summarizer.fit(\n",
258-
" train_dataset,\n",
259-
" num_gpus=NUM_GPUS,\n",
260-
" batch_size=BATCH_SIZE_PER_GPU*NUM_GPUS,\n",
261-
" max_steps=MAX_STEPS,\n",
262-
" learning_rate_bert=LEARNING_RATE_BERT,\n",
263-
" learning_rate_dec=LEARNING_RATE_DEC,\n",
264-
" warmup_steps_bert=WARMUP_STEPS_BERT,\n",
265-
" warmup_steps_dec=WARMUP_STEPS_DEC,\n",
266-
" save_every=SAVE_EVERY,\n",
267-
" report_every=REPORT_EVERY*5,\n",
268-
" fp16=FP16,\n",
269-
" # checkpoint=\"saved checkpoint path\"\n",
270-
")\n"
270+
" train_dataset,\n",
271+
" num_gpus=NUM_GPUS,\n",
272+
" batch_size=BATCH_SIZE,\n",
273+
" max_steps=MAX_STEPS,\n",
274+
" learning_rate_bert=LEARNING_RATE_BERT,\n",
275+
" learning_rate_dec=LEARNING_RATE_DEC,\n",
276+
" warmup_steps_bert=WARMUP_STEPS_BERT,\n",
277+
" warmup_steps_dec=WARMUP_STEPS_DEC,\n",
278+
" save_every=SAVE_EVERY,\n",
279+
" report_every=REPORT_EVERY * 5,\n",
280+
" fp16=FP16,\n",
281+
" # checkpoint=\"saved checkpoint path\"\n",
282+
")"
271283
]
272284
},
273285
{
@@ -327,14 +339,19 @@
327339
"TEST_TOP_N = 32\n",
328340
"if not QUICK_RUN:\n",
329341
" TEST_TOP_N = len(test_dataset)\n",
342+
"\n",
343+
"if NUM_GPUS:\n",
344+
" BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
345+
"else:\n",
346+
" BATCH_SIZE = 1\n",
330347
" \n",
331-
"shortened_dataset= test_dataset.shorten(top_n=TEST_TOP_N)\n",
348+
"shortened_dataset = test_dataset.shorten(top_n=TEST_TOP_N)\n",
332349
"src = shortened_dataset.get_source()\n",
333350
"reference_summaries = [\" \".join(t).rstrip(\"\\n\") for t in shortened_dataset.get_target()]\n",
334351
"generated_summaries = summarizer.predict(\n",
335-
" shortened_dataset, batch_size=32*4, num_gpus=NUM_GPUS\n",
352+
" shortened_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS\n",
336353
")\n",
337-
"assert len(generated_summaries) == len(reference_summaries)\n"
354+
"assert len(generated_summaries) == len(reference_summaries)"
338355
]
339356
},
340357
{
@@ -374,13 +391,6 @@
374391
"pprint.pprint(rouge_scores)"
375392
]
376393
},
377-
{
378-
"cell_type": "code",
379-
"execution_count": null,
380-
"metadata": {},
381-
"outputs": [],
382-
"source": []
383-
},
384394
{
385395
"cell_type": "code",
386396
"execution_count": null,
@@ -415,39 +425,22 @@
415425
{
416426
"cell_type": "code",
417427
"execution_count": null,
418-
"metadata": {},
428+
"metadata": {
429+
"scrolled": true
430+
},
419431
"outputs": [],
420432
"source": [
421-
"from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
422-
"import nltk\n",
423-
"from nltk import tokenize\n",
424-
"\n",
425433
"test_dataset = SummarizationDataset(\n",
426-
" None,\n",
427-
" source=[source],\n",
428-
" source_preprocessing=[tokenize.sent_tokenize],\n",
434+
" None, source=[source], source_preprocessing=[tokenize.sent_tokenize],\n",
429435
")\n",
430-
"generated_summaries = summarizer.predict(\n",
431-
" test_dataset, batch_size=1, num_gpus=1\n",
432-
")\n"
436+
"generated_summaries = summarizer.predict(test_dataset, batch_size=1, num_gpus=NUM_GPUS)"
433437
]
434438
},
435439
{
436440
"cell_type": "code",
437-
"execution_count": 26,
441+
"execution_count": null,
438442
"metadata": {},
439-
"outputs": [
440-
{
441-
"data": {
442-
"text/plain": [
443-
"'two employees bought , sold weapons on their own , company says . company fired workers , turned them in to atf , says it was identified in the feds are sold weapons , entirely genuine \" u . s . officials say they turned them two miles east - northeast of oakland , while donors are paid just $ 300 to $ 1 , 000 .'"
444-
]
445-
},
446-
"execution_count": 26,
447-
"metadata": {},
448-
"output_type": "execute_result"
449-
}
450-
],
443+
"outputs": [],
451444
"source": [
452445
"generated_summaries[0]"
453446
]
@@ -475,9 +468,9 @@
475468
"metadata": {
476469
"celltoolbar": "Tags",
477470
"kernelspec": {
478-
"display_name": "python3.6 cm3",
471+
"display_name": "Python (nlp_gpu)",
479472
"language": "python",
480-
"name": "cm3"
473+
"name": "nlp_gpu"
481474
},
482475
"language_info": {
483476
"codemirror_mode": {
@@ -494,4 +487,4 @@
494487
},
495488
"nbformat": 4,
496489
"nbformat_minor": 2
497-
}
490+
}

examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
"import time\n",
6666
"\n",
6767
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\n",
68-
"from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
68+
"from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
6969
"from utils_nlp.eval import compute_rouge_python\n",
7070
"\n",
7171
"start_time = time.time()"

examples/text_summarization/abstractive_summarization_unilm_cnndm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44

55
import torch
66

7-
from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer
7+
from utils_nlp.models.transformers.abstractive_summarization_seq2seq import (
8+
S2SAbsSumProcessor,
9+
S2SAbstractiveSummarizer
10+
)
11+
812
from utils_nlp.eval import compute_rouge_python
913

1014
parser = argparse.ArgumentParser()

examples/text_summarization/extractive_summarization_cnndm_aml_distributed.ipynb

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
"- Azure Machine Learning Workspace\n",
2626
"- Azure Machine Learning SDK\n",
2727
"\n",
28-
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
28+
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). \n",
29+
"\n",
30+
"You can run this notebook on CPU-only machines."
2931
]
3032
},
3133
{
@@ -84,7 +86,9 @@
8486
" ExtSumProcessor,\n",
8587
")\n",
8688
"# Check core SDK version number\n",
87-
"print(\"SDK version:\", azureml.core.VERSION)"
89+
"print(\"SDK version:\", azureml.core.VERSION)\n",
90+
"\n",
91+
"import pprint"
8892
]
8993
},
9094
{
@@ -106,7 +110,6 @@
106110
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modifiy to use your own\n",
107111
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
108112
"\n",
109-
"\n",
110113
"# for creating Azure ML Compute Cluster\n",
111114
"AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\" # modifiy to use your own\n",
112115
"NODE_COUNT = 2\n",
@@ -152,7 +155,7 @@
152155
"\n",
153156
"##\n",
154157
"# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\n",
155-
"TOP_N = 1000\n",
158+
"TOP_N = 100\n",
156159
"QUICK_RUN = True\n",
157160
"if not QUICK_RUN:\n",
158161
" TOP_N = -1"
@@ -293,11 +296,11 @@
293296
"outputs": [],
294297
"source": [
295298
"ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
296-
"!mkdir -p {PROJECT_FOLDER}\n",
297-
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
298-
"!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
299-
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
300-
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
299+
"os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
300+
"os.system(\"python ../../tools/generate_conda_file.py --gpu --name {}\".format(CONDA_ENV_NAME))\n",
301+
"os.system(\"cp ./nlp_gpu.yaml {}\".format(PROJECT_FOLDER))\n",
302+
"os.system(\"cp {} {}\".format(ENTRY_SCRIPT, PROJECT_FOLDER))\n",
303+
"os.system(\"cp -r ../../utils_nlp {}\".format(PROJECT_FOLDER))"
301304
]
302305
},
303306
{
@@ -397,8 +400,8 @@
397400
"metadata": {},
398401
"outputs": [],
399402
"source": [
400-
"# need to clear the local output dir as the ds.download won't download if the path exists\n",
401-
"!rm -rf {LOCAL_OUTPUT_DIR}/* "
403+
"# need to clear the local output dir as the ds.download won't download if the path exists \n",
404+
"os.system(\"rm -rf {}/*\".format(LOCAL_OUTPUT_DIR))"
402405
]
403406
},
404407
{
@@ -418,10 +421,13 @@
418421
"metadata": {},
419422
"outputs": [],
420423
"source": [
424+
"# the script uses <q> as sentence separator so it can write the prediction into the files properly\n",
425+
"# here we need to replace <q> with \"\\n\" to prepare for evaluation\n",
426+
"# removing the ending \"\\n\" is also a preparation step for evaluation.\n",
421427
"prediction = []\n",
422428
"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
423429
" for cnt, line in enumerate(filehandle):\n",
424-
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
430+
" prediction.append(line[0:-1].replace(\"<q>\", \"\\n\")) # remove the ending \"\\n\""
425431
]
426432
},
427433
{
@@ -451,7 +457,7 @@
451457
"for i in ext_sum_test:\n",
452458
" source.append(i[\"src_txt\"]) \n",
453459
" temp_target.append(\" \".join(j) for j in i['tgt']) \n",
454-
"target = [''.join(i) for i in list(temp_target)]"
460+
"target = ['\\n'.join(i) for i in list(temp_target)]"
455461
]
456462
},
457463
{
@@ -498,13 +504,13 @@
498504
"metadata": {},
499505
"outputs": [],
500506
"source": [
501-
"# processor = ExtSumProcessor()\n",
507+
"BATCH_SIZE = 32\n",
502508
"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_CACHE_DIR)\n",
503509
"summarizer.model.load_state_dict(\n",
504510
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
505511
" map_location=\"cpu\"))\n",
506512
"\n",
507-
"prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
513+
"prediction = summarizer.predict(ext_sum_test, num_gpus=torch.cuda.device_count(), batch_size=BATCH_SIZE, sentence_separator = \"\\n\")\n",
508514
"#\"\"\""
509515
]
510516
},

0 commit comments

Comments
 (0)