Skip to content

Commit a4049ad

Browse files
autohf (microsoft#43)
automate huggingface transformer
1 parent e031c2e commit a4049ad

29 files changed

+4316
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,5 @@ notebook/.azureml
153153
mlruns
154154
logs
155155
automl.pkl
156+
157+
.idea/*

docs/index.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ Tune
3838
:members:
3939

4040

41+
NLP
42+
------
43+
44+
.. autoclass:: flaml.nlp.AutoTransformers
45+
:members:
46+
47+
4148
.. Indices and tables
4249
.. ==================
4350

flaml/nlp/README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
How to use AutoTransformers:
2+
3+
```python
4+
from flaml.nlp.autotransformers import AutoTransformers
5+
6+
autohf = AutoTransformers()
7+
preparedata_setting = {
8+
"dataset_subdataset_name": "glue:rte",
9+
"pretrained_model_size": "electra-base-discriminator:base",
10+
"data_root_path": "data/",
11+
"max_seq_length": 128,
12+
}
13+
autohf.prepare_data(**preparedata_setting)
14+
autohf_settings = {"resources_per_trial": {"gpu": 1, "cpu": 1},
15+
"num_samples": -1, # unlimited sample size
16+
"time_budget": 3600,
17+
"ckpt_per_epoch": 1,
18+
"fp16": False,
19+
}
20+
validation_metric, analysis = \
21+
autohf.fit(**autohf_settings,)
22+
23+
```
24+
25+
The current use cases that are supported:
26+
1. A simplified version of fine-tuning the GLUE dataset using HuggingFace;
27+
2. Selecting a better search space for fine-tuning the GLUE dataset;
28+
3. Using the search algorithms in flaml for more efficient fine-tuning of HuggingFace models;
29+
30+
The use cases that may be supported in the future:
31+
1. HPO fine-tuning for text generation;
32+
2. HPO fine-tuning for question answering;

flaml/nlp/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from flaml.nlp.autotransformers import AutoTransformers
2+
from flaml.nlp.result_analysis.azure_utils import AzureUtils, JobID

flaml/nlp/autotransformers.py

Lines changed: 852 additions & 0 deletions
Large diffs are not rendered by default.

flaml/nlp/dataset/__init__.py

Whitespace-only changes.
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
from collections import OrderedDict
2+
from functools import partial
3+
4+
from transformers import AutoTokenizer
5+
from .sentence_keys_auto import get_sentence_keys
6+
7+
8+
def inserting_sepp(sent, start, end, this_tokenizer):
    """Wrap the word span ``sent[start:end]`` with the tokenizer's separator token.

    Whitespace touching the span boundary is trimmed (rstrip on the prefix,
    lstrip on the suffix) so the inserted separators are single-space delimited.
    """
    sep = this_tokenizer.sep_token
    prefix = sent[:start].rstrip()
    word = sent[start:end]
    suffix = sent[end:].lstrip()
    return " ".join([prefix, sep, word, sep, suffix])
12+
13+
14+
def tokenize_superglue_copa(this_example, this_tokenizer, dataset_name, subdataset_name=None, **kwargs):
    """Placeholder tokenizer for the SuperGLUE COPA task.

    Not implemented yet; always returns ``None``. Kept so the dispatch
    signature matches the other ``tokenize_*`` functions in this module.
    """
    return None
20+
21+
22+
def tokenize_superglue_wic_gpt2(this_example, this_tokenizer, dataset_name, subdataset_name=None, **kwargs):
    """Placeholder tokenizer for the SuperGLUE WiC task with GPT-2 style tokenizers.

    Not implemented yet; always returns ``None``. Kept so the dispatch
    signature matches the other ``tokenize_*`` functions in this module.
    """
    return None
28+
29+
30+
def tokenize_superglue_wic(this_example,
                           this_tokenizer,
                           dataset_name,
                           subdataset_name=None,
                           **kwargs
                           ):
    """
    tokenize the data from the wic task (word-in-context dataset),
    e.g., sentence 1: "There's a lot of trash on the bed of the river"
    sentence 2: "I keep a glass of water next to my bed when I sleep",
    label = False (different word senses)
    In the superglue data, the position of the word in sentence 1 and 2 are provided
    What this function does is to update the span position after tokenization, based on each LM's own tokenizer,
    The key is to insert an [SEP] before and after the original sentence, then feed it into the LM's tokenizer.
    There are two challenges:
    (1) Each LM's tokenizations are different, e.g., in XLNet's tokenizer, the paddings are on the left'
    (2) Some LM's tokenization would add an underline symbol before the word, e.g., "There's a lot"
    -> [_There, _', _s, _a, _lot]
    When underline meets special char such as '"', "'", the tokenized sequence after adding [SEP] needs to be
    aligned with the sequence tokenized without [SEP]. We use a two pointer algorithm for the alignment

    Args:
        this_example: one WiC example; must contain "sentence1"/"sentence2" and the
            character spans "start1"/"end1", "start2"/"end2" of the target word.
        this_tokenizer: a HuggingFace tokenizer instance.
        kwargs: must contain "max_seq_length" (int).

    Returns:
        The tokenizer's encoding of the (unmarked) sentence pair, with an extra
        "word_spans" entry listing the token indices of the target word in each sentence.
    """
    # Raw sentences plus the character-level span of the target word in each.
    sent1, sent2 = this_example["sentence1"], this_example["sentence2"]
    start1, end1 = this_example["start1"], this_example["end1"]
    start2, end2 = this_example["start2"], this_example["end2"]
    """
    Add [SEP] to the sentence
    """
    altered_sent1 = inserting_sepp(sent1, start1, end1, this_tokenizer)
    altered_sent2 = inserting_sepp(sent2, start2, end2, this_tokenizer)
    # Encode the [SEP]-marked pair with a generous max_length so markers survive truncation.
    input_ids_sepp = this_tokenizer(*(altered_sent1, altered_sent2),
                                    padding="max_length",
                                    max_length=1024,
                                    truncation=True)["input_ids"]
    data_pair = (sent1, sent2)
    assert "max_seq_length" in kwargs, "max_seq_length must be provided for glue"
    # Encode the original (unmarked) pair; this is the encoding actually returned.
    this_data = this_tokenizer(*data_pair, padding="max_length", max_length=kwargs["max_seq_length"], truncation=True)
    input_ids = this_data["input_ids"]
    # Counts how many [SEP] markers have been consumed so far (2 per sentence, 4 expected).
    which_sepp = 0

    """
    span_start_end: a 2x2 array:
    * (span_start_end[0][0], span_start_end[0][1]) are the spans of the position of the word in the first sentence
    * (span_start_end[1][0], span_start_end[1][1]) are the spans of the position of the word in the second sentence
    """
    span_start_end = [[-1, -1], [-1, -1]]

    ptr_sepp = ptr_nosepp = 0
    try:
        # Skip the leading pad tokens for left-padding tokenizers (e.g., XLNet).
        padding_direction = this_tokenizer.padding_side
        if padding_direction == "left":
            padding_id = input_ids_sepp[0]
            while input_ids_sepp[ptr_sepp] == padding_id:
                ptr_sepp += 1
            while input_ids[ptr_nosepp] == padding_id:
                ptr_nosepp += 1
    except KeyError:
        # NOTE(review): a missing `padding_side` attribute would raise AttributeError,
        # not KeyError — confirm this guard catches what it intends to.
        pass
    sep_id = this_tokenizer.convert_tokens_to_ids([this_tokenizer.sep_token])[0]
    """
    use two pointers to align the tokenized sequence before and after adding [SEP];
    ptr_sepp: the pointer after adding; ptr_nosepp: the pointer without adding
    """
    # NOTE(review): the `!= 0` tests assume the pad token id is 0 — confirm per tokenizer.
    while ptr_sepp < len(input_ids_sepp) and ptr_nosepp < len(input_ids) and \
            input_ids_sepp[ptr_sepp] != 0 and input_ids[ptr_nosepp] != 0:
        if input_ids_sepp[ptr_sepp] == input_ids[ptr_nosepp]:
            # Tokens agree: advance both sequences in lockstep.
            ptr_sepp += 1
            ptr_nosepp += 1
        else:
            # A mismatch is only tolerated at an inserted [SEP] or a lone underline
            # artifact from sentencepiece-style tokenization; anything else stops the scan.
            if not (input_ids_sepp[ptr_sepp] == sep_id
                    or this_tokenizer.convert_ids_to_tokens([input_ids_sepp[ptr_sepp]])[0] in ('▁', '_')):
                break
            if input_ids_sepp[ptr_sepp] == sep_id:
                # Record the position (in the unmarked sequence) of this [SEP] marker.
                # NOTE(review): if more than 4 sep ids are matched, which_sepp // 2 reaches 2
                # and this indexing raises IndexError — confirm that cannot happen.
                span_start_end[int(which_sepp / 2)][which_sepp % 2] = ptr_nosepp
                which_sepp += 1
                ptr_sepp += 1
            else:
                # Skip the stray underline token present only in the [SEP]-marked sequence.
                ptr_sepp += 1
    """
    max_word_span is the maximum tokens of the word
    It is set to 16 following deberta:
    https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/apps/tasks/superglue_tasks.py#L1054
    """
    max_word_span = 16
    word_indices = []
    for idx1 in range(2):
        if span_start_end[idx1][1] < kwargs["max_seq_length"]:
            # Token indices of the target word, zero-padded on the right to max_word_span entries.
            first_span = [x for x in range(span_start_end[idx1][0], span_start_end[idx1][1])
                          if x < kwargs["max_seq_length"]] + [0] * (max_word_span - span_start_end[idx1][1]
                                                                    + span_start_end[idx1][0])
            word_indices.append(first_span)
    # NOTE(review): word_indices may hold fewer than 2 spans if a span fell beyond
    # max_seq_length — confirm downstream consumers handle that.
    this_data["word_spans"] = word_indices
    return this_data
122+
123+
124+
def tokenize_glue(this_example,
                  this_tokenizer,
                  dataset_name,
                  subdataset_name=None,
                  **kwargs):
    """Tokenize one GLUE example.

    Looks up which column(s) of the example hold the input text for the given
    (sub)dataset, then encodes the single sentence or sentence pair with
    max-length padding and truncation. ``kwargs`` must contain "max_seq_length".
    """
    sentence_keys = get_sentence_keys(dataset_name, subdataset_name)

    first_key = sentence_keys[0]
    # Single-sentence tasks (e.g. cola, sst2) have only one key.
    second_key = sentence_keys[1] if len(sentence_keys) > 1 else None

    if second_key is None:
        data_pair = (this_example[first_key],)
    else:
        data_pair = (this_example[first_key], this_example[second_key])

    assert "max_seq_length" in kwargs, "max_seq_length must be provided for glue"
    return this_tokenizer(*data_pair, padding="max_length", max_length=kwargs["max_seq_length"], truncation=True)
143+
144+
145+
# Dispatch table: (dataset_name, subdataset_name) -> tokenization function.
# All GLUE tasks share tokenize_glue; SuperGLUE WiC needs span realignment.
TOKENIZER_MAPPING = OrderedDict(
    [(("glue", glue_task), tokenize_glue)
     for glue_task in ("rte", "mrpc", "cola", "wnli", "stsb",
                       "sst2", "mnli", "qqp", "qnli")]
    + [(("super_glue", "wic"), tokenize_superglue_wic)]
)
159+
160+
161+
class AutoEncodeText:
    """
    This is a generic input text tokenization class that will be instantiated as one of the
    tokenization classes of the library when created with the
    `~flaml.nlp.dataset.AutoEncodeText.from_model_and_dataset_name` class method.

    This class cannot be instantiated directly using ``__init__()`` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoEncodeText is designed to be instantiated "
            "using the `AutoEncodeText.from_model_and_dataset_name(cls,"
            "data_raw,model_checkpoint_path,dataset_name,subdataset_name = None,**kwargs)` methods."
        )

    @classmethod
    def from_model_and_dataset_name(cls,
                                    data_raw,
                                    model_checkpoint_path,
                                    dataset_name,
                                    subdataset_name=None,
                                    **kwargs):
        """
        Instantiate one of the input text tokenization classes from the raw data, model checkpoint path,
        dataset name and sub dataset name. The raw data is used for creating a mapping function from the
        raw tokens to the tokenized token ids.

        Args:
            data_raw:
                The raw data (a datasets.Dataset object)

            model_checkpoint_path:
                A string variable which specifies the model path, e.g., "google/electra-base-discriminator"

            dataset_name:
                A string variable which is the dataset name, e.g., "glue"

            subdataset_name:
                A string variable which is the sub dataset name, e.g., "rte"

            kwargs:
                The values in kwargs of any keys will be used for the mapping function

        Raises:
            ValueError: if (dataset_name, subdataset_name) has no registered tokenizer.

        Examples:
            >>> from datasets import load_dataset
            >>> data_raw = load_dataset("glue", "rte")
            >>> AutoEncodeText.from_model_and_dataset_name(data_raw, "google/electra-base-discriminator", "glue", "rte")

        """
        if (dataset_name, subdataset_name) in TOKENIZER_MAPPING.keys():
            this_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path, use_fast=True)
            token_func = TOKENIZER_MAPPING[(dataset_name, subdataset_name)]
            return data_raw.map(
                partial(token_func,
                        this_tokenizer=this_tokenizer,
                        dataset_name=dataset_name,
                        subdataset_name=subdataset_name,
                        **kwargs), batched=False)
        # BUG FIX: TOKENIZER_MAPPING keys are (dataset, subdataset) tuples, which have no
        # __name__ attribute — the previous message raised AttributeError while building the
        # error. Also corrected the class name in the message (was "AutoGridSearchSpace").
        raise ValueError(
            "Unrecognized dataset {},{} for this kind of AutoEncodeText: {}.\n"
            "Dataset name should be one of {}.".format(
                dataset_name, subdataset_name, cls.__name__,
                ", ".join(str(key) for key in TOKENIZER_MAPPING.keys())
            )
        )

flaml/nlp/dataset/metric_auto.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# https://github.com/huggingface/datasets/blob/master/metrics/glue/glue.py
2+
from collections import OrderedDict
3+
4+
# (metric_name, optimization_mode) pairs per task; the first pair is the default.
metric_mode_mapping_glue = {
    "cola": [("matthews_correlation", "max")],
    "mnli": [("accuracy", "max")],
    "mrpc": [("accuracy", "max"), ("f1", "max")],
    "qnli": [("accuracy", "max")],
    "qqp": [("accuracy", "max"), ("f1", "max")],
    "rte": [("accuracy", "max")],
    "sst2": [("accuracy", "max")],
    "stsb": [("pearson", "max"), ("spearmanr", "max")],
    "wnli": [("accuracy", "max")]
}

metric_mode_mapping_squad = [("exact_match", "max"), ("f1", "max")]

metric_mode_mapping_super_glue = {
    "axb": [("matthews_correlation", "max")],
    "cb": [("accuracy", "max"), ("f1", "max")],
    "copa": [("accuracy", "max")],
    "rte": [("accuracy", "max")],
    "wic": [("accuracy", "max")],
    "wsc": [("accuracy", "max")],
    "wsc.fixed": [("accuracy", "max")],
    "boolq": [("accuracy", "max")],
    "axg": [("accuracy", "max")]
}

metric_mode_mapping_imdb = [("accuracy", "max")]

metric_mode_mapping_yelp = [("accuracy", "max")]

# dataset_name -> either a dict keyed by subdataset or a flat metric list.
METRIC_MAPPING = OrderedDict(
    [
        ("squad", metric_mode_mapping_squad),
        ("glue", metric_mode_mapping_glue),
        ("super_glue", metric_mode_mapping_super_glue),
        ("imdb", metric_mode_mapping_imdb),
        ("yelp_review_full", metric_mode_mapping_yelp)
    ]
)


def get_default_and_alternative_metric(dataset_name,
                                       subdataset_name=None,
                                       custom_metric_name=None,
                                       custom_metric_mode_name=None):
    """Return the default metric/mode plus all candidate metrics/modes for a dataset.

    Args:
        dataset_name: dataset identifier, e.g., "glue".
        subdataset_name: sub dataset identifier, e.g., "rte"; required when the
            dataset's entry in METRIC_MAPPING is keyed by subdataset.
        custom_metric_name: metric name to use for datasets not in METRIC_MAPPING.
        custom_metric_mode_name: optimization mode ("max"/"min") for the custom metric.

    Returns:
        A 4-tuple (default_metric, default_mode, all_metrics, all_mode); "loss"/"min"
        is always appended as the last alternative.
    """
    if dataset_name not in METRIC_MAPPING.keys():
        assert custom_metric_name and custom_metric_mode_name, \
            "The dataset is not in {}, you must explicitly specify " \
            "the custom_metric_name and custom_metric_mode_name".format(",".join(METRIC_MAPPING.keys()))
        # BUG FIX: previously execution fell through to METRIC_MAPPING[dataset_name]
        # and raised KeyError even when the custom metric was supplied; return the
        # user-provided metric in the same 4-tuple shape instead.
        return custom_metric_name, custom_metric_mode_name, \
            [custom_metric_name, "loss"], [custom_metric_mode_name, "min"]

    eval_name_mapping = METRIC_MAPPING[dataset_name]
    if isinstance(eval_name_mapping, dict):
        # Per-subdataset mapping (glue, super_glue): resolve the sub task first.
        assert subdataset_name and subdataset_name in eval_name_mapping, \
            "dataset_name and subdataset_name not correctly specified"
        metric_list = eval_name_mapping[subdataset_name]
    else:
        assert isinstance(eval_name_mapping, list), "dataset_name and subdataset_name not correctly specified"
        metric_list = eval_name_mapping

    default_metric, default_mode = metric_list[0]
    all_metrics = [x[0] for x in metric_list] + ["loss"]
    all_mode = [x[1] for x in metric_list] + ["min"]
    return default_metric, default_mode, all_metrics, all_mode
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Column name(s) holding the input text for each GLUE sub task.
sentence_keys_glue = {
    "cola": ["sentence"],
    "mnli": ["premise", "hypothesis"],
    "mrpc": ["sentence1", "sentence2"],
    "qnli": ["sentence", "question"],
    "qqp": ["question1", "question2"],
    "rte": ["sentence1", "sentence2"],
    "sst2": ["sentence"],
    "stsb": ["sentence1", "sentence2"],
    "wnli": ["sentence1", "sentence2"]
}

# Column name(s) holding the input text for each SuperGLUE sub task.
sentence_keys_super_glue = {
    "rte": ["hypothesis", "premise"],
    "wic": ["sentence1", "sentence2"],
    "wsc": ["text"]
}


def get_sentence_keys(dataset_name, subdataset_name=None):
    """Return the list of text column names for (dataset_name, subdataset_name).

    Looks up the module-level ``sentence_keys_<dataset_name>`` table; when the
    table is keyed by subdataset, ``subdataset_name`` is required.

    Raises:
        ValueError: if no table exists for ``dataset_name``.
    """
    mapping_name = "sentence_keys_" + dataset_name
    eval_name_mapping = globals().get(mapping_name)
    if eval_name_mapping is None:
        # BUG FIX: previously an unknown dataset raised a bare KeyError from the
        # globals() lookup; raise a descriptive error instead.
        raise ValueError(
            "Unsupported dataset_name '{}': no sentence-key mapping '{}' is defined".format(
                dataset_name, mapping_name))
    if isinstance(eval_name_mapping, dict):
        assert subdataset_name and subdataset_name in eval_name_mapping, \
            "dataset_name and subdataset_name not correctly specified"
        sentence_keys = eval_name_mapping[subdataset_name]
    else:
        sentence_keys = eval_name_mapping
    return sentence_keys

0 commit comments

Comments
 (0)