bo_finetuning.py
import logging
from argparse import ArgumentParser
from pathlib import Path

import torch
from bo_options import lora_target_map
from syne_tune import StoppingCriterion, Tuner, num_gpu
from syne_tune.backend import LocalBackend
from syne_tune.config_space import choice, loguniform, randint, uniform
from syne_tune.optimizer.baselines import BayesianOptimization, RandomSearch
# Model-agnostic configuration space (or search space).
# May benefit from tweaking for specific model types, including custom models.
config_space = {
    "learning-rate": loguniform(1e-4, 1e-2),
    "weight-decay": loguniform(1e-5, 1e-1),
    "adam-beta1": uniform(0.9, 0.99),
    "adam-beta2": uniform(0.9, 0.999),
    "adam-epsilon": loguniform(1e-9, 1e-6),
    "num-warmup-steps": randint(0, 10000),
    "lr-scheduler-type": choice(["linear", "cosine", "linear_with_warmup", "cosine_with_warmup"]),
    "lora-alpha": loguniform(4, 256),
    "lora-dropout": uniform(0, 0.5),
    "lora-r": randint(2, 64),
    "finetune-train-seqlen": randint(64, 1024),
    "finetune-test-seqlen": 2048,
    "finetune-train-nsamples": 8192,
    "finetune-train-batch-size": randint(1, 8),
    "wandb-project": "syne-tune-phi",
    "finetune-dataset": "alpaca",
    "ppl-eval-dataset": "alpaca",
}
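
# A hedged sketch (not part of the original script): for a specific model family,
# the search space above could be narrowed before tuning. The override below is
# purely illustrative; keys must match those consumed by run_finetuning.py.
# Constant entries (e.g. "finetune-test-seqlen") are passed unchanged to every trial.
#
# config_space.update(
#     {
#         "lora-r": randint(8, 32),  # e.g. smaller adapter ranks for small models
#         "learning-rate": loguniform(5e-5, 5e-3),
#     }
# )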

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    # Temporary fix to be able to use syne tune on AMD GPUs.
    # Can be removed once syne tune supports ROCm.
    num_gpu._num_gpus = torch.cuda.device_count()
    logging.info(f"Number of available cuda devices for syne tune: {num_gpu._num_gpus}")
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-125m",
        help="Model to fine-tune",
    )
    path_group = parser.add_mutually_exclusive_group()
    path_group.add_argument(
        "--model-path",
        type=str,
        default=None,
        help="Path to load the model and tokenizer from (required for local models, not required for HF models)",
    )
    path_group.add_argument(
        "--sliced-model-path",
        type=str,
        help="Path to load the model to fine-tune (sliced) and tokenizer from",
        default=None,
    )
    parser.add_argument(
        "--sparsity", type=float, default=0.0, help="A measure of how much slicing is applied (in the range [0, 1))"
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=(
            "RS",
            "BO",
        ),
        default="RS",
    )
    parser.add_argument(
        "--random-seed",
        type=int,
        default=42,
    )
    parser.add_argument(
        "--n-workers",
        type=int,
        default=4,
    )
    parser.add_argument(
        "--max-wallclock-time",
        type=int,
        default=3600,
    )
    parser.add_argument(
        "--experiment-tag",
        type=str,
        default="bo-finetune",
    )
    args, _ = parser.parse_known_args()
    train_file = "run_finetuning.py"
    entry_point = Path(__file__).parent / train_file
    mode = "min"
    metric = "ppl"

    # Local backend: responsible for scheduling trials.
    # The local backend runs trials as sub-processes on a single instance.
    trial_backend = LocalBackend(entry_point=str(entry_point))
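
    # For reference: each trial is a separate subprocess running the entry point,
    # with the sampled configuration passed as command-line flags, roughly
    # (illustrative; the exact flags depend on run_finetuning.py):
    #   python run_finetuning.py --learning-rate 3e-4 --lora-r 16 ...
    # The trial script is expected to report the tuned metric ("ppl") back to the
    # tuner, typically via syne_tune's Reporter.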

    # Common scheduler kwargs
    method_kwargs = dict(
        metric=metric,
        mode=mode,
        random_seed=args.random_seed,
        search_options={"num_init_random": args.n_workers + 2},
    )
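    # Note: num_init_random is the number of configurations Bayesian optimization
    # samples at random before it starts fitting a surrogate model; n_workers + 2
    # keeps all parallel workers busy during this warm-up phase.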

    # Add model-specific config options such as model type, model path and layers to fine-tune
    config_space['model'] = args.model
    if args.model_path:
        config_space['model-path'] = args.model_path
    if args.sliced_model_path:
        config_space['sliced-model-path'] = args.sliced_model_path
    config_space['sparsity'] = args.sparsity
    config_space['lora-target-option'] = choice(list(lora_target_map(args.model).keys()))

    if args.method == "RS":
        scheduler = RandomSearch(config_space, **method_kwargs)
    elif args.method == "BO":
        scheduler = BayesianOptimization(config_space, **method_kwargs)
    else:
        raise ValueError(f"Unknown method: {args.method}")

    # Stopping criterion: we stop after `args.max_wallclock_time` seconds
    stop_criterion = StoppingCriterion(max_wallclock_time=args.max_wallclock_time)
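    # Hedged aside: StoppingCriterion supports other budgets as well, e.g. a cap
    # on completed trials instead of wallclock time:
    # stop_criterion = StoppingCriterion(max_num_trials_completed=50)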

    tuner = Tuner(
        trial_backend=trial_backend,
        scheduler=scheduler,
        stop_criterion=stop_criterion,
        n_workers=args.n_workers,
        tuner_name=args.experiment_tag,
        metadata={
            "seed": args.random_seed,
            "algorithm": args.method,
            "tag": args.experiment_tag,
        },
    )
    tuner.run()
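
# Example usage (illustrative):
#   python bo_finetuning.py --model facebook/opt-125m --method BO --n-workers 4
# Results can then be inspected with syne_tune's experiment utilities, e.g.:
#   from syne_tune.experiments import load_experiment
#   load_experiment(tuner.name).best_config()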