Skip to content

Commit eb6a091

Browse files
authored
Merge pull request microsoft#340 from Derek-Wds/main
Support resuming recorder
2 parents 0e0a318 + 005d752 commit eb6a091

File tree

3 files changed

+102
-53
lines changed

3 files changed

+102
-53
lines changed

qlib/workflow/__init__.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,27 @@ def __repr__(self):
2222

2323
@contextmanager
2424
def start(
25-
self, experiment_name: Optional[Text] = None, recorder_name: Optional[Text] = None, uri: Optional[Text] = None
25+
self,
26+
experiment_name: Optional[Text] = None,
27+
recorder_name: Optional[Text] = None,
28+
uri: Optional[Text] = None,
29+
resume: bool = False,
2630
):
2731
"""
2832
Method to start an experiment. This method can only be called within a Python `with` statement. Here is the example code:
2933
3034
.. code-block:: Python
3135
36+
# start new experiment and recorder
3237
with R.start('test', 'recorder_1'):
3338
model.fit(dataset)
3439
R.log...
3540
... # further operations
3641
42+
# resume previous experiment and recorder
43+
with R.start('test', 'recorder_1', resume=True): # if users want to resume recorder, they have to specify the exact same name for experiment and recorder.
44+
... # further operations
45+
3746
Parameters
3847
----------
3948
experiment_name : str
@@ -45,16 +54,18 @@ def start(
4554
The default uri is set in the qlib.config. Note that this uri argument will not change the one defined in the config file.
4655
Therefore, the next time when users call this function in the same experiment,
4756
they have to also specify this argument with the same value. Otherwise, inconsistent uri may occur.
57+
resume : bool
58+
whether to resume the specific recorder with given name under the given experiment.
4859
"""
49-
run = self.start_exp(experiment_name, recorder_name, uri)
60+
run = self.start_exp(experiment_name, recorder_name, uri, resume)
5061
try:
5162
yield run
5263
except Exception as e:
5364
self.end_exp(Recorder.STATUS_FA) # end the experiment if something went wrong
5465
raise e
5566
self.end_exp(Recorder.STATUS_FI)
5667

57-
def start_exp(self, experiment_name=None, recorder_name=None, uri=None):
68+
def start_exp(self, experiment_name=None, recorder_name=None, uri=None, resume=False):
5869
"""
5970
Lower level method for starting an experiment. When using this method, one should end the experiment manually
6071
and the status of the recorder may not be handled properly. Here is the example code:
@@ -75,12 +86,14 @@ def start_exp(self, experiment_name=None, recorder_name=None, uri=None):
7586
uri : str
7687
the tracking uri of the experiment, where all the artifacts/metrics etc. will be stored.
7788
The default uri is set in the qlib.config.
89+
resume : bool
90+
whether to resume the specific recorder with given name under the given experiment.
7891
7992
Returns
8093
-------
8194
An experiment instance being started.
8295
"""
83-
return self.exp_manager.start_exp(experiment_name, recorder_name, uri)
96+
return self.exp_manager.start_exp(experiment_name, recorder_name, uri, resume)
8497

8598
def end_exp(self, recorder_status=Recorder.STATUS_FI):
8699
"""

qlib/workflow/exp.py

Lines changed: 68 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ def info(self):
3939
output["recorders"] = list(recorders.keys())
4040
return output
4141

42-
def start(self, recorder_name=None):
42+
def start(self, recorder_name=None, resume=False):
4343
"""
4444
Start the experiment and set it to be active. This method will also start a new recorder.
4545
4646
Parameters
4747
----------
4848
recorder_name : str
4949
the name of the recorder to be created.
50+
resume : bool
51+
whether to resume the existing recorder with the given name instead of creating a new one
5052
5153
Returns
5254
-------
@@ -149,7 +151,57 @@ def get_recorder(self, recorder_id=None, recorder_name=None, create: bool = True
149151
-------
150152
A recorder object.
151153
"""
152-
raise NotImplementedError(f"Please implement the `get_recorder` method.")
154+
# special case of getting the recorder
155+
if recorder_id is None and recorder_name is None:
156+
if self.active_recorder is not None:
157+
return self.active_recorder
158+
recorder_name = self._default_rec_name
159+
if create:
160+
recorder, is_new = self._get_or_create_rec(recorder_id=recorder_id, recorder_name=recorder_name)
161+
else:
162+
recorder, is_new = self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False
163+
if is_new:
164+
self.active_recorder = recorder
165+
# start the recorder
166+
self.active_recorder.start_run()
167+
return recorder
168+
169+
def _get_or_create_rec(self, recorder_id=None, recorder_name=None) -> (object, bool):
170+
"""
171+
Method for getting or creating a recorder. It will try to first get a valid recorder, if exception occurs, it will
172+
automatically create a new recorder based on the given id and name.
173+
"""
174+
try:
175+
if recorder_id is None and recorder_name is None:
176+
recorder_name = self._default_rec_name
177+
return self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False
178+
except ValueError:
179+
if recorder_name is None:
180+
recorder_name = self._default_rec_name
181+
logger.info(f"No valid recorder found. Create a new recorder with name {recorder_name}.")
182+
return self.create_recorder(recorder_name), True
183+
184+
def _get_recorder(self, recorder_id=None, recorder_name=None):
185+
"""
186+
Get specific recorder by name or id. If it does not exist, raise ValueError
187+
188+
Parameters
189+
----------
190+
recorder_id :
191+
The id of recorder
192+
recorder_name :
193+
The name of recorder
194+
195+
Returns
196+
-------
197+
Recorder:
198+
The searched recorder
199+
200+
Raises
201+
------
202+
ValueError
203+
"""
204+
raise NotImplementedError(f"Please implement the `_get_recorder` method")
153205

154206
def list_recorders(self):
155207
"""
@@ -178,12 +230,20 @@ def __init__(self, id, name, uri):
178230
def __repr__(self):
179231
return "{name}(id={id}, info={info})".format(name=self.__class__.__name__, id=self.id, info=self.info)
180232

181-
def start(self, recorder_name=None):
233+
def start(self, recorder_name=None, resume=False):
182234
logger.info(f"Experiment {self.id} starts running ...")
183-
# set up recorder
184-
recorder = self.create_recorder(recorder_name)
235+
# Get or create recorder
236+
if recorder_name is None:
237+
recorder_name = self._default_rec_name
238+
# resume the recorder
239+
if resume:
240+
recorder, _ = self._get_or_create_rec(recorder_name=recorder_name)
241+
# create a new recorder
242+
else:
243+
recorder = self.create_recorder(recorder_name)
244+
# Set up active recorder
185245
self.active_recorder = recorder
186-
# start the recorder
246+
# Start the recorder
187247
self.active_recorder.start_run()
188248

189249
return self.active_recorder
@@ -200,35 +260,6 @@ def create_recorder(self, recorder_name=None):
200260

201261
return recorder
202262

203-
def get_recorder(self, recorder_id=None, recorder_name=None, create=True):
204-
# special case of getting the recorder
205-
if recorder_id is None and recorder_name is None:
206-
if self.active_recorder is not None:
207-
return self.active_recorder
208-
recorder_name = self._default_rec_name
209-
if create:
210-
recorder, is_new = self._get_or_create_rec(recorder_id=recorder_id, recorder_name=recorder_name)
211-
else:
212-
recorder, is_new = self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False
213-
if is_new:
214-
self.active_recorder = recorder
215-
# start the recorder
216-
self.active_recorder.start_run()
217-
return recorder
218-
219-
def _get_or_create_rec(self, recorder_id=None, recorder_name=None) -> (object, bool):
220-
"""
221-
Method for getting or creating a recorder. It will try to first get a valid recorder, if exception occurs, it will
222-
automatically create a new recorder based on the given id and name.
223-
"""
224-
try:
225-
return self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False
226-
except ValueError:
227-
if recorder_name is None:
228-
recorder_name = self._default_rec_name
229-
logger.info(f"No valid recorder found. Create a new recorder with name {recorder_name}.")
230-
return self.create_recorder(recorder_name), True
231-
232263
def _get_recorder(self, recorder_id=None, recorder_name=None):
233264
"""
234265
Method for getting or creating a recorder. It will try to first get a valid recorder, if exception occurs, it will
@@ -246,7 +277,7 @@ def _get_recorder(self, recorder_id=None, recorder_name=None):
246277
raise ValueError("No valid recorder has been found, please make sure the input recorder id is correct.")
247278
elif recorder_name is not None:
248279
logger.warning(
249-
f"Please make sure the recorder name {recorder_name} is unique, we will only return the first recorder if there exist several matched the given name."
280+
f"Please make sure the recorder name {recorder_name} is unique, we will only return the latest recorder if there exist several matched the given name."
250281
)
251282
recorders = self.list_recorders()
252283
for rid in recorders:
@@ -280,7 +311,7 @@ def delete_recorder(self, recorder_id=None, recorder_name=None):
280311
UNLIMITED = 50000 # FIXME: Mlflow can only list 50000 records at most!!!!!!!
281312

282313
def list_recorders(self, max_results=UNLIMITED):
283-
runs = self._client.search_runs(self.id, run_view_type=ViewType.ACTIVE_ONLY, max_results=max_results)[::-1]
314+
runs = self._client.search_runs(self.id, run_view_type=ViewType.ACTIVE_ONLY, max_results=max_results)
284315
recorders = dict()
285316
for i in range(len(runs)):
286317
recorder = MLflowRecorder(self.id, self._uri, mlflow_run=runs[i])

qlib/workflow/expm.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class ExpManager:
2525

2626
def __init__(self, uri: Text, default_exp_name: Optional[Text]):
2727
self._current_uri = uri
28-
self.default_exp_name = default_exp_name
28+
self._default_exp_name = default_exp_name
2929
self.active_experiment = None # only one experiment can active each time
3030

3131
def __repr__(self):
@@ -36,6 +36,7 @@ def start_exp(
3636
experiment_name: Optional[Text] = None,
3737
recorder_name: Optional[Text] = None,
3838
uri: Optional[Text] = None,
39+
resume: bool = False,
3940
**kwargs,
4041
):
4142
"""
@@ -50,6 +51,8 @@ def start_exp(
5051
name of the recorder to be started.
5152
uri : str
5253
the current tracking URI.
54+
resume : bool
55+
whether to resume the experiment and recorder.
5356
5457
Returns
5558
-------
@@ -151,9 +154,7 @@ def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True)
151154
if self.active_experiment is not None:
152155
return self.active_experiment
153156
# User don't want get active code now.
154-
# Don't assume underlying code could handle the case of two None
155-
if experiment_id is None and experiment_name is None:
156-
experiment_name = self.default_exp_name
157+
experiment_name = self._default_exp_name
157158

158159
if create:
159160
exp, is_new = self._get_or_create_exp(experiment_id=experiment_id, experiment_name=experiment_name)
@@ -171,25 +172,23 @@ def _get_or_create_exp(self, experiment_id=None, experiment_name=None) -> (objec
171172
automatically create a new experiment based on the given id and name.
172173
"""
173174
try:
174-
if experiment_id is None and experiment_name is None:
175-
experiment_name = self.default_exp_name
176175
return self._get_exp(experiment_id=experiment_id, experiment_name=experiment_name), False
177176
except ValueError:
178177
if experiment_name is None:
179-
experiment_name = self.default_exp_name
178+
experiment_name = self._default_exp_name
180179
logger.info(f"No valid experiment found. Create a new experiment with name {experiment_name}.")
181180
return self.create_exp(experiment_name), True
182181

183182
def _get_exp(self, experiment_id=None, experiment_name=None) -> Experiment:
184183
"""
185-
get specific experiment by name or id. If it does not exist, raise ValueError
184+
Get specific experiment by name or id. If it does not exist, raise ValueError.
186185
187186
Parameters
188187
----------
189188
experiment_id :
190189
The id of experiment
191190
experiment_name :
192-
The id name experiment
191+
The name of experiment
193192
194193
Returns
195194
-------
@@ -291,16 +290,22 @@ def client(self):
291290
return self._client
292291

293292
def start_exp(
294-
self, experiment_name: Optional[Text] = None, recorder_name: Optional[Text] = None, uri: Optional[Text] = None
293+
self,
294+
experiment_name: Optional[Text] = None,
295+
recorder_name: Optional[Text] = None,
296+
uri: Optional[Text] = None,
297+
resume: bool = False,
295298
):
296299
# Set the tracking uri
297300
self.set_uri(uri)
298301
# Create experiment
302+
if experiment_name is None:
303+
experiment_name = self._default_exp_name
299304
experiment, _ = self._get_or_create_exp(experiment_name=experiment_name)
300305
# Set up active experiment
301306
self.active_experiment = experiment
302307
# Start the experiment
303-
self.active_experiment.start(recorder_name)
308+
self.active_experiment.start(recorder_name, resume)
304309

305310
return self.active_experiment
306311

@@ -316,7 +321,7 @@ def create_exp(self, experiment_name: Optional[Text] = None):
316321
# init experiment
317322
experiment_id = self.client.create_experiment(experiment_name)
318323
experiment = MLflowExperiment(experiment_id, experiment_name, self.uri)
319-
experiment._default_name = self.default_exp_name
324+
experiment._default_name = self._default_exp_name
320325

321326
return experiment
322327

0 commit comments

Comments
 (0)