# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
The motivation of this demo:
- To show that the data modules of Qlib are serializable, so users can dump
  processed data to disk to avoid duplicated data preprocessing.
"""

from copy import deepcopy
from pathlib import Path
import pickle
from pprint import pprint
import subprocess

import yaml

from qlib import init
from qlib.data.dataset.handler import DataHandlerLP
from qlib.log import TimeInspector
from qlib.model.trainer import task_train
from qlib.utils import init_instance_by_config

# Anchor all paths to this file's directory so the demo can be launched from
# any working directory.  `resolve()` already returns an absolute path, so the
# extra `.absolute()` call was redundant.
DIRNAME = Path(__file__).resolve().parent
if __name__ == "__main__":
    # Initialize Qlib with default settings (provider/region from default config).
    init()

    repeat = 2
    exp_name = "data_mem_reuse_demo"

    # The demo reuses the LightGBM Alpha158 benchmark workflow config.
    config_path = DIRNAME.parent / "benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml"
    task_config = yaml.safe_load(config_path.open())

    # 1) Baseline: each run reloads and re-processes the data from disk.
    with TimeInspector.logt("The original time without reusing processed data in memory:"):
        for i in range(repeat):
            task_train(task_config["task"], experiment_name=exp_name)

    # 2) Instantiate the handler once so the processed data is held in memory.
    hd_conf = task_config["task"]["dataset"]["kwargs"]["handler"]
    pprint(hd_conf)
    hd: DataHandlerLP = init_instance_by_config(hd_conf)

    # 3) Reuse the processed data: replace the handler *config* with the
    #    already-initialized handler *instance*.
    new_task = deepcopy(task_config["task"])
    new_task["dataset"]["kwargs"]["handler"] = hd
    print(new_task)

    with TimeInspector.logt("The time with reusing processed data in memory:"):
        # this will save the time to reload and process data from disk (in `DataHandlerLP`)
        # It still takes a lot of time in the backtest phase
        for i in range(repeat):
            # BUG FIX: `new_task` is already a copy of task_config["task"],
            # so `new_task["task"]` would raise KeyError.
            task_train(new_task, experiment_name=exp_name)

    # 4) Users can change other parts of the task (e.g. the segments) while
    #    still reusing the processed data in memory (the handler instance).
    new_task = deepcopy(task_config["task"])
    new_task["dataset"]["kwargs"]["segments"]["train"] = ("20100101", "20131231")
    # BUG FIX: without reassigning the in-memory handler, this step would
    # silently fall back to re-processing data from disk, contradicting the
    # log message below.
    new_task["dataset"]["kwargs"]["handler"] = hd
    with TimeInspector.logt("The time with reusing processed data in memory:"):
        # Same KeyError fix as above: pass `new_task` itself, not new_task["task"].
        task_train(new_task, experiment_name=exp_name)