Skip to content

Commit 759705b

Browse files
committed
example to reusing processed data in memory
1 parent f6864d5 commit 759705b

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
"""
4+
The motivation of this demo
5+
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
6+
"""
7+
8+
from copy import deepcopy
9+
from pathlib import Path
10+
import pickle
11+
from pprint import pprint
12+
import subprocess
13+
14+
import yaml
15+
16+
from qlib import init
17+
from qlib.data.dataset.handler import DataHandlerLP
18+
from qlib.log import TimeInspector
19+
from qlib.model.trainer import task_train
20+
from qlib.utils import init_instance_by_config
21+
22+
# For general purpose, we use a path relative to this file so the demo can be
# run from any working directory.
DIRNAME = Path(__file__).absolute().resolve().parent

if __name__ == "__main__":
    # Demo: show that Qlib data modules are serializable/reusable, so the
    # processed data handler can be built once and shared across tasks to
    # avoid duplicated preprocessing.
    init()

    repeat = 2
    exp_name = "data_mem_reuse_demo"

    # Load the benchmark workflow config (task definition lives under the
    # "task" key).
    config_path = DIRNAME.parent / "benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml"
    task_config = yaml.safe_load(config_path.open())

    # 1) without reusing processed data in memory: every run re-loads and
    #    re-processes the data from scratch.
    with TimeInspector.logt("The original time without reusing processed data in memory:"):
        for i in range(repeat):
            task_train(task_config["task"], experiment_name=exp_name)

    # 2) prepare processed data in memory once, by instantiating the handler
    #    directly from its config.
    hd_conf = task_config["task"]["dataset"]["kwargs"]["handler"]
    pprint(hd_conf)
    hd: DataHandlerLP = init_instance_by_config(hd_conf)

    # 3) with reusing processed data in memory: replace the handler *config*
    #    with the already-instantiated handler object.
    new_task = deepcopy(task_config["task"])
    new_task["dataset"]["kwargs"]["handler"] = hd
    print(new_task)

    with TimeInspector.logt("The time with reusing processed data in memory:"):
        # this will save the time to reload and process data from disk (in `DataHandlerLP`)
        # It still takes a lot of time in the backtest phase
        for i in range(repeat):
            # BUG FIX: `new_task` is already the task dict (a deepcopy of
            # task_config["task"]); indexing it with ["task"] again would
            # raise KeyError. Pass it directly, matching step 1).
            task_train(new_task, experiment_name=exp_name)

    # 4) Users can also change other parts of the task (anything except the
    #    in-memory processed data handler), e.g. the training segment.
    new_task = deepcopy(task_config["task"])
    new_task["dataset"]["kwargs"]["segments"]["train"] = ("20100101", "20131231")
    with TimeInspector.logt("The time with reusing processed data in memory:"):
        # BUG FIX: same as above — `new_task` is the task dict itself.
        task_train(new_task, experiment_name=exp_name)

0 commit comments

Comments
 (0)