From afd4060c3247c7a6e0bda7d4cc54d4d80ed01e8e Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 08:57:31 +0000 Subject: [PATCH 1/8] add docs & fix reinit of datatset --- examples/highfreq/README.md | 28 ++++++++++++++++++++++ examples/highfreq/__init__.py | 0 examples/highfreq/workflow.py | 45 +++++++++++++++++++++++++---------- qlib/data/dataset/__init__.py | 39 +++++++++++++++++++++++++++--- 4 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 examples/highfreq/README.md delete mode 100644 examples/highfreq/__init__.py diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md new file mode 100644 index 00000000000..56981abcc32 --- /dev/null +++ b/examples/highfreq/README.md @@ -0,0 +1,28 @@ +# High-Frequency Dataset + +This dataset is an example for RL high frequency trading. + +## Get High-Frequency Data + +Get high-frequency data by running the following command: +```bash + python workflow.py get_data +``` + +## Dump & Reload & Reinitialize the Dataset + + +The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in the `workflow.py`. `DatatsetH` is the subclass of `qlib.utils.serial.Serializable`, which supports being dumped in or loaded from disk in `pickle` format. + +### About Reinitialization + +After reloading `Dataset` from disk, `Qlib` also support reinitialize the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segmens`, etc. + +The example is given in `workflow.py`, users can run the code as follows. 
+ +### Run the Code + +Run the example by running the following command: +```bash + python workflow.py dump_and_load_dataset +``` \ No newline at end of file diff --git a/examples/highfreq/__init__.py b/examples/highfreq/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index 6649079d836..dac27082fce 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -9,7 +9,7 @@ import pickle import numpy as np import pandas as pd -from qlib.config import HIGH_FREQ_CONFIG +from qlib.config import REG_CN, HIGH_FREQ_CONFIG from qlib.contrib.model.gbdt import LGBModel from qlib.contrib.data.handler import Alpha158 from qlib.contrib.strategy.strategy import TopkDropoutStrategy @@ -26,7 +26,6 @@ from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull - class HighfreqWorkflow(object): SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None} @@ -123,8 +122,7 @@ def get_data(self): backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"]) print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + return def dump_and_load_dataset(self): """dump and load dataset state on disk""" @@ -146,18 +144,39 @@ def dump_and_load_dataset(self): dataset_backtest = pickle.load(file_dataset_backtest) self._prepare_calender_cache() - ##=============reload_dataset============= - dataset.init(init_type=DataHandlerLP.IT_LS) - dataset_backtest.init() + ##=============reinit dataset============= + dataset.init( + handler_kwargs = { + "init_type" : DataHandlerLP.IT_LS, + "start_time" : "2021-01-19 00:00:00", + "end_time" : "2021-01-25 16:00:00", + }, + segment_kwargs = { + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + } + ) + dataset_backtest.init( + handler_kwargs = { + "start_time" : "2021-01-19 00:00:00", + "end_time" : 
"2021-01-25 16:00:00", + }, + segment_kwargs = { + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + } + ) ##=============get data============= - xtrain, xtest = dataset.prepare(["train", "test"]) - backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"]) + xtest = dataset.prepare(["test"]) + backtest_test = dataset_backtest.prepare(["test"]) - print(xtrain, xtest) - print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + print(xtest, backtest_test) + return if __name__ == "__main__": diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 6b98baf8f34..e2606ec0f0d 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -87,9 +87,42 @@ def __init__(self, handler: Union[dict, DataHandler], segments: dict): """ super().__init__(handler, segments) - def init(self, **kwargs): - """Initialize the DatasetH, Only parameters belonging to handler.init will be passed in""" - self.handler.init(**kwargs) + def init(self, handler_kwargs:dict = None, segment_kwargs:dict = None): + """ + Initialize the DatasetH + + Parameters + ---------- + handler_kwargs : dict + Config of DataHanlder, which could include the following arguments: + + - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'. + + - arguments of DataHandler.init, such as 'enable_cache', etc. 
+ + segment_kwargs : dict + Config of segments which is same as 'segments' in DatasetH.setup_data + + """ + if handler_kwargs: + if not isinstance(handler_kwargs, dict): + raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}") + kwargs_init = {} + kwargs_conf_data = {} + conf_data_arg = {"instruments", "start_time", "end_time"} + for k, v in handler_kwargs.items(): + if k in conf_data_arg: + kwargs_conf_data.update({k:v}) + else: + kwargs_init.update({k:v}) + + self.handler.conf_data(**kwargs_conf_data) + self.handler.init(**kwargs_init) + + if segment_kwargs: + if not isinstance(segment_kwargs, dict): + raise TypeError(f"param handler_kwargs must be type dict, not {type(segment_kwargs)}") + self.segments = segment_kwargs.copy() def setup_data(self, handler: Union[dict, DataHandler], segments: dict): """ From 77989fbb3d845dac55c10e8b93c6addfbe1d4158 Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 09:00:41 +0000 Subject: [PATCH 2/8] black format --- examples/highfreq/workflow.py | 27 ++++++++++++++------------- qlib/data/dataset/__init__.py | 14 +++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index dac27082fce..ff3d3c5522b 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -26,6 +26,7 @@ from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull + class HighfreqWorkflow(object): SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None} @@ -146,29 +147,29 @@ def dump_and_load_dataset(self): self._prepare_calender_cache() ##=============reinit dataset============= dataset.init( - handler_kwargs = { - "init_type" : DataHandlerLP.IT_LS, - "start_time" : "2021-01-19 00:00:00", - "end_time" : "2021-01-25 16:00:00", + handler_kwargs={ + "init_type": DataHandlerLP.IT_LS, + "start_time": "2021-01-19 00:00:00", + "end_time": 
"2021-01-25 16:00:00", }, - segment_kwargs = { + segment_kwargs={ "test": ( - "2021-01-19 00:00:00", + "2021-01-19 00:00:00", "2021-01-25 16:00:00", ), - } + }, ) dataset_backtest.init( - handler_kwargs = { - "start_time" : "2021-01-19 00:00:00", - "end_time" : "2021-01-25 16:00:00", + handler_kwargs={ + "start_time": "2021-01-19 00:00:00", + "end_time": "2021-01-25 16:00:00", }, - segment_kwargs = { + segment_kwargs={ "test": ( - "2021-01-19 00:00:00", + "2021-01-19 00:00:00", "2021-01-25 16:00:00", ), - } + }, ) ##=============get data============= diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index e2606ec0f0d..8ff8c1210af 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -87,19 +87,19 @@ def __init__(self, handler: Union[dict, DataHandler], segments: dict): """ super().__init__(handler, segments) - def init(self, handler_kwargs:dict = None, segment_kwargs:dict = None): + def init(self, handler_kwargs: dict = None, segment_kwargs: dict = None): """ Initialize the DatasetH - + Parameters ---------- handler_kwargs : dict Config of DataHanlder, which could include the following arguments: - + - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'. - arguments of DataHandler.init, such as 'enable_cache', etc. 
- + segment_kwargs : dict Config of segments which is same as 'segments' in DatasetH.setup_data @@ -112,10 +112,10 @@ def init(self, handler_kwargs:dict = None, segment_kwargs:dict = None): conf_data_arg = {"instruments", "start_time", "end_time"} for k, v in handler_kwargs.items(): if k in conf_data_arg: - kwargs_conf_data.update({k:v}) + kwargs_conf_data.update({k: v}) else: - kwargs_init.update({k:v}) - + kwargs_init.update({k: v}) + self.handler.conf_data(**kwargs_conf_data) self.handler.init(**kwargs_init) From 5d829fcd96f1a15e45059c93daa45287ede44914 Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 09:02:32 +0000 Subject: [PATCH 3/8] update docs --- examples/highfreq/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md index 56981abcc32..067db4318f8 100644 --- a/examples/highfreq/README.md +++ b/examples/highfreq/README.md @@ -16,7 +16,7 @@ The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in the ### About Reinitialization -After reloading `Dataset` from disk, `Qlib` also support reinitialize the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segmens`, etc. +After reloading `Dataset` from disk, `Qlib` also support reinitializing the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segmens`, etc. The example is given in `workflow.py`, users can run the code as follows. 
From 6aac5ea4aa51dd49e2527e8b58a6b1cdaa7d790a Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 14:55:03 +0000 Subject: [PATCH 4/8] update data.rst docs --- docs/component/data.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/component/data.rst b/docs/component/data.rst index dd32c5cd840..3e9586bf4b3 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -31,7 +31,7 @@ Qlib Format Data We've specially designed a data structure to manage financial data, please refer to the `File storage design section in Qlib paper `_ for detailed information. Such data will be stored with filename suffix `.bin` (We'll call them `.bin` file, `.bin` format, or qlib format). `.bin` file is designed for scientific computing on finance data. -``Qlib`` provides two different off-the-shelf dataset, which can be accessed through this `link `_: +``Qlib`` provides two different off-the-shelf datasets, which can be accessed through this `link `_: ======================== ================= ================ Dataset US Market China Market @@ -41,6 +41,7 @@ Alpha360 √ √ Alpha158 √ √ ======================== ================= ================ +Also, ``Qlib`` provides a high-frequency dataset. Users can run a high-frequency dataset example through this `link `_. Qlib Format Dataset -------------------- From 2614941cecaacc9896138ea4776b68878b96479c Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 15:46:54 +0000 Subject: [PATCH 5/8] update docs --- docs/advanced/serial.rst | 42 +++++++++++++++++++++++++++++++++++++ docs/reference/api.rst | 10 +++++++++ examples/highfreq/README.md | 4 ++-- 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 docs/advanced/serial.rst diff --git a/docs/advanced/serial.rst b/docs/advanced/serial.rst new file mode 100644 index 00000000000..a0e6480b9bb --- /dev/null +++ b/docs/advanced/serial.rst @@ -0,0 +1,42 @@ +.. 
_serial: + +================================= +Serialization +================================= +.. currentmodule:: qlib + +Introduction +=================== +``Qlib`` supports dumping the state of ``DataHandler``, ``DataSet``, ``Processor`` and ``Model``, etc. to disk and reloading them. + +Serializable Class +======================== + +``Qlib`` provides a base class ``qlib.utils.serial.Serializable``, whose state can be dumped in or loaded from disk in `pickle` format. +When users dump the state of the ``Serializable`` instance, the attributes of the instance whose name **does not** start with `_` will be saved on the disk. + +Example +========================== +``Qlib``'s serializable classes include ``DataHandler``, ``DataSet``, ``Processor`` and ``Model``, etc., which are subclasses of ``qlib.utils.serial.Serializable``. +Specifically, ``qlib.data.dataset.DatasetH`` is one of them. Users can serialize ``DatasetH`` as follows. + +.. code-block:: Python + + ##=============dump dataset============= + dataset.to_pickle(path="dataset.pkl") # dataset is the instance of qlib.data.dataset.DatasetH + + ##=============reload dataset============= + with open("dataset.pkl", "rb") as file_dataset: + dataset = pickle.load(file_dataset) + +.. note:: + Only the state of ``DatasetH`` should be saved on the disk, such as some `mean` and `variance` used for data normalization, etc. + + After reloading the ``DatasetH``, users need to reinitialize it. It means that users can reset some states of ``DatasetH`` or ``QlibDataHandler`` such as `instruments`, `start_time`, `end_time` and `segments`, etc., and generate new data according to the states (data is not state and should not be saved on the disk). + +A more detailed example is in this `link `_. + + +API +=================== +Please refer to `Serializable API <../reference/api.html#module-qlib.utils.serial.Serializable>`_. 
diff --git a/docs/reference/api.rst b/docs/reference/api.rst index f21a9f518a9..3167d8a622b 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -152,4 +152,14 @@ Recorder Record Template -------------------- .. automodule:: qlib.workflow.record_temp + :members: + + +Utils +==================== + +Serializable +-------------------- + +.. automodule:: qlib.utils.serial.Serializable :members: \ No newline at end of file diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md index 067db4318f8..30c2e19db9b 100644 --- a/examples/highfreq/README.md +++ b/examples/highfreq/README.md @@ -12,11 +12,11 @@ Get high-frequency data by running the following command: ## Dump & Reload & Reinitialize the Dataset -The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in the `workflow.py`. `DatatsetH` is the subclass of `qlib.utils.serial.Serializable`, which supports being dumped in or loaded from disk in `pickle` format. +The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in the `workflow.py`. `DatasetH` is a subclass of [`qlib.utils.serial.Serializable`](https://qlib.readthedocs.io/en/latest/advanced/serial.html), whose state can be dumped into or loaded from disk in `pickle` format. ### About Reinitialization -After reloading `Dataset` from disk, `Qlib` also support reinitializing the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segmens`, etc. +After reloading `Dataset` from disk, `Qlib` also supports reinitializing the dataset. It means that users can reset some states of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segments`, etc., and generate new data according to the states. The example is given in `workflow.py`, users can run the code as follows. 
From 394efae717d069a4ed17165564cac314fb733aac Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 15:50:45 +0000 Subject: [PATCH 6/8] update index --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index 15a36b48927..3fa35fc60dc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,6 +49,7 @@ Document Structure Building Formulaic Alphas Online & Offline mode + Serialization .. toctree:: :maxdepth: 3 From e2bdb248707511696242221f22674b53b07b18d4 Mon Sep 17 00:00:00 2001 From: bxdd <45119470+bxdd@users.noreply.github.com> Date: Fri, 5 Feb 2021 12:15:55 +0800 Subject: [PATCH 7/8] Update serial.rst --- docs/advanced/serial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/advanced/serial.rst b/docs/advanced/serial.rst index a0e6480b9bb..05d5f37c716 100644 --- a/docs/advanced/serial.rst +++ b/docs/advanced/serial.rst @@ -12,7 +12,7 @@ Introduction Serializable Class ======================== -``Qlib`` provides a base class ``qlib.utils.serial.Serializable``, whose state can be dumped in or loaded from disk in `pickle` format. +``Qlib`` provides a base class ``qlib.utils.serial.Serializable``, whose state can be dumped into or loaded from disk in `pickle` format. When users dump the state of the ``Serializable`` instance, the attributes of the instance whose name **does not** start with `_` will be saved on the disk. Example @@ -23,7 +23,7 @@ Specifically, ``qlib.data.dataset.DatasetH`` is one of them. Users can serialize .. 
code-block:: Python ##=============dump dataset============= - dataset.to_pickle(path="dataset.pkl") # dataset is the instance of qlib.data.dataset.DatasetH + dataset.to_pickle(path="dataset.pkl") # dataset is an instance of qlib.data.dataset.DatasetH ##=============reload dataset============= with open("dataset.pkl", "rb") as file_dataset: From cd02ee1ab1e67e85d7e76490f2eccfcd9e816d7d Mon Sep 17 00:00:00 2001 From: bxdd <45119470+bxdd@users.noreply.github.com> Date: Fri, 5 Feb 2021 12:17:03 +0800 Subject: [PATCH 8/8] Update serial.rst --- docs/advanced/serial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/serial.rst b/docs/advanced/serial.rst index 05d5f37c716..8c0f837467f 100644 --- a/docs/advanced/serial.rst +++ b/docs/advanced/serial.rst @@ -13,7 +13,7 @@ Serializable Class ======================== ``Qlib`` provides a base class ``qlib.utils.serial.Serializable``, whose state can be dumped into or loaded from disk in `pickle` format. -When users dump the state of the ``Serializable`` instance, the attributes of the instance whose name **does not** start with `_` will be saved on the disk. +When users dump the state of a ``Serializable`` instance, the attributes of the instance whose name **does not** start with `_` will be saved on the disk. Example ==========================