From 7dc19a8201780c7b3e9d0f6d9ca3f68809d9343a Mon Sep 17 00:00:00 2001 From: zhupr Date: Wed, 29 Sep 2021 12:30:31 +0800 Subject: [PATCH 1/7] modify the example of multi-freq --- examples/benchmarks/LightGBM/average_ops.py | 37 ++++++ .../LightGBM/features_resample_N.py | 15 +++ .../benchmarks/LightGBM/multi_freq_handler.py | 120 ++++++++++++++++++ ...w_config_lightgbm_Alpha158_multi_freq.yaml | 32 ++--- qlib/workflow/record_temp.py | 6 +- 5 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 examples/benchmarks/LightGBM/average_ops.py create mode 100644 examples/benchmarks/LightGBM/features_resample_N.py create mode 100644 examples/benchmarks/LightGBM/multi_freq_handler.py diff --git a/examples/benchmarks/LightGBM/average_ops.py b/examples/benchmarks/LightGBM/average_ops.py new file mode 100644 index 00000000000..6de7710338b --- /dev/null +++ b/examples/benchmarks/LightGBM/average_ops.py @@ -0,0 +1,37 @@ +import math +import numpy as np + +from qlib.data.ops import ElemOperator + + +class Avg(ElemOperator): + MINUTES = 240 + + def __init__(self, feature, start_index, end_index, func="nanmean"): + assert start_index < end_index, "Avg in end_index must be greater than start_index" + self.feature = feature + self.s_i = start_index + self.e_i = end_index + self.func = func + self.min_periods = 1 if self.func == "nanmean" else self.e_i - self.s_i + super().__init__(feature) + + def _load_internal(self, instrument, start_index, end_index, freq): + series = self.feature.load(instrument, start_index, end_index, freq) + if series.empty: + return series + start_index = math.ceil(series.index[0] / self.MINUTES) * self.MINUTES + res = series.rolling(self.e_i - self.s_i, min_periods=self.min_periods).mean() + mask = [] + while start_index <= series.index[-1]: + mask.append(start_index + self.e_i - 1) + start_index += self.MINUTES + res.loc[~series.index.isin(mask)] = np.nan + return res + + def get_extended_window_size(self): + lft_etd, rght_etd = 
self.feature.get_extended_window_size() + return lft_etd + self.MINUTES, rght_etd + self.MINUTES + + def __str__(self): + return "{}({},{},{},{})".format(type(self).__name__, self.feature, self.s_i, self.e_i, self.func) diff --git a/examples/benchmarks/LightGBM/features_resample_N.py b/examples/benchmarks/LightGBM/features_resample_N.py new file mode 100644 index 00000000000..8496ab00f09 --- /dev/null +++ b/examples/benchmarks/LightGBM/features_resample_N.py @@ -0,0 +1,15 @@ +import pandas as pd + +from qlib.data.inst_processor import InstProcessor +from qlib.utils.resam import resam_calendar + + +class ResampleNProcessor(InstProcessor): + def __init__(self, target_frq: str, **kwargs): + self.target_frq = target_frq + + def __call__(self, df: pd.DataFrame, *args, **kwargs): + df.index = pd.to_datetime(df.index) + res_index = resam_calendar(df.index, "1min", self.target_frq) + df = df.resample(self.target_frq).last().reindex(res_index) + return df diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py new file mode 100644 index 00000000000..33a486e7e13 --- /dev/null +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -0,0 +1,120 @@ +import warnings +from pathlib import Path +from typing import Union +import pandas as pd + +from qlib.data import D +from qlib.data.dataset.loader import QlibDataLoader +from qlib.contrib.data.handler import DataHandlerLP, _DEFAULT_LEARN_PROCESSORS, check_transform_proc + + +class MultiFreqLoader(QlibDataLoader): + def load_group_df( + self, + instruments, + exprs: list, + names: list, + start_time: Union[str, pd.Timestamp] = None, + end_time: Union[str, pd.Timestamp] = None, + gp_name: str = None, + ) -> pd.DataFrame: + if instruments is None: + warnings.warn("`instruments` is not set, will load all stocks") + instruments = "all" + if isinstance(instruments, str): + instruments = D.instruments(instruments, filter_pipe=self.filter_pipe) + elif self.filter_pipe is not 
None: + warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list") + + if gp_name == "feature": + # freq == day + _exps = list(filter(lambda x: not x.startswith("Avg"), exprs)) + _day_df = D.features(instruments, _exps, start_time, end_time, freq="day") + _day_df.columns = list(filter(lambda x: int("".join(filter(str.isdigit, x))) == 0, names)) + # freq == 1min + _exps = list(filter(lambda x: x.startswith("Avg"), exprs)) + _min_df = D.features( + instruments, + _exps, + start_time, + end_time, + freq="1min", + inst_processors=self.inst_processor.get("feature", []), + ) + _min_df.columns = list(filter(lambda x: int("".join(filter(str.isdigit, x))) > 0, names)) + df = pd.concat([_day_df, _min_df], axis=1, sort=False) + elif gp_name == "label": + freq = self.freq[gp_name] if isinstance(self.freq, dict) else self.freq + df = D.features( + instruments, + exprs, + start_time, + end_time, + freq=freq, + inst_processors=self.inst_processor.get(gp_name, []), + ) + df.columns = names + else: + raise ValueError(f"not support") + + if self.swap_level: + df = df.swaplevel().sort_index() # NOTE: if swaplevel, return + return df + + +class Avg15minHandler(DataHandlerLP): + def __init__( + self, + instruments="csi500", + start_time=None, + end_time=None, + freq="day", + infer_processors=[], + learn_processors=_DEFAULT_LEARN_PROCESSORS, + fit_start_time=None, + fit_end_time=None, + process_type=DataHandlerLP.PTYPE_A, + filter_pipe=None, + inst_processor=None, + **kwargs, + ): + infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) + learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) + + data_loader = { + "class": "MultiFreqLoader", + "module_path": str(Path(__file__).resolve()), + "kwargs": { + "config": { + "feature": self.get_feature_config(), + "label": kwargs.get("label", self.get_label_config()), + }, + "filter_pipe": filter_pipe, + "freq": freq, + "inst_processor": 
inst_processor, + }, + } + super().__init__( + instruments=instruments, + start_time=start_time, + end_time=end_time, + data_loader=data_loader, + infer_processors=infer_processors, + learn_processors=learn_processors, + process_type=process_type, + ) + + def get_feature_config(self): + fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"] + names = list(map(lambda x: x.strip("$") + "0", fields)) + tmp_fields = [] + tmp_names = [] + for i, _f in enumerate(fields): + _fields = [f"Avg({_f}, {15 * j}, {15 * j + 15}, 'nanmean')" for j in range(0, 240 // 15)] + _names = [f"{names[i][:-1]}{int(names[i][-1])+j}" for j in range(1, 240 // 15 + 1)] + tmp_fields += _fields + tmp_names += _names + return fields + tmp_fields, names + tmp_names + + def get_label_config(self): + return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index fcb3dd367ac..b19dfb8083b 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -4,7 +4,10 @@ qlib_init: 1min: "~/.qlib/qlib_data/cn_data_1min" region: cn dataset_cache: null - maxtasksperchild: 1 + maxtasksperchild: null + custom_ops: + - class: Avg + module_path: average_ops.py market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config @@ -20,11 +23,10 @@ data_handler_config: &data_handler_config # with label as reference inst_processor: feature: - - class: Resample1minProcessor - module_path: features_sample.py + - class: ResampleNProcessor + module_path: features_resample_N.py kwargs: - hour: 14 - minute: 56 + target_frq: 1d port_analysis_config: &port_analysis_config strategy: @@ -62,25 +64,25 @@ task: module_path: qlib.data.dataset kwargs: handler: - class: Alpha158 - module_path: 
qlib.contrib.data.handler + class: Avg15minHandler + module_path: multi_freq_handler.py kwargs: *data_handler_config segments: train: [2008-01-01, 2014-12-31] valid: [2015-01-01, 2016-12-31] test: [2017-01-01, 2020-08-01] - record: + record: - class: SignalRecord - module_path: qlib.workflow.record_temp - kwargs: + module_path: qlib.workflow.record_temp + kwargs: model: dataset: - class: SigAnaRecord - module_path: qlib.workflow.record_temp - kwargs: + module_path: qlib.workflow.record_temp + kwargs: ana_long_short: False ann_scaler: 252 - class: PortAnaRecord - module_path: qlib.workflow.record_temp - kwargs: - config: *port_analysis_config \ No newline at end of file + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 98615fba2be..8b290a5ffe0 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -88,7 +88,7 @@ def load(self, name): obj = self.recorder.load_object(name) return obj - def list(): + def list(self): """ List the supported artifacts. @@ -98,7 +98,7 @@ def list(): """ return [] - def check(self, cls=self): + def check(self, cls=None): """ Check if the records is properly generated and saved. @@ -106,6 +106,8 @@ def check(self, cls=self): ------ FileExistsError: whether the records are stored properly. 
""" + if cls is None: + cls = self artifacts = set(self.recorder.list_artifacts()) flist = cls.list() for item in flist: From 67d3b7fe4322ad968df96fceb026e829d2c9913b Mon Sep 17 00:00:00 2001 From: zhupr Date: Wed, 29 Sep 2021 12:41:46 +0800 Subject: [PATCH 2/7] add Copyright --- examples/benchmarks/LightGBM/average_ops.py | 3 +++ examples/benchmarks/LightGBM/features_resample_N.py | 3 +++ examples/benchmarks/LightGBM/features_sample.py | 3 +++ examples/benchmarks/LightGBM/multi_freq_handler.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/examples/benchmarks/LightGBM/average_ops.py b/examples/benchmarks/LightGBM/average_ops.py index 6de7710338b..a67976fb677 100644 --- a/examples/benchmarks/LightGBM/average_ops.py +++ b/examples/benchmarks/LightGBM/average_ops.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import math import numpy as np diff --git a/examples/benchmarks/LightGBM/features_resample_N.py b/examples/benchmarks/LightGBM/features_resample_N.py index 8496ab00f09..13061513cb6 100644 --- a/examples/benchmarks/LightGBM/features_resample_N.py +++ b/examples/benchmarks/LightGBM/features_resample_N.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import pandas as pd from qlib.data.inst_processor import InstProcessor diff --git a/examples/benchmarks/LightGBM/features_sample.py b/examples/benchmarks/LightGBM/features_sample.py index 0b996bd1f98..4cf9121dda5 100644 --- a/examples/benchmarks/LightGBM/features_sample.py +++ b/examples/benchmarks/LightGBM/features_sample.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ import datetime import pandas as pd diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index 33a486e7e13..13e24d9283e 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import warnings from pathlib import Path from typing import Union From 5c0af31c6387fff45a33330dcad08e0a2eeed9d8 Mon Sep 17 00:00:00 2001 From: zhupr Date: Wed, 29 Sep 2021 13:06:47 +0800 Subject: [PATCH 3/7] add a comment to average_ops.py --- examples/benchmarks/LightGBM/average_ops.py | 27 +++++++++++++++++++ .../benchmarks/LightGBM/multi_freq_handler.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/examples/benchmarks/LightGBM/average_ops.py b/examples/benchmarks/LightGBM/average_ops.py index a67976fb677..18813a4ed2f 100644 --- a/examples/benchmarks/LightGBM/average_ops.py +++ b/examples/benchmarks/LightGBM/average_ops.py @@ -8,6 +8,33 @@ class Avg(ElemOperator): + """On the 1min data, calculate the mean value of the specified range within the day + + Parameters + ---------- + feature : Expression + feature instance + start_index: int + start index, [0, 239) + end_index: int + end index, [1, 240] + func: str + value from ["nanmean", "mean"], same as "np.nanmean" or "np.mean", by default "nanmean" + Notes + ------ + start_index < end_index + Examples + ------ + close = [0, 1, 2, 3, 4, 5] + Avg($close, 0, 2) == [np.nan, 0.5, np.nan, np.nan, np.nan, np.nan] + Avg($close, 2, 4) == [np.nan, np.nan, np.nan, 2.5, np.nan, np.nan] + + Returns + ---------- + Expression + The data for each trading day is: data[end_index-1] = data[start_index: end_index]).mean() + """ + MINUTES = 240 def __init__(self, feature, start_index, end_index, func="nanmean"): diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py 
b/examples/benchmarks/LightGBM/multi_freq_handler.py index 13e24d9283e..f62d24e06ed 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -58,7 +58,7 @@ def load_group_df( ) df.columns = names else: - raise ValueError(f"not support") + raise ValueError(f"Unsupported gp_name: {gp_name}") if self.swap_level: df = df.swaplevel().sort_index() # NOTE: if swaplevel, return From 57152f0f10b93cb2410d682ecb7d6a97f48c0562 Mon Sep 17 00:00:00 2001 From: zhupr Date: Thu, 30 Sep 2021 15:22:01 +0800 Subject: [PATCH 4/7] modify the example of multi-freq --- examples/benchmarks/LightGBM/average_ops.py | 67 ------------- .../benchmarks/LightGBM/features_sample.py | 19 ---- .../benchmarks/LightGBM/multi_freq_handler.py | 98 +++++-------------- ...w_config_lightgbm_Alpha158_multi_freq.yaml | 8 +- 4 files changed, 25 insertions(+), 167 deletions(-) delete mode 100644 examples/benchmarks/LightGBM/average_ops.py delete mode 100644 examples/benchmarks/LightGBM/features_sample.py diff --git a/examples/benchmarks/LightGBM/average_ops.py b/examples/benchmarks/LightGBM/average_ops.py deleted file mode 100644 index 18813a4ed2f..00000000000 --- a/examples/benchmarks/LightGBM/average_ops.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import math -import numpy as np - -from qlib.data.ops import ElemOperator - - -class Avg(ElemOperator): - """On the 1min data, calculate the mean value of the specified range within the day - - Parameters - ---------- - feature : Expression - feature instance - start_index: int - start index, [0, 239) - end_index: int - end index, [1, 240] - func: str - value from ["nanmean", "mean"], same as "np.nanmean" or "np.mean", by default "nanmean" - Notes - ------ - start_index < end_index - Examples - ------ - close = [0, 1, 2, 3, 4, 5] - Avg($close, 0, 2) == [np.nan, 0.5, np.nan, np.nan, np.nan, np.nan] - Avg($close, 2, 4) == [np.nan, np.nan, np.nan, 2.5, np.nan, np.nan] - - Returns - ---------- - Expression - The data for each trading day is: data[end_index-1] = data[start_index: end_index]).mean() - """ - - MINUTES = 240 - - def __init__(self, feature, start_index, end_index, func="nanmean"): - assert start_index < end_index, "Avg in end_index must be greater than start_index" - self.feature = feature - self.s_i = start_index - self.e_i = end_index - self.func = func - self.min_periods = 1 if self.func == "nanmean" else self.e_i - self.s_i - super().__init__(feature) - - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) - if series.empty: - return series - start_index = math.ceil(series.index[0] / self.MINUTES) * self.MINUTES - res = series.rolling(self.e_i - self.s_i, min_periods=self.min_periods).mean() - mask = [] - while start_index <= series.index[-1]: - mask.append(start_index + self.e_i - 1) - start_index += self.MINUTES - res.loc[~series.index.isin(mask)] = np.nan - return res - - def get_extended_window_size(self): - lft_etd, rght_etd = self.feature.get_extended_window_size() - return lft_etd + self.MINUTES, rght_etd + self.MINUTES - - def __str__(self): - return "{}({},{},{},{})".format(type(self).__name__, self.feature, self.s_i, self.e_i, self.func) diff --git 
a/examples/benchmarks/LightGBM/features_sample.py b/examples/benchmarks/LightGBM/features_sample.py deleted file mode 100644 index 4cf9121dda5..00000000000 --- a/examples/benchmarks/LightGBM/features_sample.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import datetime -import pandas as pd - -from qlib.data.inst_processor import InstProcessor - - -class Resample1minProcessor(InstProcessor): - def __init__(self, hour: int, minute: int, **kwargs): - self.hour = hour - self.minute = minute - - def __call__(self, df: pd.DataFrame, *args, **kwargs): - df.index = pd.to_datetime(df.index) - df = df.loc[df.index.time == datetime.time(self.hour, self.minute)] - df.index = df.index.normalize() - return df diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index f62d24e06ed..08956aac8fb 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -1,67 +1,17 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import warnings -from pathlib import Path -from typing import Union import pandas as pd -from qlib.data import D from qlib.data.dataset.loader import QlibDataLoader from qlib.contrib.data.handler import DataHandlerLP, _DEFAULT_LEARN_PROCESSORS, check_transform_proc -class MultiFreqLoader(QlibDataLoader): - def load_group_df( - self, - instruments, - exprs: list, - names: list, - start_time: Union[str, pd.Timestamp] = None, - end_time: Union[str, pd.Timestamp] = None, - gp_name: str = None, - ) -> pd.DataFrame: - if instruments is None: - warnings.warn("`instruments` is not set, will load all stocks") - instruments = "all" - if isinstance(instruments, str): - instruments = D.instruments(instruments, filter_pipe=self.filter_pipe) - elif self.filter_pipe is not None: - warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list") - - if gp_name == "feature": - # freq == day - _exps = list(filter(lambda x: not x.startswith("Avg"), exprs)) - _day_df = D.features(instruments, _exps, start_time, end_time, freq="day") - _day_df.columns = list(filter(lambda x: int("".join(filter(str.isdigit, x))) == 0, names)) - # freq == 1min - _exps = list(filter(lambda x: x.startswith("Avg"), exprs)) - _min_df = D.features( - instruments, - _exps, - start_time, - end_time, - freq="1min", - inst_processors=self.inst_processor.get("feature", []), - ) - _min_df.columns = list(filter(lambda x: int("".join(filter(str.isdigit, x))) > 0, names)) - df = pd.concat([_day_df, _min_df], axis=1, sort=False) - elif gp_name == "label": - freq = self.freq[gp_name] if isinstance(self.freq, dict) else self.freq - df = D.features( - instruments, - exprs, - start_time, - end_time, - freq=freq, - inst_processors=self.inst_processor.get(gp_name, []), - ) - df.columns = names - else: - raise ValueError(f"Unsupported gp_name: {gp_name}") - - if self.swap_level: - df = df.swaplevel().sort_index() # NOTE: if swaplevel, return +class Avg15minLoader(QlibDataLoader): + def 
load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: + df = super(Avg15minLoader, self).load(instruments, start_time, end_time) + if self.is_group: + df.columns = df.columns.map(lambda x: ("feature", x[1]) if x[0].startswith("feature") else x) return df @@ -83,20 +33,9 @@ def __init__( ): infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) - - data_loader = { - "class": "MultiFreqLoader", - "module_path": str(Path(__file__).resolve()), - "kwargs": { - "config": { - "feature": self.get_feature_config(), - "label": kwargs.get("label", self.get_label_config()), - }, - "filter_pipe": filter_pipe, - "freq": freq, - "inst_processor": inst_processor, - }, - } + data_loader = Avg15minLoader( + config=self.loader_config(), filter_pipe=filter_pipe, freq=freq, inst_processor=inst_processor + ) super().__init__( instruments=instruments, start_time=start_time, @@ -107,17 +46,24 @@ def __init__( process_type=process_type, ) - def get_feature_config(self): + def loader_config(self): fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"] names = list(map(lambda x: x.strip("$") + "0", fields)) + + config = {"feature_day": (fields, names)} + # features day + # features 15min tmp_fields = [] tmp_names = [] + # Ref(Mean($close, 15), 0), Ref(Mean($close, 15), 14) for i, _f in enumerate(fields): - _fields = [f"Avg({_f}, {15 * j}, {15 * j + 15}, 'nanmean')" for j in range(0, 240 // 15)] - _names = [f"{names[i][:-1]}{int(names[i][-1])+j}" for j in range(1, 240 // 15 + 1)] + _fields = [f"Ref(Mean({_f}, 15), {j * 15})" for j in range(1, 240 // 15)] + _names = [f"{names[i][:-1]}{int(names[i][-1])+j}" for j in range(240 // 15 - 1, 0, -1)] + _fields.append(f"Mean({_f}, 15)") + _names.append(f"{names[i][:-1]}{int(names[i][-1])+240 // 15}") tmp_fields += _fields tmp_names += _names - return fields + tmp_fields, names + tmp_names - - def 
get_label_config(self): - return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + config["feature_15min"] = (tmp_fields, tmp_names) + # label + config["label"] = (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + return config diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index b19dfb8083b..829c8711594 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -5,9 +5,6 @@ qlib_init: region: cn dataset_cache: null maxtasksperchild: null - custom_ops: - - class: Avg - module_path: average_ops.py market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config @@ -19,10 +16,11 @@ data_handler_config: &data_handler_config instruments: *market freq: label: day - feature: 1min + feature_15min: 1min + feature_day: day # with label as reference inst_processor: - feature: + feature_15min: - class: ResampleNProcessor module_path: features_resample_N.py kwargs: From eeaacfbed0265c591a1f12536a8a80406ca370c9 Mon Sep 17 00:00:00 2001 From: zhupr Date: Thu, 30 Sep 2021 16:10:17 +0800 Subject: [PATCH 5/7] add comment to multi_freq_handler.py --- .../benchmarks/LightGBM/multi_freq_handler.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index 08956aac8fb..39667a53ba8 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -11,6 +11,7 @@ class Avg15minLoader(QlibDataLoader): def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: df = super(Avg15minLoader, self).load(instruments, start_time, end_time) if self.is_group: + # feature_day(day freq) and 
feature_15min(1min freq, Average every 15 minutes) renamed feature df.columns = df.columns.map(lambda x: ("feature", x[1]) if x[0].startswith("feature") else x) return df @@ -47,15 +48,56 @@ def __init__( ) def loader_config(self): + + # Results for dataset: df: pd.DataFrame + # len(df.columns) == 6 + 6 * 16, len(df.index.get_level_values(level="datetime").unique()) == T + # df.columns: close0, close1, ..., close16, open0, ..., open16, ..., vwap16 + # freq == day: + # close0, open0, low0, high0, volume0, vwap0 + # freq == 1min: + # close1, ..., close16, ..., vwap1, ..., vwap16 + # df.index.name == ["datetime", "instrument"]: pd.MultiIndex + # Example: + # feature ... label + # close0 open0 low0 ... vwap1 vwap16 LABEL0 + # datetime instrument ... + # 2020-10-09 SH600000 11.794546 11.819587 11.769505 ... NaN NaN -0.005214 + # 2020-10-15 SH600000 12.044961 11.944795 11.932274 ... NaN NaN -0.007202 + # ... ... ... ... ... ... ... ... + # 2021-05-28 SZ300676 6.369684 6.495406 6.306568 ... NaN NaN -0.001321 + # 2021-05-31 SZ300676 6.601626 6.465643 6.465130 ... 
NaN NaN -0.023428 + + # features day: len(columns) == 6 fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"] + # names: close0, open0, ..., vwap0 names = list(map(lambda x: x.strip("$") + "0", fields)) config = {"feature_day": (fields, names)} - # features day - # features 15min + + # features 15min: len(columns) == 6 * 16 + # time: 09:00 --> 09:14, ..., 14:45 --> 14:59 + # fields: Ref(Mean($close, 15), 225), ..., Mean($close, 15) + # name: close1, ..., close16 + + # Expression description: take close as an example + # Mean($close, 15) ==> df["$close"].rolling(15, min_periods=1).mean() + # Ref(Mean($close, 15), 15) ==> df["$close"].rolling(15, min_periods=1).mean().shift(15) + + # NOTE: The last data of each trading day, which is the average of the i-th 15 minutes + + # Average: + # Average of the i-th 15-minute period of each trading day: 1 <= i <= 240 // 15 + # Avg(15minutes): Ref(Mean($close, 15), 240 - i * 15) + # + # Average of the first 15 minutes of each trading day; i = 1 + # Avg(09:00 --> 09:14), df.index.loc["09:14"]: Ref(Mean($close, 15), 240 - 1 * 15) ==> Ref(Mean($close, 15), 225) + # Average of the last 15 minutes of each trading day; i = 16 + # Avg(14:45 --> 14:59), df.index.loc["14:59"]: Ref(Mean($close, 15), 240 - 16 * 15) ==> Ref(Mean($close, 15), 0) ==> Mean($close, 15) + + # 15min resample to day + # df.resample("1d").last() tmp_fields = [] tmp_names = [] - # Ref(Mean($close, 15), 0), Ref(Mean($close, 15), 14) for i, _f in enumerate(fields): _fields = [f"Ref(Mean({_f}, 15), {j * 15})" for j in range(1, 240 // 15)] _names = [f"{names[i][:-1]}{int(names[i][-1])+j}" for j in range(240 // 15 - 1, 0, -1)] From 23b0320f7291765c0cfdd920103bc4fa14f4dac2 Mon Sep 17 00:00:00 2001 From: zhupr Date: Thu, 30 Sep 2021 21:50:14 +0800 Subject: [PATCH 6/7] add the Ref expression description to multi_freq_handler.py --- examples/benchmarks/LightGBM/multi_freq_handler.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index 39667a53ba8..eec629b5551 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -68,6 +68,15 @@ def loader_config(self): # 2021-05-31 SZ300676 6.601626 6.465643 6.465130 ... NaN NaN -0.023428 # features day: len(columns) == 6 + # $close is the closing price of the current trading day: + # if the user needs to get the `close` before the last T days, use Ref($close, T-1), for example: + # $close Ref($close, 1) Ref($close, 2) Ref($close, 3) Ref($close, 4) + # instrument datetime + # SH600519 2021-06-01 244.271530 + # 2021-06-02 242.205917 244.271530 + # 2021-06-03 242.229889 242.205917 244.271530 + # 2021-06-04 245.421524 242.229889 242.205917 244.271530 + # 2021-06-07 247.547089 245.421524 242.229889 242.205917 244.271530 fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"] # names: close0, open0, ..., vwap0 names = list(map(lambda x: x.strip("$") + "0", fields)) From ac1e09982da9208581c736c051ce84516ddbd167 Mon Sep 17 00:00:00 2001 From: zhupr Date: Thu, 30 Sep 2021 23:07:23 +0800 Subject: [PATCH 7/7] add expression description to multi_freq_handler.py --- .../benchmarks/LightGBM/multi_freq_handler.py | 21 ++++++++++++++++--- qlib/data/data.py | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index eec629b5551..07d7ac27c41 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -67,8 +67,8 @@ def loader_config(self): # 2021-05-28 SZ300676 6.369684 6.495406 6.306568 ... NaN NaN -0.001321 # 2021-05-31 SZ300676 6.601626 6.465643 6.465130 ... 
NaN NaN -0.023428 - # features day: len(columns) == 6 - # $close is the closing price of the current trading day: + # features day: len(columns) == 6, freq = day + # $close is the closing price of the current trading day: # if the user needs to get the `close` before the last T days, use Ref($close, T-1), for example: # $close Ref($close, 1) Ref($close, 2) Ref($close, 3) Ref($close, 4) # instrument datetime @@ -77,16 +77,31 @@ def loader_config(self): # 2021-06-03 242.229889 242.205917 244.271530 # 2021-06-04 245.421524 242.229889 242.205917 244.271530 # 2021-06-07 247.547089 245.421524 242.229889 242.205917 244.271530 + + # WARNING: Ref($close, N), if N == 0, Ref($close, N) ==> $close + fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"] # names: close0, open0, ..., vwap0 names = list(map(lambda x: x.strip("$") + "0", fields)) config = {"feature_day": (fields, names)} - # features 15min: len(columns) == 6 * 16 + # features 15min: len(columns) == 6 * 16, freq = 1min + # $close is the closing price of the current trading day: + # if the user gets 'close' for the i-th 15min of the last T days, use `Ref(Mean($close, 15), (T-1) * 240 + i * 15)`, for example: + # Ref(Mean($close, 15), 225) Ref(Mean($close, 15), 465) Ref(Mean($close, 15), 705) + # instrument datetime + # SH600519 2021-05-31 241.769897 243.077942 244.712997 + # 2021-06-01 244.271530 241.769897 243.077942 + # 2021-06-02 242.205917 244.271530 241.769897 + + # WARNING: Ref(Mean($close, 15), N), if N == 0, Ref(Mean($close, 15), N) ==> Mean($close, 15) + + # Results of the current script: # time: 09:00 --> 09:14, ..., 14:45 --> 14:59 # fields: Ref(Mean($close, 15), 225), ..., Mean($close, 15) # name: close1, ..., close16 + # # Expression description: take close as an example # Mean($close, 15) ==> df["$close"].rolling(15, min_periods=1).mean() diff --git a/qlib/data/data.py b/qlib/data/data.py index 8bb9cb89df4..7fbc48f715f 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -549,7 +549,7 
@@ def dataset_processor(instruments_d, column_names, start_time, end_time, freq, i inst_l.append(inst) task_l.append( delayed(DatasetProvider.expression_calculator)( - inst, start_time, end_time, freq, normalize_column_names, spans, C + inst, start_time, end_time, freq, normalize_column_names, spans, C, inst_processors ) )