Skip to content

Commit a3a2b5a

Browse files
authored
Merge pull request #358 from javaThonc/high_freq_demp
update high freq demo
2 parents 70c84cb + 941c980 commit a3a2b5a

File tree

6 files changed

+340
-2
lines changed

6 files changed

+340
-2
lines changed

examples/benchmarks/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
1717
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
1818
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
1919
| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha360 | 0.0407±0.00| 0.3053±0.00 | 0.0490±0.00 | 0.3840±0.00 | 0.0380±0.02 | 0.5000±0.21 | -0.0984±0.02 |
20+
| TabNet (Sercan O. Arik, et al.)| Alpha360 | 0.0192±0.00 | 0.1401±0.00| 0.0291±0.00 | 0.2163±0.00 | -0.0258±0.00 | -0.2961±0.00| -0.1429±0.00 |
2021

2122
## Alpha158 dataset
2223
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
@@ -32,6 +33,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
3233
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 |
3334
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 |
3435
| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4338±0.01 | 0.0523±0.00 | 0.4257±0.01 | 0.1253±0.01 | 1.4105±0.14 | -0.0902±0.01 |
36+
| TabNet (Sercan O. Arik, et al.)| Alpha158 | 0.0383±0.00 | 0.3414±0.00| 0.0388±0.00 | 0.3460±0.00 | 0.0226±0.00 | 0.2652±0.00| -0.1072±0.00 |
3537

3638
- The selected 20 features are based on the feature importance of a lightgbm-based model.
3739
- The base model of DoubleEnsemble is LGBM.

examples/highfreq/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,11 @@ The example is given in `workflow.py`, users can run the code as follows.
2525
Run the example by running the following command:
2626
```bash
2727
python workflow.py dump_and_load_dataset
28-
```
28+
```
29+
30+
## Benchmarks Performance
31+
### Signal Test
32+
Here are the results of the signal test for the benchmark models. We will keep updating the benchmark models in the future.
33+
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Long precision| Short Precision | Long-Short Average Return | Long-Short Average Sharpe |
34+
|---|---|---|---|---|---|---|---|---|---|
35+
| LightGBM | Alpha158 | 0.3042±0.00 | 1.5372±0.00| 0.3117±0.00 | 1.6258±0.00 | 0.6720±0.00 | 0.6870±0.00 | 0.000769±0.00 | 1.0190±0.00 |
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
qlib_init:
2+
provider_uri: "~/.qlib/qlib_data/cn_data_1min"
3+
region: cn
4+
market: &market 'csi300'
5+
start_time: &start_time "2020-09-15 00:00:00"
6+
end_time: &end_time "2021-01-18 16:00:00"
7+
train_end_time: &train_end_time "2020-11-15 16:00:00"
8+
valid_start_time: &valid_start_time "2020-11-16 00:00:00"
9+
valid_end_time: &valid_end_time "2020-11-30 16:00:00"
10+
test_start_time: &test_start_time "2020-12-01 00:00:00"
11+
data_handler_config: &data_handler_config
12+
start_time: *start_time
13+
end_time: *end_time
14+
fit_start_time: *start_time
15+
fit_end_time: *train_end_time
16+
instruments: *market
17+
freq: '1min'
18+
infer_processors:
19+
- class: 'RobustZScoreNorm'
20+
kwargs:
21+
fields_group: 'feature'
22+
clip_outlier: false
23+
- class: "Fillna"
24+
kwargs:
25+
fields_group: 'feature'
26+
learn_processors:
27+
- class: 'DropnaLabel'
28+
- class: 'CSRankNorm'
29+
kwargs:
30+
fields_group: 'label'
31+
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
32+
33+
task:
34+
model:
35+
class: "HFLGBModel"
36+
module_path: "qlib.contrib.model.highfreq_gdbt_model"
37+
kwargs:
38+
objective: 'binary'
39+
metric: ['binary_logloss','auc']
40+
verbosity: -1
41+
learning_rate: 0.01
42+
max_depth: 8
43+
num_leaves: 150
44+
lambda_l1: 1.5
45+
lambda_l2: 1
46+
num_threads: 20
47+
dataset:
48+
class: "DatasetH"
49+
module_path: "qlib.data.dataset"
50+
kwargs:
51+
handler:
52+
class: "Alpha158"
53+
module_path: "qlib.contrib.data.handler"
54+
kwargs: *data_handler_config
55+
segments:
56+
train: [*start_time, *train_end_time]
57+
valid: [*train_end_time, *valid_end_time]
58+
test: [*test_start_time, *end_time]
59+
record:
60+
- class: "SignalRecord"
61+
module_path: "qlib.workflow.record_temp"
62+
kwargs: {}
63+
- class: "HFSignalRecord"
64+
module_path: "qlib.workflow.record_temp"
65+
kwargs: {}

qlib/contrib/eva/alpha.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,59 @@
88
from typing import Tuple
99

1010

11+
def calc_long_short_prec(
    pred: pd.Series, label: pd.Series, date_col="datetime", quantile: float = 0.2, dropna=False, is_alpha=False
) -> Tuple[pd.Series, pd.Series]:
    """
    Calculate the per-date precision of long and short operations.

    Parameters
    ----------
    pred : pd.Series
        prediction scores; index is **pd.MultiIndex**, index name is
        **[datetime, instruments]**.

        .. code-block:: python

                                            score
            datetime            instrument
            2020-12-01 09:30:00 SH600068    0.553634
                                SH600195    0.550017
                                SH600276    0.540321
                                SH600584    0.517297
                                SH600715    0.544674
    label : pd.Series
        realized returns (or alphas); same index layout as ``pred``.
    date_col : str
        name of the datetime index level used for cross-sectional grouping.
    quantile : float
        fraction of instruments selected on each side per date.
    dropna : bool
        if True, drop rows where either pred or label is NaN before grouping.
    is_alpha : bool
        if True, demean ``label`` cross-sectionally per date before judging sign.

    Returns
    -------
    (pd.Series, pd.Series)
        long precision and short precision at the date level.

    Raises
    ------
    ValueError
        when there are too few instruments for the requested quantile.
    """
    if is_alpha:
        # `Series.mean(level=...)` was deprecated in pandas 1.5 and removed in 2.0;
        # groupby + transform is the supported equivalent.
        label = label - label.groupby(level=date_col).transform("mean")
    # positional level 1 is assumed to be the instrument level
    if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
        raise ValueError("Need more instruments to calculate precision")

    df = pd.DataFrame({"pred": pred, "label": label})
    if dropna:
        df.dropna(inplace=True)

    group = df.groupby(level=date_col)

    def n_quantile(x):
        # number of instruments taken on each side for one date
        return int(len(x) * quantile)

    # The top/bottom quantile of predictions are the long/short targets.
    long = group.apply(lambda x: x.nlargest(n_quantile(x), columns="pred").label).reset_index(level=0, drop=True)
    short = group.apply(lambda x: x.nsmallest(n_quantile(x), columns="pred").label).reset_index(level=0, drop=True)

    # precision = share of picks whose realized label has the expected sign
    l_prec = (long > 0).groupby(level=date_col).sum() / long.groupby(level=date_col).count()
    s_prec = (short < 0).groupby(level=date_col).sum() / short.groupby(level=date_col).count()
    return l_prec, s_prec
62+
63+
1164
def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False) -> Tuple[pd.Series, pd.Series]:
1265
"""calc_ic.
1366
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
import numpy as np
5+
import pandas as pd
6+
import lightgbm as lgb
7+
8+
from qlib.model.base import ModelFT
9+
from qlib.data.dataset import DatasetH
10+
from qlib.data.dataset.handler import DataHandlerLP
11+
import warnings
12+
13+
14+
class HFLGBModel(ModelFT):
    """LightGBM model for high-frequency prediction.

    Trains LightGBM on features from a ``DatasetH``; in the "binary" mode the
    label is converted into an up/down class versus the cross-sectional mean.

    Parameters
    ----------
    loss : str
        LightGBM objective; only "mse" and "binary" are supported.
    kwargs :
        extra LightGBM parameters merged into ``self.params``.
    """

    def __init__(self, loss="mse", **kwargs):
        if loss not in {"mse", "binary"}:
            raise NotImplementedError
        self.params = {"objective": loss, "verbosity": -1}
        self.params.update(kwargs)
        self.model = None

    def _cal_signal_metrics(self, y_test, l_cut, r_cut):
        """
        Calculate the signal metrics at the daily level and average over dates.

        Parameters
        ----------
        y_test : pd.DataFrame
            first column is the (demeaned) label, "pred" column is the model
            score; index level 0 is the datetime.
        l_cut, r_cut : float
            quantile boundaries: after sorting ascending by "pred", the bottom
            ``l_cut`` fraction are shorts and the top ``1 - r_cut`` are longs.

        Returns
        -------
        tuple
            (up precision, down precision, up alpha, down alpha),
            each averaged over dates.
        """
        up_pre, down_pre = [], []
        up_alpha_ll, down_alpha_ll = [], []
        for date in y_test.index.get_level_values(0).unique():
            df_res = y_test.loc[date].sort_values("pred")
            if int(l_cut * len(df_res)) < 10:
                # too few instruments for a meaningful quantile on this date — skip it
                warnings.warn("Warning: threshold is too low or instruments number is not enough")
                continue
            top = df_res.iloc[: int(l_cut * len(df_res))]
            bottom = df_res.iloc[int(r_cut * len(df_res)) :]

            # rows are sorted ascending by score: `top` are predicted losers,
            # `bottom` are predicted winners
            down_precision = len(top[top[top.columns[0]] < 0]) / (len(top))
            up_precision = len(bottom[bottom[bottom.columns[0]] > 0]) / (len(bottom))

            down_alpha = top[top.columns[0]].mean()
            up_alpha = bottom[bottom.columns[0]].mean()

            up_pre.append(up_precision)
            down_pre.append(down_precision)
            up_alpha_ll.append(up_alpha)
            down_alpha_ll.append(down_alpha)

        return (
            np.array(up_pre).mean(),
            np.array(down_pre).mean(),
            np.array(up_alpha_ll).mean(),
            np.array(down_alpha_ll).mean(),
        )

    def hf_signal_test(self, dataset: DatasetH, threhold=0.2):
        """
        Test the signal on the high frequency test set and print the metrics.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing the "test" segment.
        threhold : float
            quantile taken on each side (parameter name kept, misspelling and
            all, for backward compatibility with existing callers).

        Raises
        ------
        ValueError
            if the model has not been trained yet.
        """
        if self.model is None:  # was `== None`; identity check is the correct idiom
            raise ValueError("Model hasn't been trained yet")
        df_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
        df_test.dropna(inplace=True)
        x_test, y_test = df_test["feature"], df_test["label"]
        # Convert label into alpha: demean cross-sectionally per datetime.
        # (`mean(level=0)` was removed in pandas 2.0 — use groupby/transform.)
        label_col = y_test.columns[0]
        y_test[label_col] = y_test[label_col] - y_test[label_col].groupby(level=0).transform("mean")

        res = pd.Series(self.model.predict(x_test.values), index=x_test.index)
        y_test["pred"] = res

        up_p, down_p, up_a, down_a = self._cal_signal_metrics(y_test, threhold, 1 - threhold)
        print("===============================")
        print("High frequency signal test")
        print("===============================")
        print("Test set precision: ")
        print("Positive precision: {}, Negative precision: {}".format(up_p, down_p))
        print("Test Alpha Average in test set: ")
        print("Positive average alpha: {}, Negative average alpha: {}".format(up_a, down_a))

    def _prepare_data(self, dataset: DatasetH):
        """Build LightGBM train/valid Datasets, binarizing the single-column label.

        The label is demeaned cross-sectionally per datetime (turned into an
        alpha), then mapped to {0, 1} by sign for binary classification.

        Raises
        ------
        ValueError
            if the label has more than one column (multi-label not supported).
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )

        # Guard clause replaces the original if/else; also removes the buggy
        # pre-assignment `x_valid = df_train["feature"]` (train frame used for
        # the valid features — harmless only because it was overwritten below).
        y_train = df_train["label"]
        if not (y_train.values.ndim == 2 and y_train.values.shape[1] == 1):
            raise ValueError("LightGBM doesn't support multi-label training")

        l_name = df_train["label"].columns[0]
        # Convert label into alpha (pandas>=2.0-safe spelling of `.mean(level=0)`)
        df_train["label"][l_name] = df_train["label"][l_name] - df_train["label"][l_name].groupby(level=0).transform(
            "mean"
        )
        df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid["label"][l_name].groupby(level=0).transform(
            "mean"
        )
        # 1 if the instrument outperforms the cross-section, else 0
        mapping_fn = lambda x: 0 if x < 0 else 1
        df_train["label_c"] = df_train["label"][l_name].apply(mapping_fn)
        df_valid["label_c"] = df_valid["label"][l_name].apply(mapping_fn)
        x_train, y_train = df_train["feature"], df_train["label_c"].values
        x_valid, y_valid = df_valid["feature"], df_valid["label_c"].values

        dtrain = lgb.Dataset(x_train.values, label=y_train)
        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
        return dtrain, dvalid

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train the model with early stopping on the valid segment.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing "train" and "valid" segments.
        num_boost_round : int
            maximum number of boosting rounds.
        early_stopping_rounds : int
            stop if the valid metric does not improve for this many rounds.
        verbose_eval : int
            evaluation logging period.
        evals_result : dict, optional
            filled in-place with per-round train/valid metric curves.
        """
        # `evals_result=dict()` was a mutable default shared across calls; use
        # the None-sentinel idiom instead (behaviour for explicit dicts unchanged).
        if evals_result is None:
            evals_result = {}
        dtrain, dvalid = self._prepare_data(dataset)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "valid"],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
            evals_result=evals_result,
            **kwargs
        )
        # flatten {metric_name: [values]} to just the first metric's curve
        evals_result["train"] = list(evals_result["train"].values())[0]
        evals_result["valid"] = list(evals_result["valid"].values())[0]

    def predict(self, dataset):
        """Predict scores for the "test" segment; returns a pd.Series indexed like the features."""
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)

    def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):
        """
        finetune model

        Parameters
        ----------
        dataset : DatasetH
            dataset for finetuning
        num_boost_round : int
            number of round to finetune model
        verbose_eval : int
            verbose level
        """
        # Continue training from the existing model for a few more rounds.
        dtrain, _ = self._prepare_data(dataset)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            init_model=self.model,
            valid_sets=[dtrain],
            valid_names=["train"],
            verbose_eval=verbose_eval,
        )

qlib/workflow/record_temp.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from ..utils import init_instance_by_config, get_module_by_module_path
1414
from ..log import get_module_logger
1515
from ..utils import flatten_dict
16-
from ..contrib.eva.alpha import calc_ic, calc_long_short_return
16+
from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec
1717
from ..contrib.strategy.strategy import BaseStrategy
1818

1919
logger = get_module_logger("workflow", "INFO")
@@ -162,6 +162,60 @@ def load(self, name="pred.pkl"):
162162
return super().load(name)
163163

164164

165+
class HFSignalRecord(SignalRecord):
    """
    Signal analysis record for high-frequency workflows.

    Generates IC/ICIR, Rank IC/ICIR, long/short precision and long-short
    return metrics from the prediction and label previously saved by
    ``SignalRecord``, logs them to the recorder, and saves the per-date
    series as artifacts. This class inherits the ``SignalRecord`` class.
    """

    # NOTE(review): "hg" looks like a typo for "hf", but renaming it would
    # change where artifacts are stored/looked up — confirm before touching.
    artifact_path = "hg_sig_analysis"

    def __init__(self, recorder, **kwargs):
        # extra kwargs are accepted (config-driven construction) but unused
        super().__init__(recorder=recorder)

    def generate(self):
        # pred.pkl / label.pkl are produced by the upstream SignalRecord.generate
        pred = self.load("pred.pkl")
        raw_label = self.load("label.pkl")
        # precision of top/bottom-quantile picks, on the per-date demeaned label
        long_pre, short_pre = calc_long_short_prec(pred.iloc[:, 0], raw_label.iloc[:, 0], is_alpha=True)
        ic, ric = calc_ic(pred.iloc[:, 0], raw_label.iloc[:, 0])
        metrics = {
            "IC": ic.mean(),
            "ICIR": ic.mean() / ic.std(),
            "Rank IC": ric.mean(),
            "Rank ICIR": ric.mean() / ric.std(),
            "Long precision": long_pre.mean(),
            "Short precision": short_pre.mean(),
        }
        objects = {"ic.pkl": ic, "ric.pkl": ric}
        objects.update({"long_pre.pkl": long_pre, "short_pre.pkl": short_pre})
        long_short_r, long_avg_r = calc_long_short_return(pred.iloc[:, 0], raw_label.iloc[:, 0])
        metrics.update(
            {
                "Long-Short Average Return": long_short_r.mean(),
                "Long-Short Average Sharpe": long_short_r.mean() / long_short_r.std(),
            }
        )
        objects.update(
            {
                "long_short_r.pkl": long_short_r,
                "long_avg_r.pkl": long_avg_r,
            }
        )
        # scalar metrics go to the tracking backend; series go to artifact storage
        self.recorder.log_metrics(**metrics)
        self.recorder.save_objects(**objects, artifact_path=self.get_path())
        pprint(metrics)

    def list(self):
        # artifact paths produced by generate(), for downstream loading
        paths = [
            self.get_path("ic.pkl"),
            self.get_path("ric.pkl"),
            self.get_path("long_pre.pkl"),
            self.get_path("short_pre.pkl"),
            self.get_path("long_short_r.pkl"),
            self.get_path("long_avg_r.pkl"),
        ]
        return paths
217+
218+
165219
class SigAnaRecord(SignalRecord):
166220
"""
167221
This is the Signal Analysis Record class that generates the analysis results such as IC and IR. This class inherits the ``RecordTemp`` class.

0 commit comments

Comments
 (0)