Skip to content

Commit 57bcd29

Browse files
committed
Add data analysis feature for report
1 parent 60d45ad commit 57bcd29

File tree

8 files changed

+572
-32
lines changed

8 files changed

+572
-32
lines changed

qlib/contrib/eva/alpha.py

Lines changed: 105 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
The interface should be redesigned carefully in the future.
55
"""
66
import pandas as pd
7-
87
from typing import Tuple
8+
from qlib import get_module_logger
9+
from qlib.utils.paral import complex_parallel, DelayedDict
10+
from joblib import Parallel, delayed
911

1012

1113
def calc_long_short_prec(
@@ -61,32 +63,6 @@ def calc_long_short_prec(
6163
return (l_dom.groupby(date_col).sum() / l_c), (s_dom.groupby(date_col).sum() / s_c)
6264

6365

64-
def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False) -> Tuple[pd.Series, pd.Series]:
65-
"""calc_ic.
66-
67-
Parameters
68-
----------
69-
pred :
70-
pred
71-
label :
72-
label
73-
date_col :
74-
date_col
75-
76-
Returns
77-
-------
78-
(pd.Series, pd.Series)
79-
ic and rank ic
80-
"""
81-
df = pd.DataFrame({"pred": pred, "label": label})
82-
ic = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"]))
83-
ric = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
84-
if dropna:
85-
return ic.dropna(), ric.dropna()
86-
else:
87-
return ic, ric
88-
89-
9066
def calc_long_short_return(
9167
pred: pd.Series,
9268
label: pd.Series,
@@ -127,3 +103,105 @@ def calc_long_short_return(
127103
r_short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label.mean())
128104
r_avg = group.label.mean()
129105
return (r_long - r_short) / 2, r_avg
106+
107+
108+
def pred_autocorr(pred: pd.Series, lag=1, inst_col="instrument", date_col="datetime"):
    """pred_autocorr.

    For each date, compute the cross-sectional correlation between the
    predictions on that date and the predictions `lag` dates earlier.

    Limitation:
    - If the datetime is not sequentially dense, the correlation will be calculated based on adjacent dates. (some users may expect NaN)

    :param pred: pd.Series with following format
                instrument  datetime
                SH600000    2016-01-04   -0.000403
                            2016-01-05   -0.000753
                            2016-01-06   -0.021801
                            2016-01-07   -0.065230
                            2016-01-08   -0.062465
    :type pred: pd.Series
    :param lag: number of dates to shift when computing the autocorrelation
    :param inst_col: name of the instrument index level
    :param date_col: name of the datetime index level
    :return: pd.Series of per-date autocorrelation, indexed and sorted by date
    """
    if isinstance(pred, pd.DataFrame):
        # BUG FIX: the warning was a plain string, so "{pred.columns}" was printed
        # literally. It must be an f-string, and it must be formatted BEFORE `pred`
        # is re-bound to a Series (a Series has no `.columns`).
        get_module_logger("pred_autocorr").warning(f"Only the first column in {pred.columns} of `pred` is kept")
        pred = pred.iloc[:, 0]
    # Rows = dates, columns = instruments; shift by `lag` rows to pair each
    # date with the `lag`-th previous date.
    pred_ustk = pred.sort_index().unstack(inst_col)
    corr_s = {}
    for (idx, cur), (_, prev) in zip(pred_ustk.iterrows(), pred_ustk.shift(lag).iterrows()):
        corr_s[idx] = cur.corr(prev)
    corr_s = pd.Series(corr_s).sort_index()
    return corr_s
133+
134+
135+
def pred_autocorr_all(pred_dict, n_jobs=-1, **kwargs):
    """
    Calculate the auto-correlation for every prediction in ``pred_dict`` in parallel.

    Parameters
    ----------
    pred_dict : dict
        A dict like {<method_name>: <prediction>}
    n_jobs : int
        number of joblib workers (-1 means all available cores)
    kwargs :
        all these arguments will be passed into pred_autocorr

    Returns
    -------
    dict
        {<method_name>: <per-date auto-correlation series>}
    """
    # Build the delayed task per method, then let complex_parallel run them all.
    delayed_tasks = {name: delayed(pred_autocorr)(p, **kwargs) for name, p in pred_dict.items()}
    return complex_parallel(Parallel(n_jobs=n_jobs, verbose=10), delayed_tasks)
150+
151+
152+
def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False) -> Tuple[pd.Series, pd.Series]:
    """calc_ic.

    Compute the per-date IC (Pearson correlation) and rank IC (Spearman
    correlation) between predictions and labels.

    Parameters
    ----------
    pred : pd.Series
        prediction values, indexed (at least) by `date_col`
    label : pd.Series
        label values aligned with `pred`
    date_col : str
        name of the index level / column used to group by date
    dropna : bool
        if True, drop dates where the correlation is NaN

    Returns
    -------
    (pd.Series, pd.Series)
        ic and rank ic
    """
    # BUG FIX: the return annotation was the tuple literal `(pd.Series, pd.Series)`,
    # which is not a valid typing annotation; `Tuple[...]` (imported at the top of
    # this module) is the correct form.
    df = pd.DataFrame({"pred": pred, "label": label})
    # Lambda parameter renamed to `g` to avoid shadowing the outer `df`.
    ic = df.groupby(date_col).apply(lambda g: g["pred"].corr(g["label"]))
    ric = df.groupby(date_col).apply(lambda g: g["pred"].corr(g["label"], method="spearman"))
    if dropna:
        return ic.dropna(), ric.dropna()
    return ic, ric
176+
177+
178+
def calc_all_ic(pred_dict_all, label, date_col="datetime", dropna=False, n_jobs=-1):
    """calc_all_ic.

    Compute the ic / rank-ic series for every prediction in parallel.

    Parameters
    ----------
    pred_dict_all :
        A dict like {<method_name>: <prediction>}
    label:
        A pd.Series of label values
    date_col :
        the index level used for grouping by date
    dropna :
        passed through to calc_ic
    n_jobs :
        number of joblib workers (-1 means all available cores)

    Returns
    -------
    {'Q2+IND_z': {'ic': <ic series like>
                          2016-01-04   -0.057407
                          ...
                          2020-05-28    0.183470
                          2020-05-29    0.171393
                  'ric': <rank ic series like>
                          2016-01-04   -0.040888
                          ...
                          2020-05-28    0.236665
                          2020-05-29    0.183886
                 }
    ...}
    """
    # Each task yields a (ic, ric) pair; DelayedDict maps it onto the two keys.
    tasks = {
        name: DelayedDict(["ic", "ric"], delayed(calc_ic)(pred, label, date_col=date_col, dropna=dropna))
        for name, pred in pred_dict_all.items()
    }
    return complex_parallel(Parallel(n_jobs=n_jobs, verbose=10), tasks)

qlib/contrib/model/pytorch_nn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def __init__(
7474
data_parall=False,
7575
scheduler: Optional[Union[Callable]] = "default", # when it is Callable, it accept one argument named optimizer
7676
init_model=None,
77-
eval_train_metric=True,
77+
eval_train_metric=False,
7878
pt_model_uri="qlib.contrib.model.pytorch_nn.Net",
7979
pt_model_kwargs={
8080
"input_dim": 360,
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
This module is designed to analyze data
6+
7+
"""

qlib/contrib/report/data/ana.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
import pandas as pd
4+
import numpy as np
5+
from qlib.contrib.report.data.base import FeaAnalyser
6+
from qlib.contrib.report.utils import sub_fig_generator
7+
from qlib.utils.paral import datetime_groupby_apply
8+
from qlib.contrib.eva.alpha import pred_autocorr_all
9+
from loguru import logger
10+
import seaborn as sns
11+
12+
DT_COL_NAME = "datetime"
13+
14+
15+
class CombFeaAna(FeaAnalyser):
    """
    Combine several sub feature analysers and plot them in a single figure.

    Each feature column gets one sub-figure with one row per wrapped analyser.
    """

    def __init__(self, dataset: pd.DataFrame, *fea_ana_cls):
        """
        :param dataset: the dataset shared by all sub analysers
        :param fea_ana_cls: two or more FeaAnalyser subclasses to combine
        """
        if len(fea_ana_cls) <= 1:
            # Combining fewer than two analysers is pointless — use the analyser
            # directly instead. (Original message was an f-string with no
            # placeholders and did not say what was wrong.)
            raise NotImplementedError("CombFeaAna requires at least two feature analyser classes")
        self._fea_ana_l = [fcls(dataset) for fcls in fea_ana_cls]
        super().__init__(dataset=dataset)

    def skip(self, col):
        # Skip the column only when every sub analyser would skip it.
        return np.all(list(map(lambda fa: fa.skip(col), self._fea_ana_l)))

    def calc_stat_values(self):
        """The statistics of features are finished in the underlying analysers"""

    def plot_all(self, *args, **kwargs):
        # One sub-figure per feature; `row_n` axes per sub-figure, one per analyser.
        ax_gen = iter(sub_fig_generator(row_n=len(self._fea_ana_l), *args, **kwargs))

        for col in self._dataset:
            if not self.skip(col):
                axes = next(ax_gen)
                for fa, ax in zip(self._fea_ana_l, axes):
                    if not fa.skip(col):
                        fa.plot_single(col, ax)
                    # Clear per-axis decorations; the column title goes on top only.
                    ax.set_xlabel("")
                    ax.set_title("")
                axes[0].set_title(col)
45+
46+
47+
class NumFeaAnalyser(FeaAnalyser):
    """Base class for analysers that only make sense on numeric features."""

    def skip(self, col):
        # ROBUSTNESS: pass the column's dtype explicitly. The original passed the
        # Series itself, which only works because np.dtype() implicitly reads the
        # object's `.dtype` attribute.
        is_obj = np.issubdtype(self._dataset[col].dtype, np.dtype("O"))
        if is_obj:
            logger.info(f"{col} is not numeric and is skipped")
        return is_obj
53+
54+
55+
class ValueCNT(FeaAnalyser):
    """Plot, per date, the number (or ratio) of unique values in each column."""

    def __init__(self, dataset: pd.DataFrame, ratio=False):
        # When `ratio` is True, unique-value counts are divided by the number
        # of rows on each date instead of being reported as raw counts.
        self.ratio = ratio
        super().__init__(dataset)

    def calc_stat_values(self):
        counts = {}
        for name, series in self._dataset.items():
            if super().skip(name):
                continue
            counts[name] = series.groupby(DT_COL_NAME).apply(lambda s: len(s.unique()))
        self._val_cnt = pd.DataFrame(counts)
        if self.ratio:
            self._val_cnt = self._val_cnt.div(self._dataset.groupby(DT_COL_NAME).size(), axis=0)

        # TODO: other analysers could also adopt this shared-ylim technique
        ymin, ymax = self._val_cnt.min().min(), self._val_cnt.max().max()
        self.ylim = (ymin - 0.05 * (ymax - ymin), ymax + 0.05 * (ymax - ymin))

    def plot_single(self, col, ax):
        self._val_cnt[col].plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")
76+
77+
78+
class FeaDistAna(NumFeaAnalyser):
    """Plot the value distribution (histogram) of each numeric feature."""

    def plot_single(self, col, ax):
        # 100 bins, no kernel density estimate — raw distribution only.
        sns.histplot(self._dataset[col], ax=ax, kde=False, bins=100)
        ax.set_title(col)
        ax.set_xlabel("")
83+
84+
85+
class FeaInfAna(NumFeaAnalyser):
    """Count the number of infinite values per date for each numeric feature."""

    def calc_stat_values(self):
        self._inf_cnt = {}
        for col, item in self._dataset.items():
            if not super().skip(col):
                # BUG FIX: `np.int` was deprecated in NumPy 1.20 and removed in
                # 1.24; the builtin `int` is the documented replacement.
                self._inf_cnt[col] = item.apply(np.isinf).astype(int).groupby(DT_COL_NAME).sum()
        self._inf_cnt = pd.DataFrame(self._inf_cnt)

    def skip(self, col):
        # Nothing worth plotting when the column has no infinite values at all.
        return (col not in self._inf_cnt) or (self._inf_cnt[col].sum() == 0)

    def plot_single(self, col, ax):
        self._inf_cnt[col].plot(ax=ax, title=col)
        ax.set_xlabel("")
99+
100+
101+
class FeaNanAna(FeaAnalyser):
    """Plot the per-date count of missing (NaN) values for each feature."""

    def calc_stat_values(self):
        self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME).sum()

    def skip(self, col):
        # Only plot columns that are present and actually contain NaNs.
        has_nan = col in self._nan_cnt and self._nan_cnt[col].sum() != 0
        return not has_nan

    def plot_single(self, col, ax):
        self._nan_cnt[col].plot(ax=ax, title=col)
        ax.set_xlabel("")
111+
112+
113+
class FeaNanAnaRatio(FeaAnalyser):
    """Plot the per-date ratio of missing (NaN) values for each feature."""

    def calc_stat_values(self):
        self._nan_cnt = self._dataset.isna().groupby(DT_COL_NAME).sum()
        self._total_cnt = self._dataset.groupby(DT_COL_NAME).size()

    def skip(self, col):
        # Guard clause: unknown columns, or columns without any NaN, are skipped.
        if col not in self._nan_cnt:
            return True
        return self._nan_cnt[col].sum() == 0

    def plot_single(self, col, ax):
        ratio = self._nan_cnt[col] / self._total_cnt
        ratio.plot(ax=ax, title=col)
        ax.set_xlabel("")
124+
125+
126+
class FeaACAna(FeaAnalyser):
    """Analyse the auto-correlation of features."""

    def calc_stat_values(self):
        self._fea_corr = pred_autocorr_all(self._dataset.to_dict("series"))
        corr_df = pd.DataFrame(self._fea_corr)
        lo, hi = corr_df.min().min(), corr_df.max().max()
        # Shared y-limits (with a 5% margin) so every sub-plot is comparable.
        margin = 0.05 * (hi - lo)
        self.ylim = (lo - margin, hi + margin)

    def plot_single(self, col, ax):
        self._fea_corr[col].plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")
138+
139+
140+
class FeaSkewTurt(NumFeaAnalyser):
    """Plot per-date skewness and kurtosis of each numeric feature on twin y-axes.

    NOTE(review): "Turt" looks like a typo for "Kurt" (kurtosis), but renaming
    the class would break callers, so the name is left unchanged.
    """

    def calc_stat_values(self):
        # "skew" is accepted as a string shortcut by the helper; kurtosis is
        # passed as the unbound DataFrame method to be applied per datetime group.
        self._skew = datetime_groupby_apply(self._dataset, "skew", skip_group=True)
        self._kurt = datetime_groupby_apply(self._dataset, pd.DataFrame.kurt, skip_group=True)

    def plot_single(self, col, ax):
        # Skew on the left y-axis...
        self._skew[col].plot(ax=ax, label="skew")
        ax.set_xlabel("")
        ax.set_ylabel("skew")
        ax.legend()

        # ...kurtosis on a twin right y-axis with its own scale.
        right_ax = ax.twinx()

        self._kurt[col].plot(ax=right_ax, label="kurt", color="green")
        right_ax.set_xlabel("")
        right_ax.set_ylabel("kurt")

        # Merge the two legends into a single one: hide the left legend and
        # draw a combined legend on the right axis.
        h1, l1 = ax.get_legend_handles_labels()
        h2, l2 = right_ax.get_legend_handles_labels()

        ax.legend().set_visible(False)
        right_ax.legend(h1 + h2, l1 + l2)
        ax.set_title(col)
163+
164+
165+
class FeaMeanStd(NumFeaAnalyser):
    """Plot the per-date mean and standard deviation of each numeric feature."""

    def calc_stat_values(self):
        grouped = self._dataset.groupby(DT_COL_NAME)
        self._std = grouped.std()
        self._mean = grouped.mean()

    def plot_single(self, col, ax):
        # Mean on the left y-axis.
        self._mean[col].plot(ax=ax, label="mean")
        ax.set_xlabel("")
        ax.set_ylabel("mean")
        ax.legend()

        # Std on a twin right y-axis with its own scale.
        twin = ax.twinx()
        self._std[col].plot(ax=twin, label="std", color="green")
        twin.set_xlabel("")
        twin.set_ylabel("std")

        # Combine the legends of both axes into one, shown on the right axis.
        handles_l, labels_l = ax.get_legend_handles_labels()
        handles_r, labels_r = twin.get_legend_handles_labels()

        ax.legend().set_visible(False)
        twin.legend(handles_l + handles_r, labels_l + labels_r)
        ax.set_title(col)
188+
189+
190+
class RawFeaAna(FeaAnalyser):
    """
    Motivation:
    - display the values without further analysis
    """

    def calc_stat_values(self):
        lo = self._dataset.min().min()
        hi = self._dataset.max().max()
        # Shared y-limits (with a 5% margin) across all columns for comparability.
        pad = 0.05 * (hi - lo)
        self.ylim = (lo - pad, hi + pad)

    def plot_single(self, col, ax):
        self._dataset[col].plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")

0 commit comments

Comments
 (0)