Skip to content

Commit 756bd0f

Browse files
authored
Fix ZScoreNorm processor bug (microsoft#1398)
* fix_ZScoreNorm_bug * fix_CI_error * fix_CI_error * add_test_processor * fix_pylint_error * fix_some_error_and_optimize_code * modify_terrible_code * optimize_code * optimize_code
1 parent 667fb0e commit 756bd0f

File tree

4 files changed

+105
-20
lines changed

4 files changed

+105
-20
lines changed

.github/workflows/test_qlib_from_source.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,7 @@ jobs:
140140
141141
- name: Test workflow by config (install from source)
142142
run: |
143-
# Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy.
144-
python -m pip install numba==0.52.0
145-
# You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail.
146-
python -m pip install --upgrade numpy
143+
python -m pip install numba
147144
python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
148145
149146
- name: Unit tests with Pytest

qlib/data/dataset/processor.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -211,16 +211,19 @@ def fit(self, df: pd.DataFrame = None):
211211
self.min_val = np.nanmin(df[cols].values, axis=0)
212212
self.max_val = np.nanmax(df[cols].values, axis=0)
213213
self.ignore = self.min_val == self.max_val
214+
# To improve the speed, we set the value of `min_val` to `0` for the columns that do not need to be processed,
215+
# and the value of `max_val` to `1`, when using `(x - min_val) / (max_val - min_val)` for uniform calculation,
216+
# the columns that do not need to be processed will be calculated by `(x - 0) / (1 - 0)`,
217+
# so, as you can see, the columns that do not need to be processed are left unaffected.
218+
for _i, _con in enumerate(self.ignore):
219+
if _con:
220+
self.min_val[_i] = 0
221+
self.max_val[_i] = 1
214222
self.cols = cols
215223

216224
def __call__(self, df):
217-
def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore):
218-
if (~ignore).all():
219-
return (x - min_val) / (max_val - min_val)
220-
for i in range(ignore.size):
221-
if not ignore[i]:
222-
x[i] = (x[i] - min_val) / (max_val - min_val)
223-
return x
225+
def normalize(x, min_val=self.min_val, max_val=self.max_val):
226+
return (x - min_val) / (max_val - min_val)
224227

225228
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
226229
return df
@@ -242,16 +245,19 @@ def fit(self, df: pd.DataFrame = None):
242245
self.mean_train = np.nanmean(df[cols].values, axis=0)
243246
self.std_train = np.nanstd(df[cols].values, axis=0)
244247
self.ignore = self.std_train == 0
248+
# To improve the speed, we set the value of `std_train` to `1` for the columns that do not need to be processed,
249+
# and the value of `mean_train` to `0`, when using `(x - mean_train) / std_train` for uniform calculation,
250+
# the columns that do not need to be processed will be calculated by `(x - 0) / 1`,
251+
# so, as you can see, the columns that do not need to be processed are left unaffected.
252+
for _i, _con in enumerate(self.ignore):
253+
if _con:
254+
self.std_train[_i] = 1
255+
self.mean_train[_i] = 0
245256
self.cols = cols
246257

247258
def __call__(self, df):
248-
def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore):
249-
if (~ignore).all():
250-
return (x - mean_train) / std_train
251-
for i in range(ignore.size):
252-
if not ignore[i]:
253-
x[i] = (x[i] - mean_train) / std_train
254-
return x
259+
def normalize(x, mean_train=self.mean_train, std_train=self.std_train):
260+
return (x - mean_train) / std_train
255261

256262
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
257263
return df
@@ -361,7 +367,7 @@ def __init__(self, fields_group=None):
361367

362368
def __call__(self, df):
363369
cols = get_group_columns(df, self.fields_group)
364-
df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean()))
370+
df[cols] = df[cols].groupby("datetime", group_keys=False).apply(lambda x: x.fillna(x.mean()))
365371
return df
366372

367373

setup.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,14 @@ def get_version(rel_path: str) -> str:
156156
"baostock",
157157
"yahooquery",
158158
"beautifulsoup4",
159-
"tianshou",
159+
# In version 0.4.11 of tianshou, the code:
160+
# logits, hidden = self.actor(batch.obs, state=state, info=batch.info)
161+
# was changed in PR787,
162+
# which causes pytest errors (AttributeError: 'dict' object has no attribute 'info') in CI,
163+
# so we restrict the version of tianshou.
164+
# References:
165+
# https://github.com/thu-ml/tianshou/releases
166+
"tianshou<=0.4.10",
160167
"gym>=0.24", # If you do not put gym at the end, gym will degrade causing pytest results to fail.
161168
],
162169
"rl": [

tests/test_processor.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
import unittest
5+
import numpy as np
6+
from qlib.data import D
7+
from qlib.tests import TestAutoData
8+
from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
9+
10+
11+
class TestProcessor(TestAutoData):
12+
TEST_INST = "SH600519"
13+
14+
def test_MinMaxNorm(self):
15+
def normalize(df):
16+
min_val = np.nanmin(df.values, axis=0)
17+
max_val = np.nanmax(df.values, axis=0)
18+
ignore = min_val == max_val
19+
for _i, _con in enumerate(ignore):
20+
if _con:
21+
max_val[_i] = 1
22+
min_val[_i] = 0
23+
df.loc(axis=1)[df.columns] = (df.values - min_val) / (max_val - min_val)
24+
return df
25+
26+
origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
27+
origin_df["test"] = 0
28+
df = origin_df.copy()
29+
mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
30+
mmn.fit(df)
31+
mmn.__call__(df)
32+
origin_df = normalize(origin_df)
33+
assert (df == origin_df).all().all()
34+
35+
def test_ZScoreNorm(self):
36+
def normalize(df):
37+
mean_train = np.nanmean(df.values, axis=0)
38+
std_train = np.nanstd(df.values, axis=0)
39+
ignore = std_train == 0
40+
for _i, _con in enumerate(ignore):
41+
if _con:
42+
std_train[_i] = 1
43+
mean_train[_i] = 0
44+
df.loc(axis=1)[df.columns] = (df.values - mean_train) / std_train
45+
return df
46+
47+
origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
48+
origin_df["test"] = 0
49+
df = origin_df.copy()
50+
zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
51+
zsn.fit(df)
52+
zsn.__call__(df)
53+
origin_df = normalize(origin_df)
54+
assert (df == origin_df).all().all()
55+
56+
def test_CSZFillna(self):
57+
origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
58+
origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[97:99])[228:238]
59+
df = origin_df.copy()
60+
CSZFillna(fields_group=None).__call__(df)
61+
assert ~df[1:2].isna().all().all() and origin_df[1:2].isna().all().all()
62+
63+
def test_CSZScoreNorm(self):
64+
origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
65+
origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[10:12])[50:60]
66+
df = origin_df.copy()
67+
CSZScoreNorm(fields_group=None).__call__(df)
68+
# If we use the formula directly on the original data, we cannot get the correct result,
69+
# because the original data is processed by `groupby`, so we use the method of slicing,
70+
# taking the 2nd group of data from the original data to calculate and compare.
71+
assert (df[2:4] == ((origin_df[2:4] - origin_df[2:4].mean()).div(origin_df[2:4].std()))).all().all()
72+
73+
74+
if __name__ == "__main__":
75+
unittest.main()

0 commit comments

Comments
 (0)