Skip to content

Commit 756bd0f

Browse files
authored
Fix ZScoreNorm processor bug (microsoft#1398)
* fix_ZScoreNorm_bug * fix_CI_error * fix_CI_error * add_test_processor * fix_pylint_error * fix_some_error_and_optimize_code * modify_terrible_code * optimize_code * optimize_code
1 parent 667fb0e commit 756bd0f

File tree

4 files changed

+105
-20
lines changed

4 files changed

+105
-20
lines changed

.github/workflows/test_qlib_from_source.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,7 @@ jobs:
140140
141141
- name: Test workflow by config (install from source)
142142
run: |
143-
# Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy.
144-
python -m pip install numba==0.52.0
145-
# You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail.
146-
python -m pip install --upgrade numpy
143+
python -m pip install numba
147144
python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
148145
149146
- name: Unit tests with Pytest

qlib/data/dataset/processor.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -211,16 +211,19 @@ def fit(self, df: pd.DataFrame = None):
211211
self.min_val = np.nanmin(df[cols].values, axis=0)
212212
self.max_val = np.nanmax(df[cols].values, axis=0)
213213
self.ignore = self.min_val == self.max_val
214+
# To improve the speed, we set the value of `min_val` to `0` for the columns that do not need to be processed,
215+
# and the value of `max_val` to `1`, when using `(x - min_val) / (max_val - min_val)` for uniform calculation,
216+
# the columns that do not need to be processed will be calculated by `(x - 0) / (1 - 0)`,
217+
# so, as you can see, the columns that do not need to be processed are left unaffected.
218+
for _i, _con in enumerate(self.ignore):
219+
if _con:
220+
self.min_val[_i] = 0
221+
self.max_val[_i] = 1
214222
self.cols = cols
215223

216224
def __call__(self, df):
217-
def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore):
218-
if (~ignore).all():
219-
return (x - min_val) / (max_val - min_val)
220-
for i in range(ignore.size):
221-
if not ignore[i]:
222-
x[i] = (x[i] - min_val) / (max_val - min_val)
223-
return x
225+
def normalize(x, min_val=self.min_val, max_val=self.max_val):
226+
return (x - min_val) / (max_val - min_val)
224227

225228
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
226229
return df
@@ -242,16 +245,19 @@ def fit(self, df: pd.DataFrame = None):
242245
self.mean_train = np.nanmean(df[cols].values, axis=0)
243246
self.std_train = np.nanstd(df[cols].values, axis=0)
244247
self.ignore = self.std_train == 0
248+
# To improve the speed, we set the value of `std_train` to `1` for the columns that do not need to be processed,
249+
# and the value of `mean_train` to `0`, when using `(x - mean_train) / std_train` for uniform calculation,
250+
# the columns that do not need to be processed will be calculated by `(x - 0) / 1`,
251+
# so, as you can see, the columns that do not need to be processed are left unaffected.
252+
for _i, _con in enumerate(self.ignore):
253+
if _con:
254+
self.std_train[_i] = 1
255+
self.mean_train[_i] = 0
245256
self.cols = cols
246257

247258
def __call__(self, df):
248-
def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore):
249-
if (~ignore).all():
250-
return (x - mean_train) / std_train
251-
for i in range(ignore.size):
252-
if not ignore[i]:
253-
x[i] = (x[i] - mean_train) / std_train
254-
return x
259+
def normalize(x, mean_train=self.mean_train, std_train=self.std_train):
260+
return (x - mean_train) / std_train
255261

256262
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
257263
return df
@@ -361,7 +367,7 @@ def __init__(self, fields_group=None):
361367

362368
def __call__(self, df):
363369
cols = get_group_columns(df, self.fields_group)
364-
df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean()))
370+
df[cols] = df[cols].groupby("datetime", group_keys=False).apply(lambda x: x.fillna(x.mean()))
365371
return df
366372

367373

setup.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,14 @@ def get_version(rel_path: str) -> str:
156156
"baostock",
157157
"yahooquery",
158158
"beautifulsoup4",
159-
"tianshou",
159+
# In version 0.4.11 of tianshou, the code:
160+
# logits, hidden = self.actor(batch.obs, state=state, info=batch.info)
161+
# was changed in PR787,
162+
# which causes pytest errors (AttributeError: 'dict' object has no attribute 'info') in CI,
163+
# so we restrict the version of tianshou.
164+
# References:
165+
# https://github.com/thu-ml/tianshou/releases
166+
"tianshou<=0.4.10",
160167
"gym>=0.24", # If you do not put gym at the end, gym will degrade causing pytest results to fail.
161168
],
162169
"rl": [

tests/test_processor.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
import unittest
5+
import numpy as np
6+
from qlib.data import D
7+
from qlib.tests import TestAutoData
8+
from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
9+
10+
11+
class TestProcessor(TestAutoData):
12+
TEST_INST = "SH600519"
13+
14+
def test_MinMaxNorm(self):
15+
def normalize(df):
16+
min_val = np.nanmin(df.values, axis=0)
17+
max_val = np.nanmax(df.values, axis=0)
18+
ignore = min_val == max_val
19+
for _i, _con in enumerate(ignore):
20+
if _con:
21+
max_val[_i] = 1
22+
min_val[_i] = 0
23+
df.loc(axis=1)[df.columns] = (df.values - min_val) / (max_val - min_val)
24+
return df
25+
26+
origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
27+
origin_df["test"] = 0
28+
df = origin_df.copy()
29+
mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
30+
mmn.fit(df)
31+
mmn.__call__(df)
32+
origin_df = normalize(origin_df)
33+
assert (df == origin_df).all().all()
34+
35+
def test_ZScoreNorm(self):
36+
def normalize(df):
37+
mean_train = np.nanmean(df.values, axis=0)
38+
std_train = np.nanstd(df.values, axis=0)
39+
ignore = std_train == 0
40+
for _i, _con in enumerate(ignore):
41+
if _con:
42+
std_train[_i] = 1
43+
mean_train[_i] = 0
44+
df.loc(axis=1)[df.columns] = (df.values - mean_train) / std_train
45+
return df
46+
47+
origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
48+
origin_df["test"] = 0
49+
df = origin_df.copy()
50+
zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
51+
zsn.fit(df)
52+
zsn.__call__(df)
53+
origin_df = normalize(origin_df)
54+
assert (df == origin_df).all().all()
55+
56+
def test_CSZFillna(self):
57+
origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
58+
origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[97:99])[228:238]
59+
df = origin_df.copy()
60+
CSZFillna(fields_group=None).__call__(df)
61+
assert ~df[1:2].isna().all().all() and origin_df[1:2].isna().all().all()
62+
63+
def test_CSZScoreNorm(self):
64+
origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
65+
origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[10:12])[50:60]
66+
df = origin_df.copy()
67+
CSZScoreNorm(fields_group=None).__call__(df)
68+
# If we use the formula directly on the original data, we cannot get the correct result,
69+
# because the original data is processed by `groupby`, so we use the method of slicing,
70+
# taking the 2nd group of data from the original data to calculate and compare.
71+
assert (df[2:4] == ((origin_df[2:4] - origin_df[2:4].mean()).div(origin_df[2:4].std()))).all().all()
72+
73+
74+
if __name__ == "__main__":
75+
unittest.main()

0 commit comments

Comments
 (0)