fast fillna (#1074)

you-n-g · web-flow · commit cd5e5d52350d · 2022-04-24T23:24:32.000+08:00
* fast fillna

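* fix TSDataSampler bug: `self.flt_data is True` is an identity test that is always False for an array, so pass the boolean mask itself to `np.where`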
diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
@@ -350,7 +350,7 @@ def __init__(
             flt_data = flt_data.reindex(self.data_index).fillna(False).astype(np.bool)
             self.flt_data = flt_data.values
             self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
-            self.data_index = self.data_index[np.where(self.flt_data is True)[0]]
+            self.data_index = self.data_index[np.where(self.flt_data)[0]]
         self.idx_map = self.idx_map2arr(self.idx_map)
 
         self.start_idx, self.end_idx = self.data_index.slice_locs(
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
@@ -187,7 +187,13 @@ def __call__(self, df):
             df.fillna(self.fill_value, inplace=True)
         else:
             cols = get_group_columns(df, self.fields_group)
-            df.fillna({col: self.fill_value for col in cols}, inplace=True)
+            # The dict-based fillna below is extremely slow:
+            # df.fillna({col: self.fill_value for col in cols}, inplace=True)
+
+            # So fill NaNs via a numpy boolean mask limited to `cols` (NOTE(review): assumes df.values is a writable numeric view - confirm)
+            nan_select = np.isnan(df.values)
+            nan_select[:, ~df.columns.isin(cols)] = False
+            df.values[nan_select] = self.fill_value
         return df