Skip to content

Commit cd5e5d5

Browse files
authored
fast fillna (#1074)
* fast fillna
* fix TSDataSampler bug
1 parent caea495 commit cd5e5d5

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

qlib/data/dataset/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ def __init__(
350350
flt_data = flt_data.reindex(self.data_index).fillna(False).astype(np.bool)
351351
self.flt_data = flt_data.values
352352
self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
353-
self.data_index = self.data_index[np.where(self.flt_data is True)[0]]
353+
self.data_index = self.data_index[np.where(self.flt_data)[0]]
354354
self.idx_map = self.idx_map2arr(self.idx_map)
355355

356356
self.start_idx, self.end_idx = self.data_index.slice_locs(

qlib/data/dataset/processor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,13 @@ def __call__(self, df):
187187
df.fillna(self.fill_value, inplace=True)
188188
else:
189189
cols = get_group_columns(df, self.fields_group)
190-
df.fillna({col: self.fill_value for col in cols}, inplace=True)
190+
# this implementation is extremely slow
191+
# df.fillna({col: self.fill_value for col in cols}, inplace=True)
192+
193+
# So we use numpy to accelerate filling values
194+
nan_select = np.isnan(df.values)
195+
nan_select[:, ~df.columns.isin(cols)] = False
196+
df.values[nan_select] = self.fill_value
191197
return df
192198

193199

0 commit comments

Comments
 (0)