Skip to content

Commit e457ca8

Browse files
authored
Improve annotation & documentation for handler (microsoft#1312)
* Improve annotation & documentation for handler * Add type
1 parent 4dbb8ec commit e457ca8

File tree

1 file changed

+23
-7
lines changed

1 file changed

+23
-7
lines changed

qlib/data/dataset/handler.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas as pd
99

10+
from qlib.typehint import Literal
1011
from ...log import get_module_logger, TimeInspector
1112
from ...utils import init_instance_by_config
1213
from ...utils.serial import Serializable
@@ -49,6 +50,8 @@ class DataHandler(Serializable):
4950
- Fetching data with `col_set=CS_RAW` will return the raw data and may prevent pandas from copying the data when calling `loc`
5051
"""
5152

53+
_data: pd.DataFrame # underlying data.
54+
5255
def __init__(
5356
self,
5457
instruments=None,
@@ -155,6 +158,11 @@ def fetch(
155158
"""
156159
fetch data from underlying data source
157160
161+
Design motivation:
162+
- providing a unified interface for underlying data.
163+
- Potential to make the interface more friendly.
164+
- User can improve performance when fetching data in this extra layer
165+
158166
Parameters
159167
----------
160168
selector : Union[pd.Timestamp, slice, str]
@@ -328,6 +336,9 @@ def get_range_iterator(
328336
yield cur_date, self.fetch(selector, **kwargs)
329337

330338

339+
DATA_KEY_TYPE = Literal["raw", "infer", "learn"]
340+
341+
331342
class DataHandlerLP(DataHandler):
332343
"""
333344
DataHandler with **(L)earnable (P)rocessor**
@@ -353,10 +364,15 @@ class DataHandlerLP(DataHandler):
353364
- `drop_raw=True`: this will modify the data inplace on raw data;
354365
"""
355366

367+
# based on `self._data`, _infer and _learn are generated after processors
368+
_infer: pd.DataFrame # data for inference
369+
_learn: pd.DataFrame # data for learning models
370+
356371
# data key
357-
DK_R = "raw"
358-
DK_I = "infer"
359-
DK_L = "learn"
372+
DK_R: DATA_KEY_TYPE = "raw"
373+
DK_I: DATA_KEY_TYPE = "infer"
374+
DK_L: DATA_KEY_TYPE = "learn"
375+
# map data_key to attribute name
360376
ATTR_MAP = {DK_R: "_data", DK_I: "_infer", DK_L: "_learn"}
361377

362378
# process type
@@ -600,7 +616,7 @@ def setup_data(self, init_type: str = IT_FIT_SEQ, **kwargs):
600616

601617
# TODO: Be able to cache handler data. Save the memory for data processing
602618

603-
def _get_df_by_key(self, data_key: str = DK_I) -> pd.DataFrame:
619+
def _get_df_by_key(self, data_key: DATA_KEY_TYPE = DK_I) -> pd.DataFrame:
604620
if data_key == self.DK_R and self.drop_raw:
605621
raise AttributeError(
606622
"DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
@@ -613,7 +629,7 @@ def fetch(
613629
selector: Union[pd.Timestamp, slice, str] = slice(None, None),
614630
level: Union[str, int] = "datetime",
615631
col_set=DataHandler.CS_ALL,
616-
data_key: str = DK_I,
632+
data_key: DATA_KEY_TYPE = DK_I,
617633
squeeze: bool = False,
618634
proc_func: Callable = None,
619635
) -> pd.DataFrame:
@@ -647,15 +663,15 @@ def fetch(
647663
proc_func=proc_func,
648664
)
649665

650-
def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list:
666+
def get_cols(self, col_set=DataHandler.CS_ALL, data_key: DATA_KEY_TYPE = DK_I) -> list:
651667
"""
652668
get the column names
653669
654670
Parameters
655671
----------
656672
col_set : str
657673
select a set of meaningful columns (e.g. features, columns).
658-
data_key : str
674+
data_key : DATA_KEY_TYPE
659675
the data to fetch: DK_*.
660676
661677
Returns

0 commit comments

Comments
 (0)