77
88import pandas as pd
99
10+ from qlib .typehint import Literal
1011from ...log import get_module_logger , TimeInspector
1112from ...utils import init_instance_by_config
1213from ...utils .serial import Serializable
@@ -49,6 +50,8 @@ class DataHandler(Serializable):
4950 - Fetching data with `col_set=CS_RAW` will return the raw data and may avoid pandas from copying the data when calling `loc`
5051 """
5152
53+ _data : pd .DataFrame # underlying data.
54+
5255 def __init__ (
5356 self ,
5457 instruments = None ,
@@ -155,6 +158,11 @@ def fetch(
155158 """
156159 fetch data from underlying data source
157160
161+ Design motivation:
162+ - Provide a unified interface for the underlying data.
163+ - Leave room to make the interface more user-friendly.
164+ - Let users improve performance when fetching data through this extra layer.
165+
158166 Parameters
159167 ----------
160168 selector : Union[pd.Timestamp, slice, str]
@@ -328,6 +336,9 @@ def get_range_iterator(
328336 yield cur_date , self .fetch (selector , ** kwargs )
329337
330338
339+ DATA_KEY_TYPE = Literal ["raw" , "infer" , "learn" ]
340+
341+
331342class DataHandlerLP (DataHandler ):
332343 """
333344 DataHandler with **(L)earnable (P)rocessor**
@@ -353,10 +364,15 @@ class DataHandlerLP(DataHandler):
353364 - `drop_raw=True`: this will modify the data inplace on raw data;
354365 """
355366
367+ # based on `self._data`, _infer and _learn are generated after processors
368+ _infer : pd .DataFrame # data for inference
369+ _learn : pd .DataFrame # data for learning models
370+
356371 # data key
357- DK_R = "raw"
358- DK_I = "infer"
359- DK_L = "learn"
372+ DK_R : DATA_KEY_TYPE = "raw"
373+ DK_I : DATA_KEY_TYPE = "infer"
374+ DK_L : DATA_KEY_TYPE = "learn"
375+ # map data_key to attribute name
360376 ATTR_MAP = {DK_R : "_data" , DK_I : "_infer" , DK_L : "_learn" }
361377
362378 # process type
@@ -600,7 +616,7 @@ def setup_data(self, init_type: str = IT_FIT_SEQ, **kwargs):
600616
601617 # TODO: Be able to cache handler data. Save the memory for data processing
602618
603- def _get_df_by_key (self , data_key : str = DK_I ) -> pd .DataFrame :
619+ def _get_df_by_key (self , data_key : DATA_KEY_TYPE = DK_I ) -> pd .DataFrame :
604620 if data_key == self .DK_R and self .drop_raw :
605621 raise AttributeError (
606622 "DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
@@ -613,7 +629,7 @@ def fetch(
613629 selector : Union [pd .Timestamp , slice , str ] = slice (None , None ),
614630 level : Union [str , int ] = "datetime" ,
615631 col_set = DataHandler .CS_ALL ,
616- data_key : str = DK_I ,
632+ data_key : DATA_KEY_TYPE = DK_I ,
617633 squeeze : bool = False ,
618634 proc_func : Callable = None ,
619635 ) -> pd .DataFrame :
@@ -647,15 +663,15 @@ def fetch(
647663 proc_func = proc_func ,
648664 )
649665
650- def get_cols (self , col_set = DataHandler .CS_ALL , data_key : str = DK_I ) -> list :
666+ def get_cols (self , col_set = DataHandler .CS_ALL , data_key : DATA_KEY_TYPE = DK_I ) -> list :
651667 """
652668 get the column names
653669
654670 Parameters
655671 ----------
656672 col_set : str
657673 select a set of meaningful columns (e.g. features, columns).
658- data_key : str
674+ data_key : DATA_KEY_TYPE
659675 the data to fetch: DK_*.
660676
661677 Returns
0 commit comments