Skip to content

Commit d087054

Browse files
authored
Add Cache to avoid frequently loading calendar (#766)
1 parent 350fbe9 commit d087054

File tree

6 files changed

+101
-52
lines changed

6 files changed

+101
-52
lines changed

qlib/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@
1515

1616
# init qlib
1717
def init(default_conf="client", **kwargs):
18+
"""
19+
20+
Parameters
21+
----------
22+
**kwargs :
23+
clear_mem_cache: bool
24+
the default value is True;
25+
Whether the memory cache will be cleared.
26+
Setting it to False is often used to improve performance when init is called multiple times
27+
"""
1828
from .config import C
1929
from .data.cache import H
2030

@@ -28,7 +38,9 @@ def init(default_conf="client", **kwargs):
2838
logger.warning("Skip initialization because `skip_if_reg is True`")
2939
return
3040

31-
H.clear()
41+
clear_mem_cache = kwargs.pop("clear_mem_cache", True)
42+
if clear_mem_cache:
43+
H.clear()
3244
C.set(default_conf, **kwargs)
3345

3446
# mount nfs

qlib/config.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
- server
1111
1212
"""
13+
from __future__ import annotations
1314

1415
import os
1516
import re
@@ -18,7 +19,11 @@
1819
import platform
1920
import multiprocessing
2021
from pathlib import Path
21-
from typing import Union
22+
from typing import Optional, Union
23+
from typing import TYPE_CHECKING
24+
25+
if TYPE_CHECKING:
26+
from qlib.utils.time import Freq
2227

2328

2429
class Config:
@@ -296,7 +301,9 @@ def get_uri_type(uri: Union[str, Path]):
296301
else:
297302
return QlibConfig.LOCAL_URI
298303

299-
def get_data_uri(self, freq: str = None) -> Path:
304+
def get_data_uri(self, freq: Optional[Union[str, Freq]] = None) -> Path:
305+
if freq is not None:
306+
freq = str(freq) # converting Freq to string
300307
if freq is None or freq not in self.provider_uri:
301308
freq = QlibConfig.DEFAULT_FREQ
302309
_provider_uri = self.provider_uri[freq]

qlib/data/data.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727

2828
from ..log import get_module_logger
2929
from ..utils.time import Freq
30-
from ..utils.resam import resam_calendar
3130
from .cache import DiskDatasetCache, DiskExpressionCache
3231
from ..utils import (
3332
Wrapper,

qlib/data/storage/file_storage.py

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from qlib.utils.time import Freq
1212
from qlib.utils.resam import resam_calendar
1313
from qlib.config import C
14+
from qlib.data.cache import H
1415
from qlib.log import get_module_logger
1516
from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage, CalVT, InstKT, InstVT
1617

@@ -33,15 +34,15 @@ def support_freq(self) -> List[str]:
3334
if hasattr(self, _v):
3435
return getattr(self, _v)
3536
if len(self.provider_uri) == 1 and C.DEFAULT_FREQ in self.provider_uri:
36-
freq = filter(
37+
freq_l = filter(
3738
lambda _freq: not _freq.endswith("_future"),
3839
map(lambda x: x.stem, self.dpm.get_data_uri(C.DEFAULT_FREQ).joinpath("calendars").glob("*.txt")),
3940
)
4041
else:
41-
freq = self.provider_uri.keys()
42-
freq = list(freq)
43-
setattr(self, _v, freq)
44-
return freq
42+
freq_l = self.provider_uri.keys()
43+
freq_l = [Freq(freq) for freq in freq_l]
44+
setattr(self, _v, freq_l)
45+
return freq_l
4546

4647
@property
4748
def uri(self) -> Path:
@@ -65,15 +66,28 @@ def __init__(self, freq: str, future: bool, provider_uri: dict, **kwargs):
6566
super(FileCalendarStorage, self).__init__(freq, future, **kwargs)
6667
self.future = future
6768
self.provider_uri = C.DataPathManager.format_provider_uri(provider_uri)
68-
self.resample_freq = None
69+
self.enable_read_cache = True # TODO: make it configurable
6970

7071
@property
7172
def file_name(self) -> str:
72-
return f"{self.use_freq}_future.txt" if self.future else f"{self.use_freq}.txt".lower()
73+
return f"{self._freq_file}_future.txt" if self.future else f"{self._freq_file}.txt".lower()
7374

7475
@property
75-
def use_freq(self) -> str:
76-
return self.freq if self.resample_freq is None else self.resample_freq
76+
def _freq_file(self) -> str:
77+
"""the freq to read from file"""
78+
if not hasattr(self, "_freq_file_cache"):
79+
freq = Freq(self.freq)
80+
if freq not in self.support_freq:
81+
# NOTE: uri
82+
# 1. If `uri` does not exist
83+
# - Get the `min_uri` of the closest `freq` under the same "directory" as the `uri`
84+
# - Read data from `min_uri` and resample to `freq`
85+
86+
freq = Freq.get_recent_freq(freq, self.support_freq)
87+
if freq is None:
88+
raise ValueError(f"can't find a freq from {self.support_freq} that can resample to {self.freq}!")
89+
self._freq_file_cache = freq
90+
return self._freq_file_cache
7791

7892
def _read_calendar(self, skip_rows: int = 0, n_rows: int = None) -> List[CalVT]:
7993
if not self.uri.exists():
@@ -90,25 +104,21 @@ def _write_calendar(self, values: Iterable[CalVT], mode: str = "wb"):
90104

91105
@property
92106
def uri(self) -> Path:
93-
freq = self.freq
94-
if freq not in self.support_freq:
95-
# NOTE: uri
96-
# 1. If `uri` does not exist
97-
# - Get the `min_uri` of the closest `freq` under the same "directory" as the `uri`
98-
# - Read data from `min_uri` and resample to `freq`
99-
100-
freq = Freq.get_recent_freq(freq, self.support_freq)
101-
if freq is None:
102-
raise ValueError(f"can't find a freq from {self.support_freq} that can resample to {self.freq}!")
103-
self.resample_freq = freq
104-
return self.dpm.get_data_uri(self.use_freq).joinpath(f"{self.storage_name}s", self.file_name)
107+
return self.dpm.get_data_uri(self._freq_file).joinpath(f"{self.storage_name}s", self.file_name)
105108

106109
@property
107110
def data(self) -> List[CalVT]:
108111
self.check()
109-
_calendar = self._read_calendar()
110-
if self.resample_freq is not None:
111-
_calendar = resam_calendar(np.array(list(map(pd.Timestamp, _calendar))), self.resample_freq, self.freq)
112+
# If cache is enabled, then return cache directly
113+
if self.enable_read_cache:
114+
key = "orig_file" + str(self.uri)
115+
if not key in H["c"]:
116+
H["c"][key] = self._read_calendar()
117+
_calendar = H["c"][key]
118+
else:
119+
_calendar = self._read_calendar()
120+
if Freq(self._freq_file) != Freq(self.freq):
121+
_calendar = resam_calendar(np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq)
112122
return _calendar
113123

114124
def _get_storage_freq(self) -> List[str]:

qlib/utils/resam.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .time import Freq, cal_sam_minute
99

1010

11-
def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np.ndarray:
11+
def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sam: Union[str, Freq]) -> np.ndarray:
1212
"""
1313
Resample the calendar with frequency freq_raw into the calendar with frequency freq_sam
1414
Assumption:
@@ -28,36 +28,36 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np
2828
np.ndarray
2929
The calendar with frequency freq_sam
3030
"""
31-
raw_count, freq_raw = Freq.parse(freq_raw)
32-
sam_count, freq_sam = Freq.parse(freq_sam)
31+
freq_raw = Freq(freq_raw)
32+
freq_sam = Freq(freq_sam)
3333
if not len(calendar_raw):
3434
return calendar_raw
3535

3636
# if freq_sam is xminute, divide each trading day into several bars evenly
37-
if freq_sam == Freq.NORM_FREQ_MINUTE:
38-
if freq_raw != Freq.NORM_FREQ_MINUTE:
37+
if freq_sam.base == Freq.NORM_FREQ_MINUTE:
38+
if freq_raw.base != Freq.NORM_FREQ_MINUTE:
3939
raise ValueError("when sampling minute calendar, freq of raw calendar must be minute or min")
4040
else:
41-
if raw_count > sam_count:
41+
if freq_raw.count > freq_sam.count:
4242
raise ValueError("raw freq must be higher than sampling freq")
43-
_calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, sam_count), calendar_raw)))
43+
_calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, freq_sam.count), calendar_raw)))
4444
return _calendar_minute
4545

4646
# else, convert the raw calendar into day calendar, and divide the whole calendar into several bars evenly
4747
else:
4848
_calendar_day = np.unique(list(map(lambda x: pd.Timestamp(x.year, x.month, x.day, 0, 0, 0), calendar_raw)))
49-
if freq_sam == Freq.NORM_FREQ_DAY:
50-
return _calendar_day[::sam_count]
49+
if freq_sam.base == Freq.NORM_FREQ_DAY:
50+
return _calendar_day[:: freq_sam.count]
5151

52-
elif freq_sam == Freq.NORM_FREQ_WEEK:
52+
elif freq_sam.base == Freq.NORM_FREQ_WEEK:
5353
_day_in_week = np.array(list(map(lambda x: x.dayofweek, _calendar_day)))
5454
_calendar_week = _calendar_day[np.ediff1d(_day_in_week, to_begin=-1) < 0]
55-
return _calendar_week[::sam_count]
55+
return _calendar_week[:: freq_sam.count]
5656

57-
elif freq_sam == Freq.NORM_FREQ_MONTH:
57+
elif freq_sam.base == Freq.NORM_FREQ_MONTH:
5858
_day_in_month = np.array(list(map(lambda x: x.day, _calendar_day)))
5959
_calendar_month = _calendar_day[np.ediff1d(_day_in_month, to_begin=-1) < 0]
60-
return _calendar_month[::sam_count]
60+
return _calendar_month[:: freq_sam.count]
6161
else:
6262
raise ValueError("sampling freq must be xmin, xd, xw, xm")
6363

qlib/utils/time.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66
import bisect
77
from datetime import datetime, time, date
8-
from typing import List, Tuple, Union
8+
from typing import List, Optional, Tuple, Union
99
import functools
1010
import re
1111

@@ -69,13 +69,29 @@ class Freq:
6969
NORM_FREQ_MONTH = "month"
7070
NORM_FREQ_WEEK = "week"
7171
NORM_FREQ_DAY = "day"
72-
NORM_FREQ_MINUTE = "minute"
72+
NORM_FREQ_MINUTE = "min" # using "min" instead of "minute" to align with Qlib's data filenames
7373
SUPPORT_CAL_LIST = [NORM_FREQ_MINUTE, NORM_FREQ_DAY] # FIXME: this list should from data
7474

7575
MIN_CAL = get_min_cal()
7676

77-
def __init__(self, freq: str) -> None:
78-
self.count, self.base = self.parse(freq)
77+
def __init__(self, freq: Union[str, "Freq"]) -> None:
78+
if isinstance(freq, str):
79+
self.count, self.base = self.parse(freq)
80+
elif isinstance(freq, Freq):
81+
self.count, self.base = freq.count, freq.base
82+
else:
83+
raise NotImplementedError(f"This type of input is not supported")
84+
85+
def __eq__(self, freq):
86+
freq = Freq(freq)
87+
return freq.count == self.count and freq.base == self.base
88+
89+
def __str__(self):
90+
# trying to align to the filename of Qlib: day, 30min, 5min, 1min...
91+
return f"{self.count if self.count != 1 or self.base != 'day' else ''}{self.base}"
92+
93+
def __repr__(self) -> str:
94+
return f"{self.__class__.__name__}({str(self)})"
7995

8096
@staticmethod
8197
def parse(freq: str) -> Tuple[int, str]:
@@ -159,14 +175,14 @@ def get_min_delta(left_frq: str, right_freq: str):
159175
Freq.NORM_FREQ_WEEK: 7 * 60 * 24,
160176
Freq.NORM_FREQ_MONTH: 30 * 7 * 60 * 24,
161177
}
162-
left_freq = Freq.parse(left_frq)
163-
left_minutes = left_freq[0] * minutes_map[left_freq[1]]
164-
right_freq = Freq.parse(right_freq)
165-
right_minutes = right_freq[0] * minutes_map[right_freq[1]]
178+
left_freq = Freq(left_frq)
179+
left_minutes = left_freq.count * minutes_map[left_freq.base]
180+
right_freq = Freq(right_freq)
181+
right_minutes = right_freq.count * minutes_map[right_freq.base]
166182
return left_minutes - right_minutes
167183

168184
@staticmethod
169-
def get_recent_freq(base_freq: str, freq_list: List[str]) -> str:
185+
def get_recent_freq(base_freq: Union[str, "Freq"], freq_list: List[Union[str, "Freq"]]) -> Optional["Freq"]:
170186
"""Get the closest freq to base_freq from freq_list
171187
172188
Parameters
@@ -176,17 +192,22 @@ def get_recent_freq(base_freq: str, freq_list: List[str]) -> str:
176192
177193
Returns
178194
-------
179-
195+
if the recent frequency is found
196+
Freq
197+
else:
198+
None
180199
"""
200+
base_freq = Freq(base_freq)
181201
# use the nearest freq greater than 0
182202
_freq_minutes = []
183203
min_freq = None
184204
for _freq in freq_list:
205+
freq = Freq(_freq)
185206
_min_delta = Freq.get_min_delta(base_freq, _freq)
186207
if _min_delta < 0:
187208
continue
188209
if min_freq is None:
189-
min_freq = (_min_delta, _freq)
210+
min_freq = (_min_delta, str(_freq))
190211
continue
191212
min_freq = min_freq if min_freq[0] <= _min_delta else (_min_delta, _freq)
192213
return min_freq[1] if min_freq else None

0 commit comments

Comments
 (0)