Skip to content

init_instance_by_config(task["dataset"]) Take a lot of time #187

@Cjaimesg

Description

@Cjaimesg

❓ Questions and Help

Hi,

I am running the example 'workflow_by_code.ipynb'. This is not the first time I have run it, but today this part of the code is taking a very long time (more than 30 minutes, and it still has not finished).

`dataset = init_instance_by_config(task["dataset"])`

I am using Google Colab
When I force-stop the code, it shows me:

`<---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/qlib/data/data.py in features(self, instruments, fields, start_time, end_time, freq, disk_cache)
974 try:
--> 975 return DatasetD.dataset(instruments, fields, start_time, end_time, freq, disk_cache)
976 except TypeError:

TypeError: dataset() takes from 3 to 6 positional arguments but 7 were given

During handling of the above exception, another exception occurred:

KeyboardInterrupt Traceback (most recent call last)
19 frames
in ()
46 # model initiaiton
47 model = init_instance_by_config(task["model"])
---> 48 dataset = init_instance_by_config(task["dataset"])

/usr/local/lib/python3.6/dist-packages/qlib/utils/init.py in init_instance_by_config(config, module, accept_types, **kwargs)
251
252 klass, cls_kwargs = get_cls_kwargs(config, module)
--> 253 return klass(**cls_kwargs, **kwargs)
254
255

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/init.py in init(self, handler, segments)
86 handler will be passed into setup_data.
87 """
---> 88 super().init(handler, segments)
89
90 def setup_data(self, handler: Union[dict, DataHandler], segments: list):

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/init.py in init(self, *args, **kwargs)
29 The data could specify the info to caculate the essential data for preparation
30 """
---> 31 self.setup_data(*args, **kwargs)
32 super().init()
33

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/init.py in setup_data(self, handler, segments)
117 }
118 """
--> 119 self._handler = init_instance_by_config(handler, accept_types=DataHandler)
120 self._segments = segments.copy()
121

/usr/local/lib/python3.6/dist-packages/qlib/utils/init.py in init_instance_by_config(config, module, accept_types, **kwargs)
251
252 klass, cls_kwargs = get_cls_kwargs(config, module)
--> 253 return klass(**cls_kwargs, **kwargs)
254
255

/usr/local/lib/python3.6/dist-packages/qlib/contrib/data/handler.py in init(self, instruments, start_time, end_time, infer_processors, learn_processors, fit_start_time, fit_end_time, process_type, **kwargs)
154 infer_processors=infer_processors,
155 learn_processors=learn_processors,
--> 156 process_type=process_type,
157 )
158

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/handler.py in init(self, instruments, start_time, end_time, data_loader, infer_processors, learn_processors, process_type, **kwargs)
320
321 self.process_type = process_type
--> 322 super().init(instruments, start_time, end_time, data_loader, **kwargs)
323
324 def get_all_processors(self):

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/handler.py in init(self, instruments, start_time, end_time, data_loader, init_data, fetch_orig)
96 if init_data:
97 with TimeInspector.logt("Init data"):
---> 98 self.init()
99 super().init()
100

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/handler.py in init(self, init_type, enable_cache)
401 """
402 # init raw data
--> 403 super().init(enable_cache=enable_cache)
404
405 with TimeInspector.logt("fit & process data"):

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/handler.py in init(self, enable_cache)
120 # _data may be with multiple column index level. The outer level indicates the feature set name
121 with TimeInspector.logt("Loading data"):
--> 122 self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
123 # TODO: cache
124

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/loader.py in load(self, instruments, start_time, end_time)
120 {
121 grp: self.load_group_df(instruments, exprs, names, start_time, end_time)
--> 122 for grp, (exprs, names) in self.fields.items()
123 },
124 axis=1,

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/loader.py in (.0)
120 {
121 grp: self.load_group_df(instruments, exprs, names, start_time, end_time)
--> 122 for grp, (exprs, names) in self.fields.items()
123 },
124 axis=1,

/usr/local/lib/python3.6/dist-packages/qlib/data/dataset/loader.py in load_group_df(self, instruments, exprs, names, start_time, end_time)
154 warnings.warn("filter_pipe is not None, but it will not be used with instruments as list")
155
--> 156 df = D.features(instruments, exprs, start_time, end_time)
157 df.columns = names
158 df = df.swaplevel().sort_index() # NOTE: always return <datetime, instrument>

/usr/local/lib/python3.6/dist-packages/qlib/data/data.py in features(self, instruments, fields, start_time, end_time, freq, disk_cache)
975 return DatasetD.dataset(instruments, fields, start_time, end_time, freq, disk_cache)
976 except TypeError:
--> 977 return DatasetD.dataset(instruments, fields, start_time, end_time, freq)
978
979

/usr/local/lib/python3.6/dist-packages/qlib/data/data.py in dataset(self, instruments, fields, start_time, end_time, freq)
706 end_time = cal[-1]
707
--> 708 data = self.dataset_processor(instruments_d, column_names, start_time, end_time, freq)
709
710 return data

/usr/local/lib/python3.6/dist-packages/qlib/data/data.py in dataset_processor(instruments_d, column_names, start_time, end_time, freq)
456
457 p.close()
--> 458 p.join()
459
460 new_data = dict()

/usr/lib/python3.6/multiprocessing/pool.py in join(self)
544 util.debug('joining pool')
545 assert self._state in (CLOSE, TERMINATE)
--> 546 self._worker_handler.join()
547 self._task_handler.join()
548 self._result_handler.join()

/usr/lib/python3.6/threading.py in join(self, timeout)
1054
1055 if timeout is None:
-> 1056 self._wait_for_tstate_lock()
1057 else:
1058 # the behavior of a negative timeout isn't documented, but

/usr/lib/python3.6/threading.py in _wait_for_tstate_lock(self, block, timeout)
1070 if lock is None: # already determined that the C code is done
1071 assert self._is_stopped
-> 1072 elif lock.acquire(block, timeout):
1073 lock.release()
1074 self._stop()>`

The qlib version is '0.6.1'
The Python version is '3.6.9'

Thanks.

Metadata

Metadata

Assignees

Labels

question: Further information is requested

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions