microsoft · you-n-g · May 17, 2021 · Feb 16, 2021 · Feb 17, 2021 · Feb 26, 2021
diff --git a/examples/workflow_task_rolling.ipynb b/examples/workflow_task_rolling.ipynb
@@ -0,0 +1,177 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import qlib\n",
+    "from qlib.config import REG_CN\n",
+    "from qlib.workflow.task.gen import RollingGen, task_generator\n",
+    "from qlib.workflow.task.manage import TaskManager\n",
+    "from qlib.config import C\n",
+    "\n",
+    "data_handler_config = {\n",
+    "    \"start_time\": \"2008-01-01\",\n",
+    "    \"end_time\": \"2020-08-01\",\n",
+    "    \"fit_start_time\": \"2008-01-01\",\n",
+    "    \"fit_end_time\": \"2014-12-31\",\n",
+    "    \"instruments\": 'csi100',\n",
+    "}\n",
+    "\n",
+    "dataset_config = {\n",
+    "        \"class\": \"DatasetH\",\n",
+    "        \"module_path\": \"qlib.data.dataset\",\n",
+    "        \"kwargs\": {\n",
+    "            \"handler\": {\n",
+    "                \"class\": \"Alpha158\",\n",
+    "                \"module_path\": \"qlib.contrib.data.handler\",\n",
+    "                \"kwargs\": data_handler_config,\n",
+    "            },\n",
+    "            \"segments\": {\n",
+    "                \"train\": (\"2008-01-01\", \"2014-12-31\"),\n",
+    "                \"valid\": (\"2015-01-01\", \"2016-12-31\"),\n",
+    "                \"test\": (\"2017-01-01\", \"2020-08-01\"),\n",
+    "            },\n",
+    "        },\n",
+    "    }\n",
+    "\n",
+    "record_config = [\n",
+    "    {\n",
+    "        \"class\": \"SignalRecord\",\n",
+    "        \"module_path\": \"qlib.workflow.record_temp\",\n",
+    "    },\n",
+    "    {\n",
+    "        \"class\": \"SigAnaRecord\",\n",
+    "        \"module_path\": \"qlib.workflow.record_temp\",\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "# use lgb\n",
+    "task_lgb_config = {\n",
+    "    \"model\": {\n",
+    "        \"class\": \"LGBModel\",\n",
+    "        \"module_path\": \"qlib.contrib.model.gbdt\",\n",
+    "    },\n",
+    "    \"dataset\": dataset_config,\n",
+    "    \"record\": record_config,\n",
+    "}\n",
+    "\n",
+    "# use xgboost\n",
+    "task_xgboost_config = {\n",
+    "    \"model\": {\n",
+    "        \"class\": \"XGBModel\",\n",
+    "        \"module_path\": \"qlib.contrib.model.xgboost\",\n",
+    "    },\n",
+    "    \"dataset\": dataset_config,\n",
+    "    \"record\": record_config,\n",
+    "}\n",
+    "provider_uri = r\"../qlib-main/qlib_data/cn_data\"\n",
+    "#provider_uri = \"~/.qlib/qlib_data/cn_data\"  # target_dir\n",
+    "qlib.init(provider_uri=provider_uri, region=REG_CN)\n",
+    "\n",
+    "C[\"mongo\"] = {\n",
+    "    \"task_url\" : \"mongodb://localhost:27017/\", # maybe you need to change it to your url\n",
+    "    \"task_db_name\" : \"rolling_db\"\n",
+    "}\n",
+    "\n",
+    "exp_name = 'rolling_exp' # experiment name, will be used as the experiment in MLflow\n",
+    "task_pool = 'rolling_task' # task pool name, will be used as the collection in MongoDB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "tasks = task_generator(\n",
+    "    task_xgboost_config, # default task name\n",
+    "    RollingGen(step=550,rtype=RollingGen.ROLL_SD), # generate different date segment\n",
+    "    task_lgb=task_lgb_config # use \"task_lgb\" as the task name\n",
+    ")\n",
+    "# Uncomment next two lines to see the generated tasks\n",
+    "# from pprint import pprint\n",
+    "# pprint(tasks)\n",
+    "tm = TaskManager(task_pool=task_pool)\n",
+    "tm.create_task(tasks) # all tasks will be saved to MongoDB"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "from qlib.workflow.task.manage import run_task\n",
+    "from qlib.workflow.task.collect import RollingCollector\n",
+    "from qlib.model.trainer import task_train\n",
+    "\n",
+    "run_task(task_train, task_pool, experiment_name=exp_name) # all tasks will be trained using \"task_train\" method"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "def get_task_key(task_config):\n",
+    "    task_key = task_config[\"task_key\"]\n",
+    "    rolling_end_timestamp = task_config[\"dataset\"][\"kwargs\"][\"segments\"][\"test\"][1]\n",
+    "    rolling_end_datatime = rolling_end_timestamp.to_pydatetime()\n",
+    "    return task_key, rolling_end_datatime.strftime('%Y-%m-%d')\n",
+    "\n",
+    "def my_filter(task_config):\n",
+    "    # only keep the results of \"task_lgb\" whose test segment ends in 2019\n",
+    "    task_key, rolling_end = get_task_key(task_config)\n",
+    "    if task_key==\"task_lgb\" and rolling_end.startswith('2019'):\n",
+    "        return True\n",
+    "    return False\n",
+    "\n",
+    "collector = RollingCollector(get_task_key, my_filter)\n",
+    "pred_rolling = collector(exp_name) # name tasks by \"get_task_key\" and filter tasks by \"my_filter\"\n",
+    "pred_rolling"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py
@@ -27,16 +27,22 @@ def task_train(task_config: dict, experiment_name):
         model.fit(dataset)
         recorder = R.get_recorder()
         R.save_objects(**{"params.pkl": model})
+        R.save_objects(param=task_config)  # keep the original format and datatype
 
         # generate records: prediction, backtest, and analysis
-        for record in task_config["record"]:
+        records = task_config.get('record', [])
+        if isinstance(records, dict):  # prevent only one dict
+            records = [records]
+        for record in records:
             if record["class"] == SignalRecord.__name__:
                 srconf = {"model": model, "dataset": dataset, "recorder": recorder}
+                record.setdefault("kwargs", {})
                 record["kwargs"].update(srconf)
                 sr = init_instance_by_config(record)
                 sr.generate()
             else:
                 rconf = {"recorder": recorder}
+                record.setdefault("kwargs", {})
                 record["kwargs"].update(rconf)
                 ar = init_instance_by_config(record)
                 ar.generate()
diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py
@@ -4,17 +4,17 @@
 from tqdm.auto import tqdm
 
 
-class RollingEnsemble:
+class RollingCollector:
     """
     Rolling Models Ensemble based on (R)ecord
 
     This shares nothing with Ensemble
     """
 
-    # TODO: 这边还可以加加速
+    # TODO: speed up this class
     def __init__(self, get_key_func, flt_func=None):
-        self.get_key_func = get_key_func
-        self.flt_func = flt_func
+        self.get_key_func = get_key_func  # user need to implement this method to get the key of a task based on task config
+        self.flt_func = flt_func  # determine whether a task can be retained based on task config
 
     def __call__(self, exp_name) -> Union[pd.Series, dict]:
         # TODO;
@@ -26,7 +26,6 @@ def __call__(self, exp_name) -> Union[pd.Series, dict]:
 
         recs_flt = {}
         for rid, rec in tqdm(recs.items(), desc="Loading data"):
-            # rec = exp.get_recorder(recorder_id=rid)
             params = rec.load_object("param")
             if rec.status == rec.STATUS_FI:
                 if self.flt_func is None or self.flt_func(params):

diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py
@@ -9,11 +9,64 @@
 from .utils import TimeAdjuster
 
 
+def task_generator(*args, **kwargs) -> list:
+    """
+    Accept the dict of task config and the TaskGen to generate different tasks.
+    There is no limit to the number and position of input.
+    The key of input will add to task config.
+
+    for example:
+        There are 3 task_config(a,b,c) and 2 TaskGen(A,B). A will double the task_config and B will triple.
+        task_generator(a=a, b=b, c=c, A=A, B=B) will finally generate 18 task_config.
+
+    Parameters
+    ----------
+    args : dict or TaskGen
+    kwargs : dict or TaskGen
+
+    Returns
+    -------
+    gen_task_list : list
+        a list of task config after generating
+    """
+    tasks_list = []
+    gen_list = []
+
+    tmp_id = 1
+    for task in args:
+        if isinstance(task, dict):
+            task["task_key"] = tmp_id
+            tmp_id += 1
+            tasks_list.append(task)
+        elif isinstance(task, TaskGen):
+            gen_list.append(task)
+        else:
+            raise NotImplementedError(f"{type(task)} is not supported in task_generator")
+
+    for key, task in kwargs.items():
+        if isinstance(task, dict):
+            task["task_key"] = key
+            tasks_list.append(task)
+        elif isinstance(task, TaskGen):
+            gen_list.append(task)
+        else:
+            raise NotImplementedError(f"{type(task)} is not supported in task_generator")
+
+    # generate gen_task_list
+    gen_task_list = []
+    for gen in gen_list:
+        new_task_list = []
+        for task in tasks_list:
+            new_task_list.extend(gen(task))
+        gen_task_list = new_task_list
+    return gen_task_list
+
+
 class TaskGen(metaclass=abc.ABCMeta):
     @abc.abstractmethod
     def __call__(self, *args, **kwargs) -> typing.List[dict]:
         """
-        generate
+        the base class for generating different tasks
 
         Parameters
         ----------
@@ -35,9 +88,8 @@ def __call__(self, *args, **kwargs) -> typing.List[dict]:
 
 
 class RollingGen(TaskGen):
-
-    ROLL_EX = TimeAdjuster.SHIFT_EX
-    ROLL_SD = TimeAdjuster.SHIFT_SD
+    ROLL_EX = TimeAdjuster.SHIFT_EX  # fixed start date, expanding end date
+    ROLL_SD = TimeAdjuster.SHIFT_SD  # fixed window size, slide it from start date
 
     def __init__(self, step: int = 40, rtype: str = ROLL_EX):
         """
@@ -48,7 +100,7 @@ def __init__(self, step: int = 40, rtype: str = ROLL_EX):
         step : int
             step to rolling
         rtype : str
-            rolling type (expanding, rolling)
+            rolling type (expanding, sliding)
         """
         self.step = step
         self.rtype = rtype
@@ -111,12 +163,12 @@ def __call__(self, task: dict):
                 segments = {}
                 try:
                     for k, seg in prev_seg.items():
-                        # 决定怎么shift
+                        # decide how to shift
                         if k == self.train_key and self.rtype == self.ROLL_EX:
                             rtype = self.ta.SHIFT_EX
                         else:
                             rtype = self.ta.SHIFT_SD
-                        # 整段数据做shift
+                        # shift the segments data
                         segments[k] = self.ta.shift(seg, step=self.step, rtype=rtype)
                     if segments[self.test_key][0] > test_end:
                         break

diff --git a/qlib/workflow/task/manage.py b/qlib/workflow/task/manage.py
@@ -36,12 +36,8 @@ class TaskManager:
     The tasks manager assume that you will only update the tasks you fetched.
     The mongo fetch one and update will make it date updating secure.
 
-    Usage Examples from the CLI.
-    python -m blocks.tasks.__init__ task_stat --task_pool meta_task_rule
-
-
     NOTE:
-    - 假设： 存储在db里面的都是encode过的， 拿出来的都是decode过的
+    - assumption: the data in MongoDB was encoded and the data out of MongoDB was decoded
     """
 
     STATUS_WAITING = "waiting"
@@ -85,7 +81,7 @@ def _dict_to_str(self, flt):
         return {k: str(v) for k, v in flt.items()}
 
     def replace_task(self, task, new_task, task_pool=None):
-        # 这里的假设是从接口拿出来的都是decode过的，在接口内部的都是 encode过的
+        # assumption: data taken out through the interface is decoded, data inside the interface is encoded
         new_task = self._encode_task(new_task)
         task_pool = self._get_task_pool(task_pool)
         query = {"_id": ObjectId(task["_id"])}