From 632255d9d9a035f97495be2c36af7d93208bf469 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 8 Jun 2022 17:44:12 +0800 Subject: [PATCH 1/2] Stage code --- examples/benchmarks/LightGBM/requirements.txt | 2 +- ...w_config_lightgbm_Alpha158_multi_freq.yaml | 4 ++-- examples/benchmarks/README.md | 4 ++-- examples/run_all_model.py | 21 ++++++++++++++++--- qlib/model/ens/ensemble.py | 3 +++ qlib/tests/data.py | 3 ++- 6 files changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/benchmarks/LightGBM/requirements.txt b/examples/benchmarks/LightGBM/requirements.txt index 3f455556b85..4ffcc6f8a30 100644 --- a/examples/benchmarks/LightGBM/requirements.txt +++ b/examples/benchmarks/LightGBM/requirements.txt @@ -1,3 +1,3 @@ pandas==1.1.2 numpy==1.21.0 -lightgbm==3.1.0 +lightgbm diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index 3d0a7859c77..a9f219155f3 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -5,8 +5,8 @@ qlib_init: region: cn dataset_cache: null maxtasksperchild: 1 -market: &market csi300 -benchmark: &benchmark SH000300 +market: &market csi500 +benchmark: &benchmark SH000500 data_handler_config: &data_handler_config start_time: 2008-01-01 # 1min closing time is 15:00:00 diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index e1616f4fd26..20b84ecde7f 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -20,7 +20,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of > NOTE: > We have very limited resources to implement and finetune the models. We tried our best effort to fairly compare these models. But some models may have greater potential than what it looks like in the table below. 
Your contribution is highly welcomed to explore their potential. -## Alpha158 dataset +## Alpha158 dataset (csi300) | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| @@ -44,7 +44,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | DoubleEnsemble(Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4340±0.00 | 0.0523±0.00 | 0.4284±0.01 | 0.1168±0.01 | 1.3384±0.12 | -0.1036±0.01 | -## Alpha360 dataset +## Alpha360 dataset (csi500) | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |-------------------------------------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| diff --git a/examples/run_all_model.py b/examples/run_all_model.py index 71ce10a4110..46f9c6a5d76 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -117,8 +117,10 @@ def get_all_folders(models, exclude) -> dict: # function to get all the files under the model folder -def get_all_files(folder_path, dataset) -> (str, str): - yaml_path = str(Path(f"{folder_path}") / f"*{dataset}*.yaml") +def get_all_files(folder_path, dataset, universe="") -> (str, str): + if universe != "": + universe = f"_{universe}" + yaml_path = str(Path(f"{folder_path}") / f"*{dataset}{universe}.yaml") req_path = str(Path(f"{folder_path}") / f"*.txt") yaml_file = glob.glob(yaml_path) req_file = glob.glob(req_path) @@ -224,6 +226,7 @@ def run( times=1, models=None, dataset="Alpha360", + universe="", exclude=False, qlib_uri: str = "git+https://github.com/microsoft/qlib#egg=pyqlib", exp_folder_name: str = "run_all_model_records", @@ -245,6 +248,9 @@ def run( determines whether the 
model being used is excluded or included. dataset : str determines the dataset to be used for each model. + universe : str + the stock universe of the dataset. + default "" indicates that the default universe of the dataset is used (i.e. the config file without a universe suffix). qlib_uri : str the uri to install qlib with pip it could be url on the we or local path (NOTE: the local path must be a absolute path) @@ -259,6 +265,15 @@ def run( ------- Here are some use cases of the function in the bash: + The run_all_models will decide which config to run based on `models` `dataset` `universe` + Example 1): + + models="lightgbm", dataset="Alpha158", universe="" will result in running the following config + examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml + + models="lightgbm", dataset="Alpha158", universe="csi500" will result in running the following config + examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml + .. code-block:: bash # Case 1 - run all models multiple times @@ -290,7 +305,7 @@ def run( for fn in folders: # get all files sys.stderr.write("Retrieving files...\n") - yaml_path, req_path = get_all_files(folders[fn], dataset) + yaml_path, req_path = get_all_files(folders[fn], dataset, universe=universe) if yaml_path is None: sys.stderr.write(f"There is no {dataset}.yaml file in {folders[fn]}") continue diff --git a/qlib/model/ens/ensemble.py b/qlib/model/ens/ensemble.py index 863282416e3..ede1f8e3ad1 100644 --- a/qlib/model/ens/ensemble.py +++ b/qlib/model/ens/ensemble.py @@ -8,6 +8,7 @@ from typing import Union import pandas as pd from qlib.utils import FLATTEN_TUPLE, flatten_dict +from qlib.log import get_module_logger class Ensemble: @@ -79,6 +80,7 @@ class RollingEnsemble(Ensemble): """ def __call__(self, ensemble_dict: dict) -> pd.DataFrame: + get_module_logger("RollingEnsemble").info(f"keys in group: {list(ensemble_dict.keys())}") artifact_list = list(ensemble_dict.values()) artifact_list.sort(key=lambda x: x.index.get_level_values("datetime").min()) artifact = pd.concat(artifact_list) @@ -121,6 
+123,7 @@ def __call__(self, ensemble_dict: dict) -> pd.DataFrame: """ # need to flatten the nested dict ensemble_dict = flatten_dict(ensemble_dict, sep=FLATTEN_TUPLE) + get_module_logger("AverageEnsemble").info(f"keys in group: {list(ensemble_dict.keys())}") values = list(ensemble_dict.values()) # NOTE: this may change the style underlying data!!!! # from pd.DataFrame to pd.Series diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 2a7281203f9..c9ff8ff3842 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -16,7 +16,8 @@ class GetData: DATASET_VERSION = "v2" - REMOTE_URL = "http://fintech.msra.cn/stock_data/downloads" + # REMOTE_URL = "http://fintech.msra.cn/stock_data/downloads" + REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip" def __init__(self, delete_zip_file=False): From 15528e0de1495952463641e950207234b8240bb0 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 14 Jun 2022 21:33:14 +0800 Subject: [PATCH 2/2] Update results and scripts --- ...kflow_config_lightgbm_Alpha158_csi500.yaml | 72 +++++++++++++++++ ...w_config_lightgbm_Alpha158_multi_freq.yaml | 4 +- ...kflow_config_lightgbm_Alpha360_csi500.yaml | 80 +++++++++++++++++++ examples/benchmarks/README.md | 38 ++++++++- examples/run_all_model.py | 3 + 5 files changed, 193 insertions(+), 4 deletions(-) create mode 100644 examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml create mode 100644 examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml new file mode 100644 index 00000000000..df0f7c79477 --- /dev/null +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml @@ -0,0 +1,72 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: 
&market csi500 +benchmark: &benchmark SH000905 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + model: + dataset: + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LGBModel + module_path: qlib.contrib.model.gbdt + kwargs: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index a9f219155f3..3d0a7859c77 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -5,8 +5,8 @@ qlib_init: region: cn dataset_cache: null maxtasksperchild: 1 -market: &market csi500 -benchmark: &benchmark 
SH000500 +market: &market csi300 +benchmark: &benchmark SH000300 data_handler_config: &data_handler_config start_time: 2008-01-01 # 1min closing time is 15:00:00 diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml new file mode 100644 index 00000000000..767050919fa --- /dev/null +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml @@ -0,0 +1,80 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi500 +benchmark: &benchmark SH000905 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: [] + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + - + - + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LGBModel + module_path: qlib.contrib.model.gbdt + kwargs: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.0421 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + 
dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index 20b84ecde7f..10ae91f3544 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -20,7 +20,9 @@ The numbers shown below demonstrate the performance of the entire `workflow` of > NOTE: > We have very limited resources to implement and finetune the models. We tried our best effort to fairly compare these models. But some models may have greater potential than what it looks like in the table below. Your contribution is highly welcomed to explore their potential. -## Alpha158 dataset (csi300) +## Results on CSI300 + +### Alpha158 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| @@ -44,7 +46,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | DoubleEnsemble(Chuheng Zhang, et al.) 
| Alpha158 | 0.0544±0.00 | 0.4340±0.00 | 0.0523±0.00 | 0.4284±0.01 | 0.1168±0.01 | 1.3384±0.12 | -0.1036±0.01 | -## Alpha360 dataset (csi500) +### Alpha360 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |-------------------------------------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| @@ -79,6 +81,38 @@ The numbers shown below demonstrate the performance of the entire `workflow` of - Signal-based evaluation: IC, ICIR, Rank IC, Rank ICIR - Portfolio-based metrics: Annualized Return, Information Ratio, Max Drawdown +## Results on CSI500 +The results on CSI500 are not complete. PRs for models on csi500 are welcome! + +Transferring previous models from CSI300 to CSI500 is quite easy. You can try models with just a few commands below. +``` +cd examples/benchmarks/LightGBM +pip install -r requirements.txt + +# create new config and set the benchmark to csi500 +cp workflow_config_lightgbm_Alpha158.yaml workflow_config_lightgbm_Alpha158_csi500.yaml +sed -i "s/csi300/csi500/g" workflow_config_lightgbm_Alpha158_csi500.yaml +sed -i "s/SH000300/SH000905/g" workflow_config_lightgbm_Alpha158_csi500.yaml + +# you can either run the model once +qrun workflow_config_lightgbm_Alpha158_csi500.yaml + +# or run it multiple times automatically and get the summarized results. +cd ../../ +python run_all_model.py run 3 lightgbm Alpha158 csi500 # for models with randomness, please run it 20 times. 
+``` + +### Alpha158 dataset + +| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | +|------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| +| LightGBM | Alpha158 | 0.0377±0.00 | 0.3860±0.00 | 0.0448±0.00 | 0.4675±0.00 | 0.1151±0.00 | 1.3884±0.00 | -0.0898±0.00 | + +### Alpha360 dataset +| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | +|------------|----------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| +| LightGBM | Alpha360 | 0.0400±0.00 | 0.3605±0.00 | 0.0536±0.00 | 0.5431±0.00 | 0.0505±0.00 | 0.7658±0.02 | -0.1880±0.00 | + # Contributing diff --git a/examples/run_all_model.py b/examples/run_all_model.py index 46f9c6a5d76..71589049a2a 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -294,6 +294,9 @@ def run( # Case 6 - run other models except those are given as arguments for one time python run_all_model.py run --models=[mlp,tft,sfm] --exclude=True + # Case 7 - run lightgbm model on csi500. + python run_all_model.py run 3 lightgbm Alpha158 csi500 + """ self._init_qlib(exp_folder_name)