diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 68dfe5b3fdb..0bd3517d55d 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source - uses: actions/checkout@v2 + uses: actions/checkout@v3 + # On macOS runners, CI resolves Python 3.7 to version 3.7.17, which causes the "_bz not found error". + # So we pin Python 3.7 on macOS to a specific patch version. + # refs: https://github.com/actions/setup-python/issues/682 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" + + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Update pip to the latest version # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. 
- # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 - name: Installing pytorch for macos if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} @@ -129,8 +139,7 @@ jobs: - name: Test data downloads run: | python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive - mv /tmp/qlibpublic/data tests/.data + python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl - name: Install Lightgbm for MacOS if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index f8e43fa179b..1dfcc0179c2 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source slow - uses: actions/checkout@v2 + uses: actions/checkout@v3 + # On macOS runners, CI resolves Python 3.7 to version 3.7.17, which causes the "_bz not found error". + # So we pin Python 3.7 on macOS to a specific patch version. 
+ # refs: https://github.com/actions/setup-python/issues/682 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" + + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Set up Python tools # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 pip install --upgrade cython numpy pip install -e .[dev] diff --git a/docs/component/data.rst b/docs/component/data.rst index 60e8d4fa1bd..5a2d458f688 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -119,7 +119,7 @@ Here are some example: for daily data: .. code-block:: bash - python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data + python scripts/get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data for 1min data: .. code-block:: bash diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 2163b4bf7e5..8de32f3f6c0 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
+import os import re import sys import qlib @@ -11,13 +12,15 @@ from tqdm import tqdm from pathlib import Path from loguru import logger +from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - DATASET_VERSION = "v2" REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip" + # "?" is not included in the token. + TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" + KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" def __init__(self, delete_zip_file=False): """ @@ -29,24 +32,44 @@ def __init__(self, delete_zip_file=False): """ self.delete_zip_file = delete_zip_file - def normalize_dataset_version(self, dataset_version: str = None): - if dataset_version is None: - dataset_version = self.DATASET_VERSION - return dataset_version + def merge_remote_url(self, file_name: str): + fernet = Fernet(self.KEY) + token = fernet.decrypt(self.TOKEN).decode() + return f"{self.REMOTE_URL}/{file_name}?{token}" - def merge_remote_url(self, file_name: str, dataset_version: str = None): - return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}" + def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): + """ + Download the specified file to the target folder. - def _download_data( - self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None - ): + Parameters + ---------- + target_dir: str + data save directory + file_name: str + dataset name, needs to end with .zip, value from [rl_data.zip, csv_data_cn.zip, ...] 
+ may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip + delete_old: bool + delete an existing directory, by default True + + Examples + --------- + # get rl data + python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token} + + # get cn csv data + python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token} + ------- + + """ target_dir = Path(target_dir).expanduser() target_dir.mkdir(exist_ok=True, parents=True) # saved file name - _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name + _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name) target_path = target_dir.joinpath(_target_file_name) - url = self.merge_remote_url(file_name, dataset_version) + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) resp.raise_for_status() if resp.status_code != 200: @@ -56,7 +79,7 @@ def _download_data( logger.warning( f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. 
(You can refer to the original data source: https://finance.yahoo.com/lookup.)" ) - logger.info(f"{file_name} downloading......") + logger.info(f"{os.path.basename(file_name)} downloading......") with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: with target_path.open("wb") as fp: for chunk in resp.iter_content(chunk_size=chunk_size): @@ -67,8 +90,8 @@ def _download_data( if self.delete_zip_file: target_path.unlink() - def check_dataset(self, file_name: str, dataset_version: str = None): - url = self.merge_remote_url(file_name, dataset_version) + def check_dataset(self, file_name: str): + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) status = True if resp.status_code == 404: @@ -140,9 +163,11 @@ def qlib_data( --------- # get 1d data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token} # get 1min data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token} ------- """ @@ -155,29 +180,12 @@ def qlib_data( qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__)) - def _get_file_name(v): - return self.QLIB_DATA_NAME.format( - dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v - ) - - file_name = _get_file_name(qlib_version) - if not self.check_dataset(file_name, version): - file_name = _get_file_name("latest") - self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version) - - def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"): - """download cn csv data from remote - 
- Parameters - ---------- - target_dir: str - data save directory - - Examples - --------- - python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data - ------- + def _get_file_name_with_version(qlib_version, dataset_version): + dataset_version = "v2" if dataset_version is None else dataset_version + file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip" + return file_name_with_version - """ - file_name = "csv_data_cn.zip" - self._download_data(file_name, target_dir) + file_name = _get_file_name_with_version(qlib_version, dataset_version=version) + if not self.check_dataset(file_name): + file_name = _get_file_name_with_version("latest", dataset_version=version) + self.download_data(file_name.lower(), target_dir, delete_old) diff --git a/setup.py b/setup.py index 109fed21355..9d7c185ab91 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ def get_version(rel_path: str) -> str: "gym", # Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail. 
"protobuf<=3.20.1;python_version<='3.8'", + "cryptography", ] # Numpy include diff --git a/tests/test_dump_data.py b/tests/test_dump_data.py index dfa7f8556dd..33cae4e8084 100644 --- a/tests/test_dump_data.py +++ b/tests/test_dump_data.py @@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS) TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) provider_uri = str(QLIB_DIR.resolve()) diff --git a/tests/test_get_data.py b/tests/test_get_data.py index 93a852f554e..94e685e1fbd 100644 --- a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -42,7 +42,7 @@ def test_0_qlib_data(self): self.assertFalse(df.dropna().empty, "get qlib data failed") def test_1_csv_data(self): - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) self.assertEqual(len(stock_name), 85, "get csv data failed")