From 3419ec94041561abd89b4e9e39c8e90da6a4e196 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 17 Mar 2022 17:04:07 -0300 Subject: [PATCH 01/20] feat: download ibovespa index historic composition ibovespa(ibov) is the largest index in Brazil's stocks exchange. The br_index folder has support for downloading new companies for the current index composition. And has support, as well, for downloading companies from historic composition of ibov index. Partially resolves issue #956 --- scripts/data_collector/br_index/README.md | 62 ++++ scripts/data_collector/br_index/collector.py | 317 ++++++++++++++++++ .../data_collector/br_index/requirements.txt | 34 ++ 3 files changed, 413 insertions(+) create mode 100644 scripts/data_collector/br_index/README.md create mode 100644 scripts/data_collector/br_index/collector.py create mode 100644 scripts/data_collector/br_index/requirements.txt diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md new file mode 100644 index 00000000000..848600666cf --- /dev/null +++ b/scripts/data_collector/br_index/README.md @@ -0,0 +1,62 @@ +# iBOVESPA History Companies Collection + +## Requirements + +- Install the libs from the file `requirements.txt` + + ```bash + pip install -r requirements.txt + ``` + +## For the ibovespa (IBOV) index, we have: + +
+ +### Method `get_new_companies` + +#### Index start date + +- The ibovespa index started on 2 January 1968 ([wiki](https://en.wikipedia.org/wiki/%C3%8Dndice_Bovespa)). In order to use this start date in our `bench_start_date(self)` method, two conditions must be satisfied: + 1) APIs used to download Brazilian stocks (B3) historical prices must keep track of such historic data since 2 January 1968 + + 2) Some website or API must provide, from that date, the historic index composition. In other words, the companies used to build the index. + + As a consequence, the method `bench_start_date(self)` inside `collector.py` was implemented using `pd.Timestamp("2003-01-03")` for two reasons: + + 1) The earliest ibov composition that has been found is from the first quarter of 2003. More information about this composition can be found in the sections below. + + 2) Yahoo finance, one of the libraries used to download symbols' historic prices, keeps track from this date forward. + +- Within the `get_new_companies` method, logic was implemented to get, for each ibovespa component stock, the start date that yahoo finance keeps track of. + +#### Code Logic + +The code performs web scraping of B3's [website](https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br), which keeps track of the ibovespa stocks composition on the current day. + +Other approaches, such as `requests` and `Beautiful Soup`, could have been used. However, the website shows the table with the stocks with some delay, since it uses a script inside of it to obtain such compositions. +Alternatively, `selenium` was used to download this stock composition in order to overcome this problem. + +Furthermore, the data downloaded from the selenium script was preprocessed so it could be saved into the `csv` format established by `scripts/data_collector/index.py`. + +
+ +### Method `get_changes` + +No suitable data source that keeps track of ibovespa's historical stock composition has been found, except for this [repository](https://github.com/igor17400/IBOV-HCI), which provides such information and is the one used here; however, it only provides data from the 1st quarter of 2003 to the 3rd quarter of 2021. + +With that reference, the index's composition can be compared quarter by quarter and year by year in order to generate a file that keeps track of which stocks have been removed and which have been added each quarter and year. + +
+ +### Collector Data + +```bash +# parse instruments, using in qlib/instruments. +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments + +# parse new companies +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies + +# index_name support: IBOV +``` + diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py new file mode 100644 index 00000000000..fe4d3e4a31d --- /dev/null +++ b/scripts/data_collector/br_index/collector.py @@ -0,0 +1,317 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import sys +from pathlib import Path +import importlib +import datetime + +import fire +import pandas as pd +from tqdm import tqdm +from loguru import logger + +CUR_DIR = Path(__file__).resolve().parent +sys.path.append(str(CUR_DIR.parent.parent)) + +from data_collector.index import IndexBase + +IBOV_INDEX_COMPOSITION = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" + +YEAR_QUARTER = [ + "2003_1Q", + "2003_2Q", + "2003_3Q", + "2004_1Q", + "2004_2Q", + "2004_3Q", + "2005_1Q", + "2005_2Q", + "2005_3Q", + "2006_1Q", + "2006_2Q", + "2006_3Q", + "2007_1Q", + "2007_2Q", + "2007_3Q", + "2008_1Q", + "2008_2Q", + "2008_3Q", + "2009_1Q", + "2009_2Q", + "2009_3Q", + "2010_1Q", + "2010_2Q", + "2010_3Q", + "2011_1Q", + "2011_2Q", + "2011_3Q", + "2012_1Q", + "2012_2Q", + "2012_3Q", + "2013_1Q", + "2013_2Q", + "2013_3Q", + "2014_1Q", + "2014_2Q", + "2014_3Q", + "2015_1Q", + "2015_2Q", + "2015_3Q", + "2016_1Q", + "2016_2Q", + "2016_3Q", + "2017_1Q", + "2017_2Q", + "2017_3Q", + "2018_1Q", + "2018_2Q", + "2018_3Q", + "2019_1Q", + "2019_2Q", + "2019_3Q", + "2020_1Q", + "2020_2Q", + "2020_3Q", + "2021_1Q", + "2021_2Q", + "2021_3Q", + "2022_1Q", +] + +quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"} + + +class IBOVIndex(IndexBase): + def __init__( + self, + index_name: str, + 
qlib_dir: [str, Path] = None, + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + ): + super(IBOVIndex, self).__init__( + index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep + ) + + self.today = datetime.date.today() + self.quarter = str(pd.Timestamp(self.today).quarter) + self.year = str(self.today.year) + + @property + def bench_start_date(self) -> pd.Timestamp: + """ + The ibovespa index started on 2 January 1968 (wiki), however, + no suitable data source that keeps track of ibovespa's history + stocks composition has been found. Except from the repo indicated + in README. Which keeps track of such information starting from + the first quarter of 2003 + """ + return pd.Timestamp("2003-01-03") + + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: + """formatting the datetime in an instrument + + Parameters + ---------- + inst_df: pd.DataFrame + inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD] + + Returns + ------- + inst_df: pd.DataFrame + + """ + logger.info("Formatting Datetime") + if self.freq != "day": + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S") + ) + else: + inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d") + ) + + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d") + ) + return inst_df + + def format_quarter(self, cell: str): + """ + Parameters + ---------- + cell: str + It must be on the format 2003_1Q --> year_quarter + + Returns + ---------- + date: str + Returns date in format 2003-03-01 + """ + cell_split = cell.split("_") + return cell_split[0] + "-" + quarter_dict[cell_split[1]] + + def get_changes(self): + """ + Access the index historic composition and compare it 
quarter + by quarter and year by year in order to generate a file that + keeps track of which stocks have been removed and which have + been added. + + The Dataframe used as reference will provided the index + composition for each year an quarter: + pd.DataFrame: + symbol + SH600000 + SH600001 + . + . + . + + Parameters + ---------- + self: is used to represent the instance of the class. + + Returns + ---------- + pd.DataFrame: + symbol date type + SH600000 2019-11-11 add + SH600001 2020-11-10 remove + dtypes: + symbol: str + date: pd.Timestamp + type: str, value from ["add", "remove"] + """ + logger.info("Getting companies changes in {} index ...".format(self.index_name)) + + try: + df_changes_list = [] + for i in tqdm(range(len(YEAR_QUARTER) - 1)): + df = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i]), on_bad_lines="skip")["symbol"] + df_ = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i + 1]), on_bad_lines="skip")["symbol"] + + ## Remove Dataframe + remove_date = YEAR_QUARTER[i].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i].split("_")[1]] + list_remove = list(df[~df.isin(df_)]) + df_removed = pd.DataFrame( + { + "date": len(list_remove) * [remove_date], + "type": len(list_remove) * ["remove"], + "symbol": list_remove, + } + ) + + ## Add Dataframe + add_date = YEAR_QUARTER[i + 1].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i + 1].split("_")[1]] + list_add = list(df_[~df_.isin(df)]) + df_added = pd.DataFrame( + {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add} + ) + + df_changes_list.append(pd.concat([df_added, df_removed], sort=False)) + df = pd.concat(df_changes_list).reset_index(drop=True) + df['symbol'] = df['symbol'].astype(str) + '.SA' + + return df + + except Exception as E: + logger.error("An error occured while downloading 2008 index composition - {}".format(E)) + + def get_new_companies(self): + """ + Get latest index composition. 
+ The repo indicated on README has implemented a script + to get the latest index composition from B3 website using + selenium. Therefore, this method will download the file + containing such composition + + Parameters + ---------- + self: is used to represent the instance of the class. + + Returns + ---------- + pd.DataFrame: + symbol start_date end_date + RRRP3 2020-11-13 2022-03-02 + ALPA4 2008-01-02 2022-03-02 + dtypes: + symbol: str + start_date: pd.Timestamp + end_date: pd.Timestamp + """ + logger.info("Getting new companies in {} index ...".format(self.index_name)) + + try: + ## Get index composition + + df_index = pd.read_csv( + IBOV_INDEX_COMPOSITION.format(self.year + "_" + self.quarter + "Q"), on_bad_lines="skip" + ) + df_date_first_added = pd.read_csv( + IBOV_INDEX_COMPOSITION.format("date_first_added_" + self.year + "_" + self.quarter + "Q"), + on_bad_lines="skip", + ) + df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]] + df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter) + + # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter + df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[str(int(self.quarter) + 1) + "Q"] + df = df[['symbol', self.START_DATE_FIELD, self.END_DATE_FIELD]] + df['symbol'] = df['symbol'].astype(str) + '.SA' + + return df + + except Exception as E: + logger.error("An error occured while getting new companies - {}".format(E)) + + def filter_df(self, df: pd.DataFrame) -> pd.DataFrame: + if "Código" in df.columns: + return df.loc[:, ["Código"]].copy() + + +def get_instruments( + qlib_dir: str, + index_name: str, + method: str = "parse_instruments", + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, +): + """ + + Parameters + ---------- + qlib_dir: str + qlib data dir, default "Path(__file__).parent/qlib_data" + index_name: str + index name, value from ["IBOV"] + method: str + method, value from 
["parse_instruments", "save_new_companies"] + freq: str + freq, value from ["day", "1min"] + request_retry: int + request retry, by default 5 + retry_sleep: int + request sleep, by default 3 + + Examples + ------- + # parse instruments + $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments + + # parse new companies + $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies + + """ + _cur_module = importlib.import_module("data_collector.br_index.collector") + obj = getattr(_cur_module, f"{index_name.upper()}Index")( + qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep + ) + getattr(obj, method)() + + +if __name__ == "__main__": + fire.Fire(get_instruments) diff --git a/scripts/data_collector/br_index/requirements.txt b/scripts/data_collector/br_index/requirements.txt new file mode 100644 index 00000000000..c77e932879d --- /dev/null +++ b/scripts/data_collector/br_index/requirements.txt @@ -0,0 +1,34 @@ +async-generator==1.10 +attrs==21.4.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +cryptography==36.0.1 +fire==0.4.0 +h11==0.13.0 +idna==3.3 +loguru==0.6.0 +lxml==4.8.0 +multitasking==0.0.10 +numpy==1.22.2 +outcome==1.1.0 +pandas==1.4.1 +pycoingecko==2.2.0 +pycparser==2.21 +pyOpenSSL==22.0.0 +PySocks==1.7.1 +python-dateutil==2.8.2 +pytz==2021.3 +requests==2.27.1 +requests-futures==1.0.0 +six==1.16.0 +sniffio==1.2.0 +sortedcontainers==2.4.0 +termcolor==1.1.0 +tqdm==4.63.0 +trio==0.20.0 +trio-websocket==0.9.2 +urllib3==1.26.8 +wget==3.2 +wsproto==1.1.0 +yahooquery==2.2.15 From c2f933b8303d04324f29d846a49cc46eb5d8c267 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 17 Mar 2022 17:10:35 -0300 Subject: [PATCH 02/20] fix: typo error instead of end_date, it was written end_ate --- scripts/data_collector/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/scripts/data_collector/index.py b/scripts/data_collector/index.py index 497c1994824..a23614b4134 100644 --- a/scripts/data_collector/index.py +++ b/scripts/data_collector/index.py @@ -19,7 +19,7 @@ class IndexBase: SYMBOL_FIELD_NAME = "symbol" DATE_FIELD_NAME = "date" START_DATE_FIELD = "start_date" - END_DATE_FIELD = "end_ate" + END_DATE_FIELD = "end_date" CHANGE_TYPE_FIELD = "type" INSTRUMENTS_COLUMNS = [SYMBOL_FIELD_NAME, START_DATE_FIELD, END_DATE_FIELD] REMOVE = "remove" From 09b8ad97161bfef9e340a895b75773b29b0755ce Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 17 Mar 2022 17:11:44 -0300 Subject: [PATCH 03/20] feat: adds support for downloading stocks historic prices from Brazil's stocks exchange (B3) Together with commit c2f933 it resolves issue #956 --- scripts/data_collector/utils.py | 52 +++++++++++++++++++++++ scripts/data_collector/yahoo/README.md | 7 +++ scripts/data_collector/yahoo/collector.py | 25 ++++++++++- 3 files changed, 83 insertions(+), 1 deletion(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 1814b75eae9..c644525cd37 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -19,6 +19,7 @@ from tqdm import tqdm from functools import partial from concurrent.futures import ProcessPoolExecutor +from bs4 import BeautifulSoup HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" @@ -34,6 +35,7 @@ # NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks "US_ALL": "^GSPC", "IN_ALL": "^NSEI", + "BR_ALL": "^BVSP", } _BENCH_CALENDAR_LIST = None @@ -41,6 +43,7 @@ _HS_SYMBOLS = None _US_SYMBOLS = None _IN_SYMBOLS = None +_BR_SYMBOLS = None _EN_FUND_SYMBOLS = None _CALENDAR_MAP = {} @@ -344,6 +347,55 @@ def _format(s_): return _IN_SYMBOLS +def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list: + """get Brazil(B3) stock symbols + + Returns + ------- + B3 stock symbols + """ + global _BR_SYMBOLS + + @deco_retry + def 
_get_ibovespa(): + _symbols = [] + url = 'https://www.fundamentus.com.br/detalhes.php?papel=' + + # Request + agent = {"User-Agent":"Mozilla/5.0"} + page = requests.get(url, headers=agent) + + # BeautifulSoup + soup = BeautifulSoup(page.content, 'html.parser') + tbody = soup.find('tbody') + + children = tbody.findChildren("a" , recursive=True) + for child in children: + _symbols.append(str(child).split("\"")[-1].split(">")[1].split("<")[0]) + + return _symbols + + if _BR_SYMBOLS is None: + _all_symbols = _get_ibovespa() + if qlib_data_path is not None: + for _index in ["ibov"]: + ins_df = pd.read_csv( + Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"), + sep="\t", + names=["symbol", "start_date", "end_date"], + ) + _all_symbols += ins_df["symbol"].unique().tolist() + + def _format(s_): + s_ = s_.strip() + s_ = s_.strip("$") + s_ = s_.strip("*") + s_ = s_ + ".SA" + return s_ + + _BR_SYMBOLS = sorted(set(map(_format, _all_symbols))) + + return _BR_SYMBOLS def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list: """get en fund symbols diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index 71f2b75f8e5..0f3656b7c5a 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -80,14 +80,21 @@ pip install -r requirements.txt python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region CN # cn 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --delay 1 --interval 1min --region CN + # us 1d data python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US # us 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data_1min --delay 1 --interval 1min --region US + # in 1d data python collector.py download_data 
--source_dir ~/.qlib/stock_data/source/in_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region IN # in 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data_1min --delay 1 --interval 1min --region IN + + # br 1d data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR + # br 1min data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR ``` 2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data` diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index e99a30d2a6b..29575b06884 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -35,6 +35,7 @@ get_hs_stock_symbols, get_us_stock_symbols, get_in_stock_symbols, + get_br_stock_symbols, generate_minutes_calendar_from_daily, ) @@ -310,7 +311,29 @@ class YahooCollectorIN1d(YahooCollectorIN): class YahooCollectorIN1min(YahooCollectorIN): pass +class YahooCollectorBR(YahooCollector, ABC): + def get_instrument_list(self): + logger.info("get BR stock symbols......") + symbols = get_br_stock_symbols() + [ + "^BVSP", + ] + logger.info(f"get {len(symbols)} symbols.") + return symbols + def download_index_data(self): + pass + + def normalize_symbol(self, symbol): + return code_to_fname(symbol).upper() + + @property + def _timezone(self): + return "Brazil/East" + +class YahooCollectorBR1d(YahooCollectorBR): + pass +class YahooCollectorBR1min(YahooCollectorBR): + pass class YahooNormalize(BaseNormalize): COLUMNS = ["open", "close", "high", "low", "volume"] DAILY_FORMAT = "%Y-%m-%d" @@ -848,7 +871,7 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval= interval: str freq, value from [1min, 1d], default 1d region: str - region, value from ["CN", 
"US"], default "CN" + region, value from ["CN", "US", "BR"], default "CN" """ super().__init__(source_dir, normalize_dir, max_workers, interval) self.region = region From 77107f3f616b3bcc3ff836332842255560757fcd Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sat, 19 Mar 2022 11:03:22 -0300 Subject: [PATCH 04/20] fix: code formatted with black. --- scripts/data_collector/br_index/collector.py | 6 +++--- scripts/data_collector/utils.py | 16 +++++++++------- scripts/data_collector/yahoo/collector.py | 6 ++++++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index fe4d3e4a31d..07d015f954c 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -212,7 +212,7 @@ def get_changes(self): df_changes_list.append(pd.concat([df_added, df_removed], sort=False)) df = pd.concat(df_changes_list).reset_index(drop=True) - df['symbol'] = df['symbol'].astype(str) + '.SA' + df["symbol"] = df["symbol"].astype(str) + ".SA" return df @@ -259,8 +259,8 @@ def get_new_companies(self): # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[str(int(self.quarter) + 1) + "Q"] - df = df[['symbol', self.START_DATE_FIELD, self.END_DATE_FIELD]] - df['symbol'] = df['symbol'].astype(str) + '.SA' + df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]] + df["symbol"] = df["symbol"].astype(str) + ".SA" return df diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index c644525cd37..c7f76467b93 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -347,6 +347,7 @@ def _format(s_): return _IN_SYMBOLS + def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list: """get Brazil(B3) stock symbols @@ -359,19 +360,19 @@ def get_br_stock_symbols(qlib_data_path: [str, Path] = None) 
-> list: @deco_retry def _get_ibovespa(): _symbols = [] - url = 'https://www.fundamentus.com.br/detalhes.php?papel=' + url = "https://www.fundamentus.com.br/detalhes.php?papel=" # Request - agent = {"User-Agent":"Mozilla/5.0"} + agent = {"User-Agent": "Mozilla/5.0"} page = requests.get(url, headers=agent) # BeautifulSoup - soup = BeautifulSoup(page.content, 'html.parser') - tbody = soup.find('tbody') + soup = BeautifulSoup(page.content, "html.parser") + tbody = soup.find("tbody") - children = tbody.findChildren("a" , recursive=True) + children = tbody.findChildren("a", recursive=True) for child in children: - _symbols.append(str(child).split("\"")[-1].split(">")[1].split("<")[0]) + _symbols.append(str(child).split('"')[-1].split(">")[1].split("<")[0]) return _symbols @@ -385,7 +386,7 @@ def _get_ibovespa(): names=["symbol", "start_date", "end_date"], ) _all_symbols += ins_df["symbol"].unique().tolist() - + def _format(s_): s_ = s_.strip() s_ = s_.strip("$") @@ -397,6 +398,7 @@ def _format(s_): return _BR_SYMBOLS + def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list: """get en fund symbols diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 29575b06884..bfbdc7d001c 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -311,6 +311,7 @@ class YahooCollectorIN1d(YahooCollectorIN): class YahooCollectorIN1min(YahooCollectorIN): pass + class YahooCollectorBR(YahooCollector, ABC): def get_instrument_list(self): logger.info("get BR stock symbols......") @@ -330,10 +331,15 @@ def normalize_symbol(self, symbol): def _timezone(self): return "Brazil/East" + class YahooCollectorBR1d(YahooCollectorBR): pass + + class YahooCollectorBR1min(YahooCollectorBR): pass + + class YahooNormalize(BaseNormalize): COLUMNS = ["open", "close", "high", "low", "volume"] DAILY_FORMAT = "%Y-%m-%d" From 3aaf1df5f6cb2942e40355e3560111472d24c254 Mon Sep 17 00:00:00 2001 From: Igor 
Lima Date: Sun, 20 Mar 2022 20:06:32 -0300 Subject: [PATCH 05/20] wip: Creating code logic for brazils stock market data normalization --- scripts/data_collector/utils.py | 7 ++++++- scripts/data_collector/yahoo/collector.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index c7f76467b93..a94a03c6857 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -67,12 +67,17 @@ def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]: logger.info(f"get calendar list: {bench_code}......") def _get_calendar(url): + print("--------") + print(url) + print("--------") _value_list = requests.get(url).json()["data"]["klines"] return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list)) calendar = _CALENDAR_MAP.get(bench_code, None) if calendar is None: - if bench_code.startswith("US_") or bench_code.startswith("IN_"): + if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"): + print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code])) + print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")) df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max") calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist() else: diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index bfbdc7d001c..c3f06e272a6 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -861,6 +861,27 @@ def symbol_to_yahoo(self, symbol): def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]: return get_calendar_list("ALL") +class YahooNormalizeBR: + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + return get_calendar_list("BR_ALL") + + +class YahooNormalizeBR1d(YahooNormalizeBR, YahooNormalize1d): + pass + + +class 
YahooNormalizeBR1min(YahooNormalizeBR, YahooNormalize1minOffline): + CALC_PAUSED_NUM = False + + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + # TODO: support 1min + raise ValueError("Does not support 1min") + + def _get_1d_calendar_list(self): + return get_calendar_list("BR_ALL") + + def symbol_to_yahoo(self, symbol): + return fname_to_code(symbol) class Run(BaseRun): def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN): From 9ceb592c02aac4b6741e067d51dfcdd6933e64cd Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Wed, 23 Mar 2022 15:07:56 -0300 Subject: [PATCH 06/20] docs: brazils stock market data normalization code documentation --- scripts/data_collector/yahoo/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index 0f3656b7c5a..28acfbd2c9d 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -123,8 +123,15 @@ pip install -r requirements.txt ```bash # normalize 1d cn python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_data --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d + # normalize 1min cn python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min + + # normalize 1d br + python scripts/data_collector/yahoo/collector.py normalize_data --source_dir ~/.qlib/stock_data/source/br_data --normalize_dir ~/.qlib/stock_data/source/br_1d_nor --region BR --interval 1d + + # normalize 1min br + python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min ``` 3. 
dump data: `python scripts/dump_bin.py dump_all` From d1b73b385f72343b8be951d557809a106794b327 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 24 Mar 2022 00:14:40 -0300 Subject: [PATCH 07/20] fix: code formatted the with black --- scripts/data_collector/yahoo/collector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index c3f06e272a6..3b1cd79fe67 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -861,6 +861,7 @@ def symbol_to_yahoo(self, symbol): def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]: return get_calendar_list("ALL") + class YahooNormalizeBR: def _get_calendar_list(self) -> Iterable[pd.Timestamp]: return get_calendar_list("BR_ALL") @@ -883,6 +884,7 @@ def _get_1d_calendar_list(self): def symbol_to_yahoo(self, symbol): return fname_to_code(symbol) + class Run(BaseRun): def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN): """ From cc0e1265bce0894828bb72ca2d0e0a9587ffb696 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Tue, 29 Mar 2022 20:22:40 -0300 Subject: [PATCH 08/20] docs: fixed typo --- scripts/data_collector/br_index/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md index 848600666cf..c3143baa5c6 100644 --- a/scripts/data_collector/br_index/README.md +++ b/scripts/data_collector/br_index/README.md @@ -56,7 +56,5 @@ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --met # parse new companies python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies - -# index_name support: IBOV ``` From 95938ea6455faf43c5b5db5c8c280009ea087e73 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Tue, 29 Mar 2022 23:52:43 -0300 Subject: [PATCH 09/20] docs: more info about python version used to 
generate requirements.txt file --- scripts/data_collector/br_index/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md index c3143baa5c6..ca31e3f7a54 100644 --- a/scripts/data_collector/br_index/README.md +++ b/scripts/data_collector/br_index/README.md @@ -7,6 +7,7 @@ ```bash pip install -r requirements.txt ``` +- `requirements.txt` file was generated using python3.8 ## For the ibovespa (IBOV) index, we have: From b0aafa2d08145f2e9f26964700f09c1740ba8942 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 31 Mar 2022 21:07:43 -0300 Subject: [PATCH 10/20] docs: added BeautifulSoup requirements --- scripts/data_collector/yahoo/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/data_collector/yahoo/requirements.txt b/scripts/data_collector/yahoo/requirements.txt index 61422c7ab62..1a58eda1f67 100644 --- a/scripts/data_collector/yahoo/requirements.txt +++ b/scripts/data_collector/yahoo/requirements.txt @@ -7,3 +7,6 @@ tqdm lxml yahooquery joblib +beautifulsoup4 +bs4 +soupsieve \ No newline at end of file From 592559ac5be08cc89866f4e62ba2b98ed91664fb Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Thu, 31 Mar 2022 21:08:41 -0300 Subject: [PATCH 11/20] feat: removed debug prints --- scripts/data_collector/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index a94a03c6857..943b8e9f347 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -67,9 +67,6 @@ def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]: logger.info(f"get calendar list: {bench_code}......") def _get_calendar(url): - print("--------") - print(url) - print("--------") _value_list = requests.get(url).json()["data"]["klines"] return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list)) From 92aa00332bb94156f072b32b38ab0991c93286eb Mon Sep 17 00:00:00 2001 
From: Igor Lima Date: Thu, 31 Mar 2022 21:29:37 -0300 Subject: [PATCH 12/20] feat: added ibov_index_composition variable as a class attribute of IBOVIndex --- scripts/data_collector/br_index/collector.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index 07d015f954c..9e876ffe7fd 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -15,8 +15,6 @@ from data_collector.index import IndexBase -IBOV_INDEX_COMPOSITION = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" - YEAR_QUARTER = [ "2003_1Q", "2003_2Q", @@ -82,6 +80,9 @@ class IBOVIndex(IndexBase): + + ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" + def __init__( self, index_name: str, @@ -189,8 +190,8 @@ def get_changes(self): try: df_changes_list = [] for i in tqdm(range(len(YEAR_QUARTER) - 1)): - df = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i]), on_bad_lines="skip")["symbol"] - df_ = pd.read_csv(IBOV_INDEX_COMPOSITION.format(YEAR_QUARTER[i + 1]), on_bad_lines="skip")["symbol"] + df = pd.read_csv(self.ibov_index_composition.format(YEAR_QUARTER[i]), on_bad_lines="skip")["symbol"] + df_ = pd.read_csv(self.ibov_index_composition.format(YEAR_QUARTER[i + 1]), on_bad_lines="skip")["symbol"] ## Remove Dataframe remove_date = YEAR_QUARTER[i].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i].split("_")[1]] @@ -248,10 +249,10 @@ def get_new_companies(self): ## Get index composition df_index = pd.read_csv( - IBOV_INDEX_COMPOSITION.format(self.year + "_" + self.quarter + "Q"), on_bad_lines="skip" + self.ibov_index_composition.format(self.year + "_" + self.quarter + "Q"), on_bad_lines="skip" ) df_date_first_added = pd.read_csv( - IBOV_INDEX_COMPOSITION.format("date_first_added_" + self.year + "_" + self.quarter + "Q"), + 
self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.quarter + "Q"), on_bad_lines="skip", ) df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]] From 49038458f2f7f12b90d2c2e4b8bfd2c254803a94 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Fri, 1 Apr 2022 21:02:19 -0300 Subject: [PATCH 13/20] feat: added increment to generate the four month period used by the ibov index --- scripts/data_collector/br_index/collector.py | 141 +++++++++---------- 1 file changed, 69 insertions(+), 72 deletions(-) diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index 9e876ffe7fd..8c2cca1e245 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -15,73 +15,13 @@ from data_collector.index import IndexBase -YEAR_QUARTER = [ - "2003_1Q", - "2003_2Q", - "2003_3Q", - "2004_1Q", - "2004_2Q", - "2004_3Q", - "2005_1Q", - "2005_2Q", - "2005_3Q", - "2006_1Q", - "2006_2Q", - "2006_3Q", - "2007_1Q", - "2007_2Q", - "2007_3Q", - "2008_1Q", - "2008_2Q", - "2008_3Q", - "2009_1Q", - "2009_2Q", - "2009_3Q", - "2010_1Q", - "2010_2Q", - "2010_3Q", - "2011_1Q", - "2011_2Q", - "2011_3Q", - "2012_1Q", - "2012_2Q", - "2012_3Q", - "2013_1Q", - "2013_2Q", - "2013_3Q", - "2014_1Q", - "2014_2Q", - "2014_3Q", - "2015_1Q", - "2015_2Q", - "2015_3Q", - "2016_1Q", - "2016_2Q", - "2016_3Q", - "2017_1Q", - "2017_2Q", - "2017_3Q", - "2018_1Q", - "2018_2Q", - "2018_3Q", - "2019_1Q", - "2019_2Q", - "2019_3Q", - "2020_1Q", - "2020_2Q", - "2020_3Q", - "2021_1Q", - "2021_2Q", - "2021_3Q", - "2022_1Q", -] - quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"} class IBOVIndex(IndexBase): ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" + years_4_month_periods = [] def __init__( self, @@ -95,9 +35,10 @@ def __init__( index_name=index_name, qlib_dir=qlib_dir, freq=freq, 
request_retry=request_retry, retry_sleep=retry_sleep ) - self.today = datetime.date.today() - self.quarter = str(pd.Timestamp(self.today).quarter) + self.today: datetime = datetime.date.today() + self.current_4_month_period = self.get_current_4_month_period(self.today.month) self.year = str(self.today.year) + self.years_4_month_periods = self.get_four_month_period() @property def bench_start_date(self) -> pd.Timestamp: @@ -110,6 +51,62 @@ def bench_start_date(self) -> pd.Timestamp: """ return pd.Timestamp("2003-01-03") + def get_current_4_month_period(self, current_month: int): + """ + This function is used to calculated what is the current + four month period for the current month. For example, + If the current month is August 8, its four month period + is 2Q. + + OBS: In english Q is used to represent *quarter* + which means a three month period. However, in + portuguese we use Q to represent a four month period. + In other words, + + Jan, Feb, Mar, Apr: 1Q + May, Jun, Jul, Aug: 2Q + Sep, Oct, Nov, Dez: 3Q + + Parameters + ---------- + month : int + Current month (1 <= month <= 12) + + Returns + ------- + current_4m_period:str + Current Four Month Period (1Q or 2Q or 3Q) + """ + if current_month < 5: + return "1Q" + if current_month < 9: + return "2Q" + if current_month <= 12: + return "3Q" + else: + return -1 + + def get_four_month_period(self): + """ + The ibovespa index is updated every four months. 
+ Therefore, we will represent each time period as 2003_1Q + which means 2003 first four mount period (Jan, Feb, Mar, Apr) + """ + four_months_period = ["1Q", "2Q", "3Q"] + init_year = 2003 + now = datetime.datetime.now() + current_year = now.year + current_month = now.month + for year in [item for item in range(init_year, current_year)]: + for el in four_months_period: + self.years_4_month_periods.append(str(year)+"_"+el) + # For current year the logic must be a little different + current_4_month_period = self.get_current_4_month_period(current_month) + for i in range(int(current_4_month_period[0])): + self.years_4_month_periods.append(str(current_year) + "_" + str(i+1) + "Q") + return self.years_4_month_periods + + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: """formatting the datetime in an instrument @@ -143,7 +140,7 @@ def format_quarter(self, cell: str): Parameters ---------- cell: str - It must be on the format 2003_1Q --> year_quarter + It must be on the format 2003_1Q --> years_4_month_periods Returns ---------- @@ -189,12 +186,12 @@ def get_changes(self): try: df_changes_list = [] - for i in tqdm(range(len(YEAR_QUARTER) - 1)): - df = pd.read_csv(self.ibov_index_composition.format(YEAR_QUARTER[i]), on_bad_lines="skip")["symbol"] - df_ = pd.read_csv(self.ibov_index_composition.format(YEAR_QUARTER[i + 1]), on_bad_lines="skip")["symbol"] + for i in tqdm(range(len(self.years_4_month_periods) - 1)): + df = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i]), on_bad_lines="skip")["symbol"] + df_ = pd.read_csv(self.ibov_index_composition.format(self.years_4_month_periods[i + 1]), on_bad_lines="skip")["symbol"] ## Remove Dataframe - remove_date = YEAR_QUARTER[i].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i].split("_")[1]] + remove_date = self.years_4_month_periods[i].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i].split("_")[1]] list_remove = list(df[~df.isin(df_)]) df_removed = pd.DataFrame( { 
@@ -205,7 +202,7 @@ def get_changes(self): ) ## Add Dataframe - add_date = YEAR_QUARTER[i + 1].split("_")[0] + "-" + quarter_dict[YEAR_QUARTER[i + 1].split("_")[1]] + add_date = self.years_4_month_periods[i + 1].split("_")[0] + "-" + quarter_dict[self.years_4_month_periods[i + 1].split("_")[1]] list_add = list(df_[~df_.isin(df)]) df_added = pd.DataFrame( {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add} @@ -249,17 +246,17 @@ def get_new_companies(self): ## Get index composition df_index = pd.read_csv( - self.ibov_index_composition.format(self.year + "_" + self.quarter + "Q"), on_bad_lines="skip" + self.ibov_index_composition.format(self.year + "_" + self.current_4_month_period), on_bad_lines="skip" ) df_date_first_added = pd.read_csv( - self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.quarter + "Q"), + self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.current_4_month_period), on_bad_lines="skip", ) df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]] df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter) # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter - df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[str(int(self.quarter) + 1) + "Q"] + df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period] df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]] df["symbol"] = df["symbol"].astype(str) + ".SA" From 6db33efe0b551944747a34072a2cb7e22ffd3a3d Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Fri, 1 Apr 2022 21:40:25 -0300 Subject: [PATCH 14/20] refactor: Added get_instruments() method inside utils.py for better code usability. Message in the PR request to understand the context of the change In the course of reviewing this PR we found two issues. 1. 
there are multiple places where the get_instruments() method is used, and we feel that scripts.index.py is the best place for the get_instruments() method to go. 2. data_collector.utils has some very generic stuff put inside it. --- scripts/data_collector/br_index/README.md | 4 +- scripts/data_collector/br_index/collector.py | 41 +----------------- scripts/data_collector/cn_index/collector.py | 44 +------------------ scripts/data_collector/us_index/README.md | 4 +- scripts/data_collector/us_index/collector.py | 41 +----------------- scripts/data_collector/utils.py | 45 ++++++++++++++++++++ 6 files changed, 53 insertions(+), 126 deletions(-) diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md index ca31e3f7a54..c24e4160915 100644 --- a/scripts/data_collector/br_index/README.md +++ b/scripts/data_collector/br_index/README.md @@ -53,9 +53,9 @@ With that reference, the index's composition can be compared quarter by quarter ```bash # parse instruments, using in qlib/instruments. 
-python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments --market_index br_index # parse new companies -python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies --market_index br_index ``` diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index 8c2cca1e245..fa12c1a89df 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -14,6 +14,7 @@ sys.path.append(str(CUR_DIR.parent.parent)) from data_collector.index import IndexBase +from data_collector.utils import get_instruments quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"} @@ -270,46 +271,6 @@ def filter_df(self, df: pd.DataFrame) -> pd.DataFrame: return df.loc[:, ["Código"]].copy() -def get_instruments( - qlib_dir: str, - index_name: str, - method: str = "parse_instruments", - freq: str = "day", - request_retry: int = 5, - retry_sleep: int = 3, -): - """ - - Parameters - ---------- - qlib_dir: str - qlib data dir, default "Path(__file__).parent/qlib_data" - index_name: str - index name, value from ["IBOV"] - method: str - method, value from ["parse_instruments", "save_new_companies"] - freq: str - freq, value from ["day", "1min"] - request_retry: int - request retry, by default 5 - retry_sleep: int - request sleep, by default 3 - - Examples - ------- - # parse instruments - $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments - - # parse new companies - $ python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies - - """ - _cur_module = importlib.import_module("data_collector.br_index.collector") - obj = 
getattr(_cur_module, f"{index_name.upper()}Index")( - qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep - ) - getattr(obj, method)() - if __name__ == "__main__": fire.Fire(get_instruments) diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index e5970c256d7..0fdfc658b4c 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -21,6 +21,7 @@ from data_collector.index import IndexBase from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry +from data_collector.utils import get_instruments NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls" @@ -315,7 +316,7 @@ def get_new_companies(self) -> pd.DataFrame: return df -class CSI300(CSIIndex): +class CSI300Index(CSIIndex): @property def index_code(self): return "000300" @@ -458,46 +459,5 @@ def get_new_companies(self) -> pd.DataFrame: return df -def get_instruments( - qlib_dir: str, - index_name: str, - method: str = "parse_instruments", - freq: str = "day", - request_retry: int = 5, - retry_sleep: int = 3, -): - """ - - Parameters - ---------- - qlib_dir: str - qlib data dir, default "Path(__file__).parent/qlib_data" - index_name: str - index name, value from ["csi100", "csi300"] - method: str - method, value from ["parse_instruments", "save_new_companies"] - freq: str - freq, value from ["day", "1min"] - request_retry: int - request retry, by default 5 - retry_sleep: int - request sleep, by default 3 - - Examples - ------- - # parse instruments - $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments - - # parse new companies - $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies - - """ - _cur_module = 
importlib.import_module("data_collector.cn_index.collector") - obj = getattr(_cur_module, f"{index_name.upper()}")( - qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep - ) - getattr(obj, method)() - - if __name__ == "__main__": fire.Fire(get_instruments) diff --git a/scripts/data_collector/us_index/README.md b/scripts/data_collector/us_index/README.md index f8244491c36..5a36bf94146 100644 --- a/scripts/data_collector/us_index/README.md +++ b/scripts/data_collector/us_index/README.md @@ -10,10 +10,10 @@ pip install -r requirements.txt ```bash # parse instruments, using in qlib/instruments. -python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments +python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments --market_index us_index # parse new companies -python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies +python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies --market_index us_index # index_name support: SP500, NASDAQ100, DJIA, SP400 # help diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 576b3c32ae3..3b3c629e07f 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -20,6 +20,7 @@ from data_collector.index import IndexBase from data_collector.utils import deco_retry, get_calendar_list, get_trading_date_by_shift +from data_collector.utils import get_instruments WIKI_URL = "https://en.wikipedia.org/wiki" @@ -269,46 +270,6 @@ def parse_instruments(self): logger.warning(f"No suitable data source has been found!") -def get_instruments( - qlib_dir: str, - index_name: str, - method: str = "parse_instruments", - freq: str = "day", - request_retry: int = 5, - retry_sleep: int = 3, -): - """ - - Parameters - 
---------- - qlib_dir: str - qlib data dir, default "Path(__file__).parent/qlib_data" - index_name: str - index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"] - method: str - method, value from ["parse_instruments", "save_new_companies"] - freq: str - freq, value from ["day", "1min"] - request_retry: int - request retry, by default 5 - retry_sleep: int - request sleep, by default 3 - - Examples - ------- - # parse instruments - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments - - # parse new companies - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies - - """ - _cur_module = importlib.import_module("data_collector.us_index.collector") - obj = getattr(_cur_module, f"{index_name.upper()}Index")( - qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep - ) - getattr(obj, method)() - if __name__ == "__main__": fire.Fire(get_instruments) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 943b8e9f347..3bb40fa3a16 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. 
import re +import importlib import time import bisect import pickle @@ -561,3 +562,47 @@ def generate_minutes_calendar_from_daily( if __name__ == "__main__": assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM + + +def get_instruments( + qlib_dir: str, + index_name: str, + method: str = "parse_instruments", + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + market_index: str = "cn_index" +): + """ + + Parameters + ---------- + qlib_dir: str + qlib data dir, default "Path(__file__).parent/qlib_data" + index_name: str + index name, value from ["csi100", "csi300"] + method: str + method, value from ["parse_instruments", "save_new_companies"] + freq: str + freq, value from ["day", "1min"] + request_retry: int + request retry, by default 5 + retry_sleep: int + request sleep, by default 3 + market_index: str + Where the files to obtain the index is located, for example data_collector.cn_index.collector + + Examples + ------- + # parse instruments + $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments + + # parse new companies + $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies + + """ + _cur_module = importlib.import_module("data_collector.{}.collector".format(market_index)) + obj = getattr(_cur_module, f"{index_name.upper()}Index")( + qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep + ) + getattr(obj, method)() \ No newline at end of file From ae6380aeead7c458a1097e125169c29faa9f0f9d Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Fri, 1 Apr 2022 21:57:11 -0300 Subject: [PATCH 15/20] refactor: improve brazils stocks download speed The reason to use retry=2 is due to the fact that Yahoo Finance unfortunately does not keep track of the majority of Brazilian stocks. 
Therefore, the decorator deco_retry with the retry argument set to 5 will keep trying to get the stock data 5 times, which makes downloading Brazilian stocks very slow. In the future this may change, but for now I suggest setting the retry argument to 1 or 2 in order to improve download speed. To achieve this, a class attribute called retry_config was added to YahooCollectorBR1d and YahooCollectorBR1min --- scripts/data_collector/yahoo/collector.py | 27 ++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 3b1cd79fe67..c85cb1f2c13 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import abc +from re import I import sys import copy import time @@ -147,7 +148,29 @@ def _show_logging_func(): def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp ) -> pd.DataFrame: - @deco_retry(retry_sleep=self.delay) + if hasattr(self, 'retry_config'): + """" + The reason to use retry=2 is due to the fact that + Yahoo Finance unfortunately does not keep track of the majority + of Brazilian stocks. + + Therefore, the decorator deco_retry with retry argument + set to 5 will keep trying to get the stock data 5 times, + which makes the code to download Brazilians stocks very slow. + + In future, this may change, but for now + I suggest to leave retry argument to 1 or 2 in + order to improve download speed. 
+ + In order to achieve this code logic an argument called retry_config + was added into YahooCollectorBR1d and YahooCollectorBR1min + """ + retry = self.retry_config + else: + # Default value + retry = 5 + + @deco_retry(retry_sleep=self.delay, retry=retry) def _get_simple(start_, end_): self.sleep() _remote_interval = "1m" if interval == self.INTERVAL_1min else interval @@ -333,10 +356,12 @@ def _timezone(self): class YahooCollectorBR1d(YahooCollectorBR): + retry_config = 2 pass class YahooCollectorBR1min(YahooCollectorBR): + retry_config = 2 pass From 1d80c4c14462b75b82845cfd5b1a5f4dd251b543 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sat, 2 Apr 2022 08:39:28 -0300 Subject: [PATCH 16/20] fix: added __main__ at the bottom of the script --- scripts/data_collector/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 3bb40fa3a16..7ef1cdf959d 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -559,11 +559,6 @@ def generate_minutes_calendar_from_daily( return pd.Index(sorted(set(np.hstack(res)))) - -if __name__ == "__main__": - assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM - - def get_instruments( qlib_dir: str, index_name: str, @@ -590,7 +585,8 @@ def get_instruments( retry_sleep: int request sleep, by default 3 market_index: str - Where the files to obtain the index is located, for example data_collector.cn_index.collector + Where the files to obtain the index are located, + for example data_collector.cn_index.collector Examples ------- @@ -605,4 +601,8 @@ def get_instruments( obj = getattr(_cur_module, f"{index_name.upper()}Index")( qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep ) - getattr(obj, method)() \ No newline at end of file + getattr(obj, method)() + + +if __name__ == "__main__": + assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM \ No 
newline at end of file From dc72c6beb86cae97288cd663798c8379f434c97b Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sat, 2 Apr 2022 08:39:54 -0300 Subject: [PATCH 17/20] refactor: changed interface inside each index Using partial as `fire.Fire(partial(get_instruments, market_index="br_index" ))` will make the interface easier for the user to execute the script. Then the collector.py CLI in each folder no longer needs the redundant argument. --- scripts/data_collector/br_index/README.md | 4 ++-- scripts/data_collector/br_index/collector.py | 3 ++- scripts/data_collector/us_index/README.md | 4 ++-- scripts/data_collector/us_index/collector.py | 3 ++- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/data_collector/br_index/README.md b/scripts/data_collector/br_index/README.md index c24e4160915..ca31e3f7a54 100644 --- a/scripts/data_collector/br_index/README.md +++ b/scripts/data_collector/br_index/README.md @@ -53,9 +53,9 @@ With that reference, the index's composition can be compared quarter by quarter ```bash # parse instruments, using in qlib/instruments. -python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments --market_index br_index +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments # parse new companies -python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies --market_index br_index +python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies ``` diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index fa12c1a89df..bbb012b5c90 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
+from functools import partial import sys from pathlib import Path import importlib @@ -273,4 +274,4 @@ def filter_df(self, df: pd.DataFrame) -> pd.DataFrame: if __name__ == "__main__": - fire.Fire(get_instruments) + fire.Fire(partial(get_instruments, market_index="br_index" )) diff --git a/scripts/data_collector/us_index/README.md b/scripts/data_collector/us_index/README.md index 5a36bf94146..f8244491c36 100644 --- a/scripts/data_collector/us_index/README.md +++ b/scripts/data_collector/us_index/README.md @@ -10,10 +10,10 @@ pip install -r requirements.txt ```bash # parse instruments, using in qlib/instruments. -python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments --market_index us_index +python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments # parse new companies -python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies --market_index us_index +python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies # index_name support: SP500, NASDAQ100, DJIA, SP400 # help diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 3b3c629e07f..06c48f8f627 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. 
import abc +from functools import partial import sys import importlib from pathlib import Path @@ -272,4 +273,4 @@ def parse_instruments(self): if __name__ == "__main__": - fire.Fire(get_instruments) + fire.Fire(partial(get_instruments, market_index="us_index")) From 6cc96cc432cc7b31f641b1bc6c84035c14cbae40 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sat, 2 Apr 2022 09:52:08 -0300 Subject: [PATCH 18/20] refactor: implemented class interface retry into YahooCollectorBR --- scripts/data_collector/yahoo/collector.py | 43 ++++++++++++----------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index c85cb1f2c13..3f77c07ee26 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -148,24 +148,8 @@ def _show_logging_func(): def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp ) -> pd.DataFrame: - if hasattr(self, 'retry_config'): - """" - The reason to use retry=2 is due to the fact that - Yahoo Finance unfortunately does not keep track of the majority - of Brazilian stocks. - - Therefore, the decorator deco_retry with retry argument - set to 5 will keep trying to get the stock data 5 times, - which makes the code to download Brazilians stocks very slow. - - In future, this may change, but for now - I suggest to leave retry argument to 1 or 2 in - order to improve download speed. 
- - In order to achieve this code logic an argument called retry_config - was added into YahooCollectorBR1d and YahooCollectorBR1min - """ - retry = self.retry_config + if hasattr(self, 'retry'): + retry = self.retry else: # Default value retry = 5 @@ -336,6 +320,25 @@ class YahooCollectorIN1min(YahooCollectorIN): class YahooCollectorBR(YahooCollector, ABC): + def retry(cls): + """" + The reason to use retry=2 is due to the fact that + Yahoo Finance unfortunately does not keep track of the majority + of Brazilian stocks. + + Therefore, the decorator deco_retry with retry argument + set to 5 will keep trying to get the stock data 5 times, + which makes the code to download Brazilians stocks very slow. + + In future, this may change, but for now + I suggest to leave retry argument to 1 or 2 in + order to improve download speed. + + In order to achieve this code logic an argument called retry_config + was added into YahooCollectorBR base class + """ + raise NotImplementedError + def get_instrument_list(self): logger.info("get BR stock symbols......") symbols = get_br_stock_symbols() + [ @@ -356,12 +359,12 @@ def _timezone(self): class YahooCollectorBR1d(YahooCollectorBR): - retry_config = 2 + retry = 2 pass class YahooCollectorBR1min(YahooCollectorBR): - retry_config = 2 + retry = 2 pass From 1cbfb5cf38d690b4ef196d6a59cb25a359a17531 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sun, 3 Apr 2022 11:50:31 -0300 Subject: [PATCH 19/20] docs: added BR as a possible region into the documentation --- scripts/data_collector/yahoo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index 28acfbd2c9d..3ce9bae7f61 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -66,7 +66,7 @@ pip install -r requirements.txt - `source_dir`: save the directory - `interval`: `1d` or `1min`, by default `1d` > **due to the limitation of the 
*YahooFinance API*, only the last month's data is available in `1min`** - - `region`: `CN` or `US` or `IN`, by default `CN` + - `region`: `CN` or `US` or `IN` or `BR`, by default `CN` - `delay`: `time.sleep(delay)`, by default *0.5* - `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)* - `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)* From c31380481d189d5b7e05490dd9c662f975dffa97 Mon Sep 17 00:00:00 2001 From: Igor Lima Date: Sun, 3 Apr 2022 11:51:09 -0300 Subject: [PATCH 20/20] refactor: make retry attribute part of the interface This way we don't have to use hasattr to access the retry attribute as previously done --- scripts/data_collector/yahoo/collector.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 3f77c07ee26..d57a3057b85 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -44,6 +44,8 @@ class YahooCollector(BaseCollector): + retry = 5 # Configuration attribute. How many times will it try to re-request the data if the network fails. 
+ def __init__( self, save_dir: [str, Path], @@ -148,13 +150,7 @@ def _show_logging_func(): def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp ) -> pd.DataFrame: - if hasattr(self, 'retry'): - retry = self.retry - else: - # Default value - retry = 5 - - @deco_retry(retry_sleep=self.delay, retry=retry) + @deco_retry(retry_sleep=self.delay, retry=self.retry) def _get_simple(start_, end_): self.sleep() _remote_interval = "1m" if interval == self.INTERVAL_1min else interval @@ -323,18 +319,18 @@ class YahooCollectorBR(YahooCollector, ABC): def retry(cls): """" The reason to use retry=2 is due to the fact that - Yahoo Finance unfortunately does not keep track of the majority - of Brazilian stocks. + Yahoo Finance unfortunately does not keep track of some + Brazilian stocks. Therefore, the decorator deco_retry with retry argument - set to 5 will keep trying to get the stock data 5 times, + set to 5 will keep trying to get the stock data up to 5 times, which makes the code to download Brazilians stocks very slow. In future, this may change, but for now I suggest to leave retry argument to 1 or 2 in order to improve download speed. - In order to achieve this code logic an argument called retry_config + To achieve this goal an abstract attribute (retry) was added into YahooCollectorBR base class """ raise NotImplementedError