From 7564a821a356cfeed36f4e434dec9c3671a6dadc Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:27:59 -0400 Subject: [PATCH 01/25] wip --- config.yaml | 12 ++++++----- download_analytics/__main__.py | 1 + download_analytics/bq.py | 37 +++++++++++++++++++++++++++------- download_analytics/pypi.py | 13 ++++++------ 4 files changed, 45 insertions(+), 18 deletions(-) diff --git a/config.yaml b/config.yaml index 5fd787a..1986c9f 100644 --- a/config.yaml +++ b/config.yaml @@ -8,14 +8,16 @@ projects: - deepecho - sdmetrics - sdgym - - gretel-synthetics - - ydata-synthetic - synthesized - datomize - - gretel-trainer - - ydata-sdk - - mostlyai - synthcity - smartnoise-synth - realtabformer - be-great + - ydata-synthetic + - ydata-sdk + - gretel-synthetics + - gretel-trainer + - gretel-client + - mostlyai + - mostlyai-mock diff --git a/download_analytics/__main__.py b/download_analytics/__main__.py index 312184c..fef2f4a 100644 --- a/download_analytics/__main__.py +++ b/download_analytics/__main__.py @@ -124,6 +124,7 @@ def _get_parser(): '--projects', nargs='*', help='List of projects to collect. If not given use the configured ones.', + default=None, ) collect.add_argument( '-s', diff --git a/download_analytics/bq.py b/download_analytics/bq.py index 7e00a09..e54ab40 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -6,6 +6,7 @@ import logging import os import pathlib +import pandas as pd from google.cloud import bigquery from google.oauth2 import service_account @@ -24,11 +25,17 @@ def _get_bq_client(credentials_file): LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar') - service_account_info = json.loads(credentials_contents) - credentials = service_account.Credentials.from_service_account_info( - service_account_info, - scopes=['https://www.googleapis.com/auth/cloud-platform'], - ) + if os.path.exists(credentials_contents): + credentials = service_account.Credentials.from_service_account_file( + credentials_contents, + scopes=['https://www.googleapis.com/auth/cloud-platform'], + ) + else: + service_account_info = json.loads(credentials_contents) + credentials = service_account.Credentials.from_service_account_info( + service_account_info, + scopes=['https://www.googleapis.com/auth/cloud-platform'], + ) return bigquery.Client( credentials=credentials, @@ -44,7 +51,13 @@ def run_query(query, dry_run=False, credentials_file=None): job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False) dry_run_job = client.query(query, job_config=job_config) - LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3) + LOGGER.info('Estimated data processed in query (GBs): %.2f', dry_run_job.total_bytes_processed / 1024**3) + # https://cloud.google.com/bigquery/pricing#on_demand_pricing + # assuming have hit 1 terabyte processed in month + cost_per_terabyte = 6.15 + bytes = dry_run_job.total_bytes_processed + cost = cost_per_terabyte * bytes_to_terabytes(bytes) + LOGGER.info('Estimated cost for query: $%.2f', cost) if dry_run: return None @@ -53,5 +66,15 @@ def run_query(query, dry_run=False, credentials_file=None): data = query_job.to_dataframe() LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3) - + cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed) + LOGGER.info('Total cost for query: $%.2f', cost) return data + +def bytes_to_megabytes(bytes): + return bytes / 1024 / 1024 + +def bytes_to_gigabytes(bytes): + return bytes_to_megabytes(bytes) / 1024 + +def bytes_to_terabytes(bytes): + return bytes_to_gigabytes(bytes) / 1024 diff --git a/download_analytics/pypi.py b/download_analytics/pypi.py index dd834c7..84538df 100644 --- a/download_analytics/pypi.py +++ b/download_analytics/pypi.py @@ -1,7 +1,7 @@ """Functions to get PyPI downloads from Google Big Query.""" import logging -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone import pandas as pd @@ -67,10 +67,12 @@ def _get_query(projects, start_date, end_date): def _get_query_dates(start_date, min_date, max_date, max_days, force=False): - end_date = datetime.utcnow().date() + end_date = datetime.now(timezone.utc).date() if start_date is None: start_date = end_date - timedelta(days=max_days) + start_date = start_date.date() + if pd.notna(min_date): min_date = pd.Timestamp(min_date).date() if min_date > start_date: @@ -131,8 +133,8 @@ def get_pypi_downloads( projects = (projects,) previous_projects = previous[previous.project.isin(projects)] - min_date = previous_projects.timestamp.min() - max_date = previous_projects.timestamp.max() + min_date = previous_projects.timestamp.min().date() + max_date = previous_projects.timestamp.max().date() else: previous = pd.DataFrame(columns=OUTPUT_COLUMNS) min_date = None @@ -150,7 +152,7 @@ def get_pypi_downloads( if max_date is None: all_downloads = new_downloads else: - if pd.Timestamp(max_date) < pd.Timestamp(end_date): + if max_date <= end_date: before = previous[previous.timestamp < new_downloads.timestamp.min()] after = new_downloads else: @@ -160,5 +162,4 @@ def get_pypi_downloads( all_downloads = pd.concat([before, after], ignore_index=True) LOGGER.info('Obtained %s new downloads', len(all_downloads) - len(previous)) - return all_downloads From b4bcf8729293708420d02ecdc613d94c86e889a1 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:30:16 -0400 Subject: [PATCH 02/25] lint --- download_analytics/bq.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index e54ab40..fe6e74a 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -6,7 +6,6 @@ import logging import os import pathlib -import pandas as pd from google.cloud import bigquery from google.oauth2 import service_account @@ -51,7 +50,8 @@ def run_query(query, dry_run=False, credentials_file=None): job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False) dry_run_job = client.query(query, job_config=job_config) - LOGGER.info('Estimated data processed in query (GBs): %.2f', dry_run_job.total_bytes_processed / 1024**3) + data_processed_gbs = dry_run_job.total_bytes_processed / 1024**3 + LOGGER.info('Estimated data processed in query (GBs): %.2f', data_processed_gbs) # https://cloud.google.com/bigquery/pricing#on_demand_pricing # assuming have hit 1 terabyte processed in month cost_per_terabyte = 6.15 @@ -70,11 +70,17 @@ def run_query(query, dry_run=False, credentials_file=None): LOGGER.info('Total cost for query: $%.2f', cost) return data + def bytes_to_megabytes(bytes): + """Convert bytes to megabytes.""" return bytes / 1024 / 1024 + def bytes_to_gigabytes(bytes): + """Convert bytes to gigabytes.""" return bytes_to_megabytes(bytes) / 1024 + def bytes_to_terabytes(bytes): + """Convert bytes to terabytes.""" return bytes_to_gigabytes(bytes) / 1024 From cfa02faada4ed2d5659da701c1edaf4ff9f120a4 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:31:46 -0400 Subject: [PATCH 03/25] fix start date --- .github/workflows/dryrun.yaml | 8 ++++---- .github/workflows/lint.yaml | 8 ++++---- download_analytics/pypi.py | 2 -- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/dryrun.yaml b/.github/workflows/dryrun.yaml index a6a8350..343e0d8 100644 --- a/.github/workflows/dryrun.yaml +++ b/.github/workflows/dryrun.yaml @@ -1,5 +1,4 @@ name: Health-check Dry Run - on: workflow_dispatch: inputs: @@ -7,10 +6,11 @@ on: description: Slack channel to post the error message to if the builds fail. required: false default: "sdv-alerts-debug" - - push: pull_request: - + types: + - opened + - synchronize + - ready_for_review jobs: dry_run: runs-on: ubuntu-latest diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index e84f94b..af2634e 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -1,10 +1,10 @@ name: Style Checks - on: - push: pull_request: - types: [opened, reopened] - + types: + - opened + - synchronize + - ready_for_review jobs: lint: runs-on: ubuntu-latest diff --git a/download_analytics/pypi.py b/download_analytics/pypi.py index 84538df..e50da07 100644 --- a/download_analytics/pypi.py +++ b/download_analytics/pypi.py @@ -71,8 +71,6 @@ def _get_query_dates(start_date, min_date, max_date, max_days, force=False): if start_date is None: start_date = end_date - timedelta(days=max_days) - start_date = start_date.date() - if pd.notna(min_date): min_date = pd.Timestamp(min_date).date() if min_date > start_date: From 18bbb95256e444544db2fad64ef633255fb231ba Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:39:16 -0400 Subject: [PATCH 04/25] add project print --- download_analytics/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/download_analytics/main.py b/download_analytics/main.py index f58b19f..ed5a09d 100644 --- a/download_analytics/main.py +++ b/download_analytics/main.py @@ -47,6 +47,8 @@ def collect_downloads( if not projects: raise ValueError('No projects have been passed') + LOGGER.info('Collecting downloads for projects={projects}') + csv_path = get_path(output_folder, 'pypi.csv') previous = load_csv(csv_path) From 059e9ac044b7e9afe680ba6a775b27d2fe566f52 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:44:28 -0400 Subject: [PATCH 05/25] fix print --- download_analytics/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download_analytics/main.py b/download_analytics/main.py index ed5a09d..1050a99 100644 --- a/download_analytics/main.py +++ b/download_analytics/main.py @@ -47,7 +47,7 @@ def collect_downloads( if not projects: raise ValueError('No projects have been passed') - LOGGER.info('Collecting downloads for projects={projects}') + LOGGER.info(f'Collecting downloads for projects={projects}') csv_path = get_path(output_folder, 'pypi.csv') previous = load_csv(csv_path) From c735af7dec72d8f9ca0274d071e0fd6044c5b346 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 11:56:25 -0400 Subject: [PATCH 06/25] update message --- .github/workflows/dryrun.yaml | 3 +++ .github/workflows/lint.yaml | 3 +++ download_analytics/main.py | 6 +++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dryrun.yaml b/.github/workflows/dryrun.yaml index 343e0d8..b1260c5 100644 --- a/.github/workflows/dryrun.yaml +++ b/.github/workflows/dryrun.yaml @@ -11,6 +11,9 @@ on: - opened - synchronize - ready_for_review +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: dry_run: runs-on: ubuntu-latest diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index af2634e..e35cf0a 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -5,6 +5,9 @@ on: - opened - synchronize - ready_for_review +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: lint: runs-on: ubuntu-latest diff --git a/download_analytics/main.py b/download_analytics/main.py index 1050a99..2eb12ab 100644 --- a/download_analytics/main.py +++ b/download_analytics/main.py @@ -65,7 +65,11 @@ def collect_downloads( if pypi_downloads.empty: LOGGER.info('Not creating empty CSV file %s', csv_path) elif pypi_downloads.equals(previous): - LOGGER.info('Skipping update of unmodified CSV file %s', csv_path) + msg = f'Skipping update of unmodified CSV file {csv_path}' + if dry_run: + msg += 'because dry_run={dry_run}, meaning no downloads were returned from BigQuery' + LOGGER.info(msg) + else: create_csv(csv_path, pypi_downloads) From f1d9a31a8793699b2d24333d77b3dfd9b9866299 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:21:11 -0400 Subject: [PATCH 07/25] update to use pyarrow dtypes --- .github/workflows/daily.yaml | 1 + download_analytics/bq.py | 14 +++++++++++++- download_analytics/main.py | 2 +- download_analytics/metrics.py | 2 +- download_analytics/output.py | 10 +++++++--- pyproject.toml | 2 ++ 6 files changed, 25 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index eddf8d3..7ae9117 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -13,6 +13,7 @@ on: jobs: collect: runs-on: ubuntu-latest + timeout-minutes: 30 steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/download_analytics/bq.py b/download_analytics/bq.py index fe6e74a..c225e56 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -7,6 +7,8 @@ import os import pathlib +import pandas as pd +import pyarrow as pa from google.cloud import bigquery from google.oauth2 import service_account @@ -25,11 +27,13 @@ def _get_bq_client(credentials_file): LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar') if os.path.exists(credentials_contents): + LOGGER.info('Loading BigQuery credentials from service account file') credentials = service_account.Credentials.from_service_account_file( credentials_contents, scopes=['https://www.googleapis.com/auth/cloud-platform'], ) else: + LOGGER.info('Loading BigQuery credentials from service account info') service_account_info = json.loads(credentials_contents) credentials = service_account.Credentials.from_service_account_info( service_account_info, @@ -63,7 +67,15 @@ def run_query(query, dry_run=False, credentials_file=None): return None query_job = client.query(query) - data = query_job.to_dataframe() + dataframe_args = { + "create_bqstorage_client": True, + "bool_dtype": pd.ArrowDtype(pa.bool_()), + "int_dtype": pd.ArrowDtype(pa.int64()), + "float_dtype": pd.ArrowDtype(pa.float64()), + "string_dtype": pd.ArrowDtype(pa.string()), + "timestamp_dtype": pd.ArrowDtype(pa.timestamp("s", tz="UTC")), + } + data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3) cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed) diff --git a/download_analytics/main.py b/download_analytics/main.py index 2eb12ab..7f985ee 100644 --- a/download_analytics/main.py +++ b/download_analytics/main.py @@ -67,7 +67,7 @@ def collect_downloads( elif pypi_downloads.equals(previous): msg = f'Skipping update of unmodified CSV file {csv_path}' if dry_run: - msg += 'because dry_run={dry_run}, meaning no downloads were returned from BigQuery' + msg += f' because dry_run={dry_run}, meaning no downloads were returned from BigQuery' LOGGER.info(msg) else: diff --git a/download_analytics/metrics.py b/download_analytics/metrics.py index 3b21667..fe337d2 100644 --- a/download_analytics/metrics.py +++ b/download_analytics/metrics.py @@ -135,7 +135,7 @@ def _version_order_key(version_column): def _mangle_columns(downloads): downloads = downloads.rename(columns=RENAME_COLUMNS) downloads['full_python_version'] = downloads['python_version'] - downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0] + downloads['python_version'] = downloads['python_version'].astype("string").str.rsplit('.', n=1).str[0] downloads['project_version'] = downloads['project'] + '-' + downloads['version'] downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version'] downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel'] diff --git a/download_analytics/output.py b/download_analytics/output.py index 3e031a9..a792fef 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -161,13 +161,17 @@ def load_csv(csv_path): LOGGER.info('Trying to load CSV file %s', csv_path) try: + read_csv_kwargs = { + 'parse_dates': ['timestamp'], + 'engine': 'pyarrow', + 'dtype_backend':'pyarrow' + } if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) stream = drive.download(folder, filename) - data = pd.read_csv(stream, parse_dates=['timestamp']) + data = pd.read_csv(stream, **read_csv_kwargs) else: - data = pd.read_csv(csv_path, parse_dates=['timestamp']) - + data = pd.read_csv(csv_path, **read_csv_kwargs) except FileNotFoundError: LOGGER.info('Failed to load CSV file %s: not found', csv_path) return None diff --git a/pyproject.toml b/pyproject.toml index ae5c259..6ae4b88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,8 +21,10 @@ dependencies = [ "PyYAML", "PyDrive", "google-cloud-bigquery", + "google-cloud-bigquery-storage", "db-dtypes", "httplib2==0.15.0", # https://stackoverflow.com/questions/59815620/gcloud-upload-httplib2-redirectmissinglocation-redirected-but-the-response-is-m + 'pyarrow >= 15.0.0', ] [project.urls] From f73291fff0392b47b4c609f0afce306206fac280 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:22:02 -0400 Subject: [PATCH 08/25] fix string --- download_analytics/bq.py | 12 ++++++------ download_analytics/metrics.py | 3 ++- download_analytics/output.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index c225e56..d0ee737 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -68,12 +68,12 @@ def run_query(query, dry_run=False, credentials_file=None): query_job = client.query(query) dataframe_args = { - "create_bqstorage_client": True, - "bool_dtype": pd.ArrowDtype(pa.bool_()), - "int_dtype": pd.ArrowDtype(pa.int64()), - "float_dtype": pd.ArrowDtype(pa.float64()), - "string_dtype": pd.ArrowDtype(pa.string()), - "timestamp_dtype": pd.ArrowDtype(pa.timestamp("s", tz="UTC")), + 'create_bqstorage_client': True, + 'bool_dtype': pd.ArrowDtype(pa.bool_()), + 'int_dtype': pd.ArrowDtype(pa.int64()), + 'float_dtype': pd.ArrowDtype(pa.float64()), + 'string_dtype': pd.ArrowDtype(pa.string()), + 'timestamp_dtype': pd.ArrowDtype(pa.timestamp('s', tz='UTC')), } data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) diff --git a/download_analytics/metrics.py b/download_analytics/metrics.py index fe337d2..4813222 100644 --- a/download_analytics/metrics.py +++ b/download_analytics/metrics.py @@ -135,7 +135,8 @@ def _version_order_key(version_column): def _mangle_columns(downloads): downloads = downloads.rename(columns=RENAME_COLUMNS) downloads['full_python_version'] = downloads['python_version'] - downloads['python_version'] = downloads['python_version'].astype("string").str.rsplit('.', n=1).str[0] + downloads['python_version'] = downloads['python_version'].astype('string') + downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0] downloads['project_version'] = downloads['project'] + '-' + downloads['version'] downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version'] downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel'] diff --git a/download_analytics/output.py b/download_analytics/output.py index a792fef..17966e4 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -164,7 +164,7 @@ def load_csv(csv_path): read_csv_kwargs = { 'parse_dates': ['timestamp'], 'engine': 'pyarrow', - 'dtype_backend':'pyarrow' + 'dtype_backend': 'pyarrow', } if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) From 20a0108925d9e14ce56e69286125044962c05c42 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:31:59 -0400 Subject: [PATCH 09/25] update to ubuntu-latest-largeA --- .github/workflows/daily.yaml | 2 +- .github/workflows/dryrun.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 7ae9117..2771008 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -12,7 +12,7 @@ on: jobs: collect: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-large timeout-minutes: 30 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/dryrun.yaml b/.github/workflows/dryrun.yaml index b1260c5..28f8959 100644 --- a/.github/workflows/dryrun.yaml +++ b/.github/workflows/dryrun.yaml @@ -16,7 +16,7 @@ concurrency: cancel-in-progress: true jobs: dry_run: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-large steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From ecdc74df6ba1bbf79501c67bc059d23765a8ffd7 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:42:15 -0400 Subject: [PATCH 10/25] update to ubuntu --- .github/workflows/dryrun.yaml | 2 +- download_analytics/main.py | 2 +- download_analytics/output.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dryrun.yaml b/.github/workflows/dryrun.yaml index 28f8959..b1260c5 100644 --- a/.github/workflows/dryrun.yaml +++ b/.github/workflows/dryrun.yaml @@ -16,7 +16,7 @@ concurrency: cancel-in-progress: true jobs: dry_run: - runs-on: ubuntu-latest-large + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/download_analytics/main.py b/download_analytics/main.py index 7f985ee..0f4cbaa 100644 --- a/download_analytics/main.py +++ b/download_analytics/main.py @@ -50,7 +50,7 @@ def collect_downloads( LOGGER.info(f'Collecting downloads for projects={projects}') csv_path = get_path(output_folder, 'pypi.csv') - previous = load_csv(csv_path) + previous = load_csv(csv_path, dry_run=dry_run) pypi_downloads = get_pypi_downloads( projects=projects, diff --git a/download_analytics/output.py b/download_analytics/output.py index 17966e4..5185aa5 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -145,7 +145,7 @@ def load_spreadsheet(spreadsheet): return sheets -def load_csv(csv_path): +def load_csv(csv_path, dry_run=False): """Load a CSV previously created by download-analytics. Args: @@ -166,6 +166,10 @@ def load_csv(csv_path): 'engine': 'pyarrow', 'dtype_backend': 'pyarrow', } + if dry_run: + nrows = 1_000_000 + LOGGER.info('Only reading first 1 million rows') + read_csv_kwargs['nrows'] = nrows if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) stream = drive.download(folder, filename) From a694d4c618ac5a84b093db04e74b0a4bc412a3dc Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:46:36 -0400 Subject: [PATCH 11/25] fix engine --- download_analytics/output.py | 1 - 1 file changed, 1 deletion(-) diff --git a/download_analytics/output.py b/download_analytics/output.py index 5185aa5..32d08e0 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -163,7 +163,6 @@ def load_csv(csv_path, dry_run=False): try: read_csv_kwargs = { 'parse_dates': ['timestamp'], - 'engine': 'pyarrow', 'dtype_backend': 'pyarrow', } if dry_run: From 420cf09a01fa7379122b9fb455eecfeb78669dba Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 13:47:48 -0400 Subject: [PATCH 12/25] docstring --- download_analytics/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download_analytics/output.py b/download_analytics/output.py index 32d08e0..c7805e8 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -167,7 +167,7 @@ def load_csv(csv_path, dry_run=False): } if dry_run: nrows = 1_000_000 - LOGGER.info('Only reading first 1 million rows') + LOGGER.info('Only reading first 1 million rows because dry-run') read_csv_kwargs['nrows'] = nrows if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) From e6b751be044d063de18ca9630cd8d7f956f57104 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 14:12:08 -0400 Subject: [PATCH 13/25] use category dtype --- download_analytics/metrics.py | 5 ++++- download_analytics/output.py | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/download_analytics/metrics.py b/download_analytics/metrics.py index 4813222..0c27ae7 100644 --- a/download_analytics/metrics.py +++ b/download_analytics/metrics.py @@ -134,8 +134,11 @@ def _version_order_key(version_column): def _mangle_columns(downloads): downloads = downloads.rename(columns=RENAME_COLUMNS) + for col in ['python_version', 'project', 'version', + 'distro_name', 'distro_version', 'distro_kernel']: + downloads[col] = downloads[col].astype("string") + downloads['full_python_version'] = downloads['python_version'] - downloads['python_version'] = downloads['python_version'].astype('string') downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0] downloads['project_version'] = downloads['project'] + '-' + downloads['version'] downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version'] diff --git a/download_analytics/output.py b/download_analytics/output.py index c7805e8..b35f184 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -163,12 +163,27 @@ def load_csv(csv_path, dry_run=False): try: read_csv_kwargs = { 'parse_dates': ['timestamp'], - 'dtype_backend': 'pyarrow', + 'dtype': { + 'country_code': pd.CategoricalDtype(), + 'project': pd.CategoricalDtype(), + 'version': pd.CategoricalDtype(), + 'type': pd.CategoricalDtype(), + 'installer_name': pd.CategoricalDtype(), + 'implementation_name': pd.CategoricalDtype(), + 'implementation_version': pd.CategoricalDtype(), + 'distro_name': pd.CategoricalDtype(), + 'distro_version': pd.CategoricalDtype(), + 'system_name': pd.CategoricalDtype(), + 'system_release': pd.CategoricalDtype(), + 'cpu': pd.CategoricalDtype(), + } } if dry_run: nrows = 1_000_000 LOGGER.info('Only reading first 1 million rows because dry-run') read_csv_kwargs['nrows'] = nrows + # else: + # read_csv_kwargs['engine'] = 'pyarrow' if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) stream = drive.download(folder, filename) From 4924d5bdd159b0f225f7387a751ea37e21dbf73b Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 14:40:17 -0400 Subject: [PATCH 14/25] remove pyarrow --- download_analytics/bq.py | 9 +++------ download_analytics/metrics.py | 12 +++++++++--- download_analytics/output.py | 4 +--- pyproject.toml | 1 - 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index d0ee737..d070bd3 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -7,8 +7,8 @@ import os import pathlib +import numpy as np import pandas as pd -import pyarrow as pa from google.cloud import bigquery from google.oauth2 import service_account @@ -69,11 +69,8 @@ def run_query(query, dry_run=False, credentials_file=None): query_job = client.query(query) dataframe_args = { 'create_bqstorage_client': True, - 'bool_dtype': pd.ArrowDtype(pa.bool_()), - 'int_dtype': pd.ArrowDtype(pa.int64()), - 'float_dtype': pd.ArrowDtype(pa.float64()), - 'string_dtype': pd.ArrowDtype(pa.string()), - 'timestamp_dtype': pd.ArrowDtype(pa.timestamp('s', tz='UTC')), + 'string_dtype': pd.StringDtype(), + 'timestamp_dtype': np.dtype('datetime64[ns, UTC]'), } data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) diff --git a/download_analytics/metrics.py b/download_analytics/metrics.py index 0c27ae7..dd2a050 100644 --- a/download_analytics/metrics.py +++ b/download_analytics/metrics.py @@ -134,9 +134,15 @@ def _version_order_key(version_column): def _mangle_columns(downloads): downloads = downloads.rename(columns=RENAME_COLUMNS) - for col in ['python_version', 'project', 'version', - 'distro_name', 'distro_version', 'distro_kernel']: - downloads[col] = downloads[col].astype("string") + for col in [ + 'python_version', + 'project', + 'version', + 'distro_name', + 'distro_version', + 'distro_kernel', + ]: + downloads[col] = downloads[col].astype('string') downloads['full_python_version'] = downloads['python_version'] downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0] diff --git a/download_analytics/output.py b/download_analytics/output.py index b35f184..f9756c2 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -176,14 +176,12 @@ def load_csv(csv_path, dry_run=False): 'system_name': pd.CategoricalDtype(), 'system_release': pd.CategoricalDtype(), 'cpu': pd.CategoricalDtype(), - } + }, } if dry_run: nrows = 1_000_000 LOGGER.info('Only reading first 1 million rows because dry-run') read_csv_kwargs['nrows'] = nrows - # else: - # read_csv_kwargs['engine'] = 'pyarrow' if drive.is_drive_path(csv_path): folder, filename = drive.split_drive_path(csv_path) stream = drive.download(folder, filename) diff --git a/pyproject.toml b/pyproject.toml index 6ae4b88..4351af0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "google-cloud-bigquery-storage", "db-dtypes", "httplib2==0.15.0", # https://stackoverflow.com/questions/59815620/gcloud-upload-httplib2-redirectmissinglocation-redirected-but-the-response-is-m - 'pyarrow >= 15.0.0', ] [project.urls] From 3c380e4d800de9c021b91f204b7bbdfdf3ea01c6 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 14:44:34 -0400 Subject: [PATCH 15/25] fix ns --- download_analytics/bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index d070bd3..e4e0c10 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -70,7 +70,7 @@ def run_query(query, dry_run=False, credentials_file=None): dataframe_args = { 'create_bqstorage_client': True, 'string_dtype': pd.StringDtype(), - 'timestamp_dtype': np.dtype('datetime64[ns, UTC]'), + 'timestamp_dtype': np.dtype('datetime64[ns]') } data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) From 21c47010a7017885e542611b030d664607e0654c Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 14:45:26 -0400 Subject: [PATCH 16/25] lint --- download_analytics/bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index e4e0c10..0b68234 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -70,7 +70,7 @@ def run_query(query, dry_run=False, credentials_file=None): dataframe_args = { 'create_bqstorage_client': True, 'string_dtype': pd.StringDtype(), - 'timestamp_dtype': np.dtype('datetime64[ns]') + 'timestamp_dtype': np.dtype('datetime64[ns]'), } data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) From b71f64bd79ac8cef8f515d77673f2180d12a6f21 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:00:13 -0400 Subject: [PATCH 17/25] use pyarrow everywhere --- download_analytics/bq.py | 9 ++++++--- download_analytics/output.py | 25 +++++++++++++------------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index 0b68234..2cf96d1 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -7,8 +7,8 @@ import os import pathlib -import numpy as np import pandas as pd +import pyarrow as pa from google.cloud import bigquery from google.oauth2 import service_account @@ -69,8 +69,11 @@ def run_query(query, dry_run=False, credentials_file=None): query_job = client.query(query) dataframe_args = { 'create_bqstorage_client': True, - 'string_dtype': pd.StringDtype(), - 'timestamp_dtype': np.dtype('datetime64[ns]'), + 'bool_dtype': pd.ArrowDtype(pa.bool_()), + 'int_dtype': pd.ArrowDtype(pa.int64()), + 'float_dtype': pd.ArrowDtype(pa.float64()), + 'string_dtype': pd.ArrowDtype(pa.string()), + 'timestamp_dtype': pd.ArrowDtype(pa.timestamp('ns', tz='UTC')), } data = query_job.to_dataframe(**dataframe_args) LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) diff --git a/download_analytics/output.py b/download_analytics/output.py index f9756c2..f7d6b20 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -5,6 +5,7 @@ import pathlib import pandas as pd +import pyarrow as pa from download_analytics import drive @@ -164,18 +165,18 @@ def load_csv(csv_path, dry_run=False): read_csv_kwargs = { 'parse_dates': ['timestamp'], 'dtype': { - 'country_code': pd.CategoricalDtype(), - 'project': pd.CategoricalDtype(), - 'version': pd.CategoricalDtype(), - 'type': pd.CategoricalDtype(), - 'installer_name': pd.CategoricalDtype(), - 'implementation_name': pd.CategoricalDtype(), - 'implementation_version': pd.CategoricalDtype(), - 'distro_name': pd.CategoricalDtype(), - 'distro_version': pd.CategoricalDtype(), - 'system_name': pd.CategoricalDtype(), - 'system_release': pd.CategoricalDtype(), - 'cpu': pd.CategoricalDtype(), + 'country_code': pd.ArrowDtype(pa.string()), + 'project': pd.ArrowDtype(pa.string()), + 'version': pd.ArrowDtype(pa.string()), + 'type': pd.ArrowDtype(pa.string()), + 'installer_name': pd.ArrowDtype(pa.string()), + 'implementation_name': pd.ArrowDtype(pa.string()), + 'implementation_version': pd.ArrowDtype(pa.string()), + 'distro_name': pd.ArrowDtype(pa.string()), + 'distro_version': pd.ArrowDtype(pa.string()), + 'system_name': pd.ArrowDtype(pa.string()), + 'system_release': pd.ArrowDtype(pa.string()), + 'cpu': pd.ArrowDtype(pa.string()), }, } if dry_run: From 6384bc9bd1bfaeb9d19a20392ae240d88dc45615 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:05:51 -0400 Subject: [PATCH 18/25] remove pyarrow dtypes --- download_analytics/bq.py | 12 +----------- download_analytics/output.py | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/download_analytics/bq.py b/download_analytics/bq.py index 2cf96d1..0d56588 100644 --- a/download_analytics/bq.py +++ b/download_analytics/bq.py @@ -7,8 +7,6 @@ import os import pathlib -import pandas as pd -import pyarrow as pa from google.cloud import bigquery from google.oauth2 import service_account @@ -67,15 +65,7 @@ def run_query(query, dry_run=False, credentials_file=None): return None query_job = client.query(query) - dataframe_args = { - 'create_bqstorage_client': True, - 'bool_dtype': pd.ArrowDtype(pa.bool_()), - 'int_dtype': pd.ArrowDtype(pa.int64()), - 'float_dtype': pd.ArrowDtype(pa.float64()), - 'string_dtype': pd.ArrowDtype(pa.string()), - 'timestamp_dtype': pd.ArrowDtype(pa.timestamp('ns', tz='UTC')), - } - data = query_job.to_dataframe(**dataframe_args) + data = query_job.to_dataframe() LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3) LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3) cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed) diff --git a/download_analytics/output.py b/download_analytics/output.py index f7d6b20..f9756c2 100644 --- a/download_analytics/output.py +++ b/download_analytics/output.py @@ -5,7 +5,6 @@ import pathlib import pandas as pd -import pyarrow as pa from download_analytics import drive @@ -165,18 +164,18 @@ def load_csv(csv_path, dry_run=False): read_csv_kwargs = { 'parse_dates': ['timestamp'], 'dtype': { - 'country_code': pd.ArrowDtype(pa.string()), - 'project': pd.ArrowDtype(pa.string()), - 'version': pd.ArrowDtype(pa.string()), - 'type': pd.ArrowDtype(pa.string()), - 'installer_name': pd.ArrowDtype(pa.string()), - 'implementation_name': pd.ArrowDtype(pa.string()), - 'implementation_version': pd.ArrowDtype(pa.string()), - 'distro_name': pd.ArrowDtype(pa.string()), - 'distro_version': pd.ArrowDtype(pa.string()), - 'system_name': pd.ArrowDtype(pa.string()), - 'system_release': pd.ArrowDtype(pa.string()), - 'cpu': pd.ArrowDtype(pa.string()), + 'country_code': pd.CategoricalDtype(), + 'project': pd.CategoricalDtype(), + 'version': pd.CategoricalDtype(), + 'type': pd.CategoricalDtype(), + 'installer_name': pd.CategoricalDtype(), + 'implementation_name': pd.CategoricalDtype(), + 'implementation_version': pd.CategoricalDtype(), + 'distro_name': pd.CategoricalDtype(), + 'distro_version': pd.CategoricalDtype(), + 'system_name': pd.CategoricalDtype(), + 'system_release': pd.CategoricalDtype(), + 'cpu': pd.CategoricalDtype(), }, } if dry_run: From dac71d554e9d8eb53d8ea821ec776849acf572a9 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:25:56 -0400 Subject: [PATCH 19/25] add readme instructions --- .github/workflows/manual.yaml | 2 +- README.md | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml index 6d9b8d2..aec4ec1 100644 --- a/.github/workflows/manual.yaml +++ b/.github/workflows/manual.yaml @@ -38,7 +38,7 @@ jobs: download-analytics collect \ --verbose \ --projects ${{ github.event.inputs.projects }} \ - --max-days ${{ github.event.inputs.max_days }} \ + ${{ inputs.max_days && '--max_days github.event.inputs.max_days' || ''}} --output-folder gdrive://${{ github.event.inputs.output_folder }} \ ${{ github.event.inputs.extras }} env: diff --git a/README.md b/README.md index 6b2e0ff..80b9de0 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,18 @@ In the future, these sources may also be added: For more information about how to configure and use the software, or about the data that is being collected check the resources below. +### Add new libraries +In order add new libraries, it is important to follow these steps to ensure that data is backfilled. +1. Update `config.yaml` with the new libraries (pypi project names only for now) +2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch. + - Use workflow from **your branch name**. + - List the project names you added + - Put '' for max_days to indicate you want all data + - Pass any extra arguments (for example `--dry-run` to test your changes) +3. Let the workflow finish and check that pypi.csv contains the right data. +4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days. + - The collection script looks at timestamps and avoids adding overlapping data. + ## Resources | | Document | Description | From e195d6e652d558833a32c50ab5f62b0abc0431fb Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:26:30 -0400 Subject: [PATCH 20/25] fix manual --- .github/workflows/manual.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml index aec4ec1..dd7840b 100644 --- a/.github/workflows/manual.yaml +++ b/.github/workflows/manual.yaml @@ -22,7 +22,7 @@ on: jobs: collect: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-large steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From b4aca6d9f9e219a73f4a63986e8be3df08c74de9 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:28:38 -0400 Subject: [PATCH 21/25] cleanup --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 80b9de0..e3242da 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,13 @@ collected check the resources below. In order add new libraries, it is important to follow these steps to ensure that data is backfilled. 1. Update `config.yaml` with the new libraries (pypi project names only for now) 2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch. - - Use workflow from **your branch name**. - - List the project names you added - - Put '' for max_days to indicate you want all data - - Pass any extra arguments (for example `--dry-run` to test your changes) + - Use workflow from **your branch name**. + - List the project names you added + - Put '' for max_days to indicate you want all data + - Pass any extra arguments (for example `--dry-run` to test your changes) 3. Let the workflow finish and check that pypi.csv contains the right data. -4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days. - - The collection script looks at timestamps and avoids adding overlapping data. +4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days and future days. + - Note: The collection script looks at timestamps and avoids adding overlapping data. ## Resources From e40da58caeab4053d596ef1c2fd6deb5e76abb11 Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:30:33 -0400 Subject: [PATCH 22/25] fix manual --- .github/workflows/manual.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml index dd7840b..7bee6fa 100644 --- a/.github/workflows/manual.yaml +++ b/.github/workflows/manual.yaml @@ -38,7 +38,7 @@ jobs: download-analytics collect \ --verbose \ --projects ${{ github.event.inputs.projects }} \ - ${{ inputs.max_days && '--max_days github.event.inputs.max_days' || ''}} + ${{ inputs.max_days && '--max_days ${{ github.event.inputs.max_days }}' || ''}} --output-folder gdrive://${{ github.event.inputs.output_folder }} \ ${{ github.event.inputs.extras }} env: From 70d339e4b764cab5f1381670b2c29e30fb12fd2a Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:34:37 -0400 Subject: [PATCH 23/25] fix manual --- .github/workflows/manual.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml index 7bee6fa..7a7d91b 100644 --- a/.github/workflows/manual.yaml +++ b/.github/workflows/manual.yaml @@ -38,7 +38,8 @@ jobs: download-analytics collect \ --verbose \ --projects ${{ github.event.inputs.projects }} \ - ${{ inputs.max_days && '--max_days ${{ github.event.inputs.max_days }}' || ''}} + ${{ github.event.inputs.max_days && '--max-days ' || '' }} \ + ${{ github.event.inputs.max_days && github.event.inputs.max_days || '' }} \ --output-folder gdrive://${{ github.event.inputs.output_folder }} \ ${{ github.event.inputs.extras }} env: From e0f728fd36306cc8411f660e4fecb38d23230dce Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:38:09 -0400 Subject: [PATCH 24/25] fix max_days --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3242da..7981a8f 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ In order add new libraries, it is important to follow these steps to ensure that 2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch. - Use workflow from **your branch name**. - List the project names you added - - Put '' for max_days to indicate you want all data + - Remove `7` from max days to indicate you want all data - Pass any extra arguments (for example `--dry-run` to test your changes) 3. Let the workflow finish and check that pypi.csv contains the right data. 4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days and future days. From b8b97416328ffc626b3a40b481d90bd7b220766b Mon Sep 17 00:00:00 2001 From: gsheni Date: Tue, 3 Jun 2025 15:38:20 -0400 Subject: [PATCH 25/25] fix docs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7981a8f..f649867 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ In order add new libraries, it is important to follow these steps to ensure that 1. Update `config.yaml` with the new libraries (pypi project names only for now) 2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch. - Use workflow from **your branch name**. - - List the project names you added + - List all project names from config.yaml - Remove `7` from max days to indicate you want all data - Pass any extra arguments (for example `--dry-run` to test your changes) 3. Let the workflow finish and check that pypi.csv contains the right data.