From 8b3b77342fce626ce49cafed4e039982c750fb8c Mon Sep 17 00:00:00 2001 From: priyanka <21082240+priya-gitTest@users.noreply.github.com> Date: Thu, 5 Mar 2026 20:48:09 +0100 Subject: [PATCH 1/5] fix: gracefully handle GitHub archive download failure instead of sys.exit() When neither the primary archive URL nor the main.zip fallback returns HTTP 200, download_github_files() was calling sys.exit() which crashed the entire SOMEF process. Replace with logging.error() + return None, consistent with how download_readme() handles the same situation. Fixes #909. Co-Authored-By: Claude Sonnet 4.6 --- src/somef/process_repository.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 2258e008..81d75c10 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -763,7 +763,8 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): return None if repo_download.status_code != 200: - sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}") + logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") + return None repo_zip = repo_download.content From de4de6e33e20aeb4c6bc8d15b5061de2114680a5 Mon Sep 17 00:00:00 2001 From: priyanka <21082240+priya-gitTest@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:04:29 +0100 Subject: [PATCH 2/5] fix: smart archive fallback for HTTP 300 + fix socket leak in rate_limit_get MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Root cause of issue #909 (balaje/icefem) GitHub's short-form archive URL `/archive/{ref}.zip` returns HTTP 300 (Multiple Choices) when the ref name is ambiguous — i.e. when a branch and a tag share the same name (e.g. default branch `v2.0` and a tag also named `v2.0`). 
The previous code only handled HTTP 404 with a single `main.zip` fallback, so a 300 response bypassed the fallback and hit `sys.exit()`.

## Changes

### download_github_files — smarter fallback chain

Instead of one hard-coded `main.zip` fallback, we now try four candidate URLs in order until one returns HTTP 200:

1. `/archive/{ref}.zip` — short form, works for unambiguous refs
2. `/archive/refs/heads/{ref}.zip` — explicit branch (resolves HTTP 300)
3. `/archive/refs/tags/{ref}.zip` — explicit tag (resolves HTTP 300)
4. `/archive/main.zip` — legacy fallback for renamed branches

Any non-200 status (300, 404, …) moves to the next candidate. A `None` response from `rate_limit_get` (for example, when the size limit is exceeded) aborts the chain immediately, because retrying other URLs for the same archive would not help. If all four fail, we log an error and return None instead of calling sys.exit().

### rate_limit_get — fix streaming socket leak

The size-check before download was using `requests.get(..., stream=True)` to fetch just the Content-Length header. A streaming GET opens a full TCP connection and begins transferring the response body; if the stream is never read to completion and never explicitly closed, the underlying socket is held open (leaked). Replaced with `requests.head(...)` + `head_response.close()` — HEAD retrieves only headers without a body, which is exactly what is needed here. Also removed a stray `print()` debug statement.
Fixes #909 Co-Authored-By: Claude Sonnet 4.6 --- src/somef/process_repository.py | 87 +++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 81d75c10..af460f1e 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -50,25 +50,29 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const parsed = urlparse(url) is_api_request = "api.github.com" in parsed.netloc content_length = None - # just verify size if NOT is a request to api.github.com + # Check file size before downloading the full body (skip for GitHub API requests, + # which are always small JSON payloads). if not is_api_request: try: - head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs) + # Use a proper HEAD request to read only the response headers. + # Previously this used requests.get(..., stream=True) which opens a full + # TCP connection and starts the response stream but never closes it — + # leaking a socket for every archive we inspect. HEAD is the correct + # tool here: it retrieves headers without downloading the body. + head_response = requests.head(url, allow_redirects=True, **kwargs) + head_response.close() # release the connection back to the pool immediately content_length = head_response.headers.get("Content-Length") if content_length is not None: size_bytes = int(content_length) - print(f"HEAD Content-Length: {size_bytes}") if size_bytes > size_limit_bytes: logging.warning( f"Download size {size_bytes} bytes exceeds limit of {size_limit_bytes} bytes. Skipping download." ) return None, None else: - # logging.warning(f"Could not determine file size for {url}. Skipping download.") - # return None, None logging.warning(f"No Content-Length header for {url}. Proceeding with download anyway (unable to estimate size).") except Exception as e: - logging.warning(f"HEAD/stream request failed: {e}. 
Continuing with GET...") + logging.warning(f"HEAD request failed: {e}. Continuing with GET...") rate_limited = True date = "" @@ -731,10 +735,28 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe def download_github_files(directory, owner, repo_name, repo_ref, authorization): """ - Download all repository files from a GitHub repository + Download all repository files from a GitHub repository. + + GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple + Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the + same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called + ``v2.0``). In that case we must use the fully-qualified ref URLs: + - ``/archive/refs/heads/{ref}.zip`` (explicit branch) + - ``/archive/refs/tags/{ref}.zip`` (explicit tag) + + We also keep the legacy ``main.zip`` fallback for repositories that renamed their + default branch to ``main`` after being created with ``master`` (or vice-versa) so + that the GitHub API default_branch value is momentarily stale. + + Fallback order tried in sequence until one returns HTTP 200: + 1. ``/archive/{ref}.zip`` — short form, works for unambiguous refs + 2. ``/archive/refs/heads/{ref}.zip`` — unambiguous branch (fixes HTTP 300) + 3. ``/archive/refs/tags/{ref}.zip`` — unambiguous tag (fixes HTTP 300) + 4. ``/archive/main.zip`` — legacy branch-rename fallback + Parameters ---------- - repo_ref: link to branch of the repo + repo_ref: default branch (or tag) returned by the GitHub API repo_name: name of the repo owner: GitHub owner directory: directory where to extract all downloaded files @@ -742,28 +764,43 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): Returns ------- - path to the folder where all files have been downloaded + Path to the folder where all files have been downloaded, or None on failure. 
""" - # download the repo at the selected branch with the link - repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip" - logging.info(f"Downloading {repo_archive_url}") - repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) + # Candidate archive URLs tried in order. We start with the short form because it + # works for the vast majority of repos and avoids an extra HTTP round-trip. When + # that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the + # fully-qualified refs/heads/ and refs/tags/ forms before falling back to main. + candidate_urls = [ + f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/refs/tags/{repo_ref}.zip", + f"https://github.com/{owner}/{repo_name}/archive/main.zip", + ] - if repo_download is None: - logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") - return None - - if repo_download.status_code == 404: - logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") - repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip" - logging.info(f"Trying to download {repo_archive_url}") + repo_download = None + repo_archive_url = None + for repo_archive_url in candidate_urls: + logging.info(f"Downloading {repo_archive_url}") repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) if repo_download is None: - logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.") + # Size limit exceeded or streaming error — no point trying other URLs + logging.warning( + f"Repository archive skipped due to size limit: " + f"{constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length." 
+ ) return None - - if repo_download.status_code != 200: - logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") + if repo_download.status_code == 200: + break + logging.warning( + f"Archive URL {repo_archive_url} returned HTTP {repo_download.status_code}, " + f"trying next fallback..." + ) + + if repo_download is None or repo_download.status_code != 200: + logging.error( + f"All archive download attempts failed for {owner}/{repo_name} " + f"(last status: {getattr(repo_download, 'status_code', 'N/A')})" + ) return None repo_zip = repo_download.content From aa1dcb91faa642072078b4373a8bf6c58fc78738 Mon Sep 17 00:00:00 2001 From: priyanka <21082240+priya-gitTest@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:08:47 +0100 Subject: [PATCH 3/5] test: add unit tests for issue #909 archive fallback and socket leak fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestIssue909ArchiveFallback: - HTTP 300 on short URL falls back to refs/heads/ (the balaje/icefem case) - HTTP 300 → refs/heads/ 404 → falls back to refs/tags/ - HTTP 404 on all ref URLs reaches the legacy main.zip fallback - All fallbacks failing returns None instead of calling sys.exit() - Size-limit None response stops the fallback loop immediately TestRateLimitGetHeadRequest: - Verifies requests.head() is now used (not streaming GET) for size check - Verifies HEAD response is explicitly closed even on size-exceeded early return Co-Authored-By: Claude Sonnet 4.6 --- src/somef/test/test_process_repository.py | 173 +++++++++++++++++++++- 1 file changed, 172 insertions(+), 1 deletion(-) diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index dbe477a9..f2eba791 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -1,7 +1,10 @@ +import io import os import tempfile import unittest +import zipfile from pathlib import Path +from 
unittest.mock import MagicMock, patch, call from .. import process_repository, process_files, somef_cli from ..utils import constants @@ -207,4 +210,172 @@ def test_issue_611(self): github_data = Result() text, github_data = process_files.process_repository_files(test_data_repositories + "termlex-main", github_data, constants.RepositoryType.LOCAL) - assert len(github_data.results[constants.CAT_ONTOLOGIES]) >= 1 \ No newline at end of file + assert len(github_data.results[constants.CAT_ONTOLOGIES]) >= 1 + + +def _make_mock_response(status_code, content=b""): + """Helper: create a minimal mock requests.Response.""" + resp = MagicMock() + resp.status_code = status_code + resp.content = content + resp.headers = {} + return resp + + +def _make_zip_bytes(inner_dir="owner_repo"): + """Helper: build a minimal valid zip archive containing one file.""" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr(f"{inner_dir}/README.md", "# Test") + return buf.getvalue() + + +class TestIssue909ArchiveFallback(unittest.TestCase): + """ + Tests for download_github_files HTTP-300 fallback chain (issue #909). + + GitHub returns HTTP 300 (Multiple Choices) when the short-form archive URL + /archive/{ref}.zip is ambiguous — i.e. a branch and a tag share the same + name. The fix adds a cascade of unambiguous fallback URLs so SOMEF can + still download the archive instead of crashing. + """ + + @patch("somef.process_repository.rate_limit_get") + def test_http_300_falls_back_to_refs_heads(self, mock_rlg): + """ + HTTP 300 on the short-form URL must trigger the refs/heads/ fallback. + Scenario: balaje/icefem whose default branch 'v2.0' is also a tag name. 
+ """ + zip_bytes = _make_zip_bytes("balaje_icefem") + mock_rlg.side_effect = [ + (_make_mock_response(300), ""), # /archive/v2.0.zip → 300 + (_make_mock_response(200, zip_bytes), ""), # refs/heads/v2.0.zip → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "balaje", "icefem", "v2.0", None) + + self.assertIsNotNone(result, "Should succeed via refs/heads/ fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("archive/v2.0.zip", urls_tried[0]) + self.assertIn("refs/heads/v2.0.zip", urls_tried[1]) + + @patch("somef.process_repository.rate_limit_get") + def test_http_300_falls_back_to_refs_tags(self, mock_rlg): + """ + When refs/heads/ also fails, refs/tags/ must be tried next. + """ + zip_bytes = _make_zip_bytes("owner_repo") + mock_rlg.side_effect = [ + (_make_mock_response(300), ""), # short form → 300 + (_make_mock_response(404), ""), # refs/heads/ → 404 + (_make_mock_response(200, zip_bytes), ""), # refs/tags/ → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "v1.0", None) + + self.assertIsNotNone(result, "Should succeed via refs/tags/ fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("refs/tags/v1.0.zip", urls_tried[2]) + + @patch("somef.process_repository.rate_limit_get") + def test_http_404_falls_back_to_main(self, mock_rlg): + """ + HTTP 404 on all ref-specific URLs must reach the legacy main.zip fallback. 
+ """ + zip_bytes = _make_zip_bytes("owner_repo") + mock_rlg.side_effect = [ + (_make_mock_response(404), ""), # short form + (_make_mock_response(404), ""), # refs/heads/ + (_make_mock_response(404), ""), # refs/tags/ + (_make_mock_response(200, zip_bytes), ""), # main.zip → 200 + ] + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "oldmaster", None) + + self.assertIsNotNone(result, "Should succeed via main.zip fallback") + urls_tried = [c[0][0] for c in mock_rlg.call_args_list] + self.assertIn("archive/main.zip", urls_tried[-1]) + + @patch("somef.process_repository.rate_limit_get") + def test_all_fallbacks_fail_returns_none_not_exit(self, mock_rlg): + """ + When all four candidate URLs fail, download_github_files must return None + instead of calling sys.exit() (which would crash the whole process). + """ + mock_rlg.return_value = (_make_mock_response(404), "") + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "branch", None) + + self.assertIsNone(result) + # All four candidates should have been attempted + self.assertEqual(mock_rlg.call_count, 4) + + @patch("somef.process_repository.rate_limit_get") + def test_size_limit_stops_loop_immediately(self, mock_rlg): + """ + When rate_limit_get returns None (size limit exceeded), the fallback loop + must stop immediately — there is no point retrying other URLs for the same + oversized archive. + """ + mock_rlg.return_value = (None, None) + with tempfile.TemporaryDirectory() as tmp: + result = process_repository.download_github_files(tmp, "owner", "repo", "main", None) + + self.assertIsNone(result) + self.assertEqual(mock_rlg.call_count, 1, "Should stop after first None response") + + +class TestRateLimitGetHeadRequest(unittest.TestCase): + """ + Tests for the socket-leak fix in rate_limit_get (issue #909 follow-up). 
+ + The previous implementation used requests.get(..., stream=True) to inspect + the Content-Length header before downloading. A streaming GET opens a full + TCP connection whose socket is never released if the stream body is never + read and the response is never closed. The fix uses requests.head() instead, + which retrieves headers only and whose connection is explicitly closed. + """ + + @patch("somef.process_repository.requests.get") + @patch("somef.process_repository.requests.head") + def test_head_used_instead_of_streaming_get(self, mock_head, mock_get): + """rate_limit_get must call requests.head() (not streaming GET) for size check.""" + head_resp = MagicMock() + head_resp.headers = {"Content-Length": "1024"} + head_resp.close = MagicMock() + mock_head.return_value = head_resp + + get_resp = MagicMock() + get_resp.status_code = 200 + get_resp.headers = {} + # Simulate a small non-streaming response + get_resp.iter_content = MagicMock(return_value=iter([b"data"])) + mock_get.return_value = get_resp + + process_repository.rate_limit_get( + "https://github.com/owner/repo/archive/main.zip" + ) + + mock_head.assert_called_once() + head_resp.close.assert_called_once() + + @patch("somef.process_repository.requests.get") + @patch("somef.process_repository.requests.head") + def test_head_response_closed_on_size_exceeded(self, mock_head, mock_get): + """ + The HEAD response must be closed even when the size check triggers an + early return — otherwise the connection stays open in the pool indefinitely. 
+ """ + oversized = (constants.SIZE_DOWNLOAD_LIMIT_MB + 1) * 1024 * 1024 + head_resp = MagicMock() + head_resp.headers = {"Content-Length": str(oversized)} + head_resp.close = MagicMock() + mock_head.return_value = head_resp + + result, _ = process_repository.rate_limit_get( + "https://github.com/owner/repo/archive/main.zip" + ) + + self.assertIsNone(result) + head_resp.close.assert_called_once() + mock_get.assert_not_called() # full GET should never be issued \ No newline at end of file From a6fa2806bfa73179a20b4dcd01580c62269b1048 Mon Sep 17 00:00:00 2001 From: priyanka <21082240+priya-gitTest@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:10:28 +0100 Subject: [PATCH 4/5] fix: replace inplace=True pandas calls in header_analysis to silence FutureWarnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pandas 3.0 will remove support for inplace= on chained operations. Three active call sites were still using the deprecated pattern: df.dropna(subset=['Content'], inplace=True) → df = df.dropna(subset=['Content']) df.drop(columns=['ParentGroup'], inplace=True) → df = df.drop(columns=['ParentGroup']) valid.rename(columns={...}, inplace=True) → valid = valid.rename(columns={...}) Co-Authored-By: Claude Sonnet 4.6 --- src/somef/header_analysis.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py index 4a4088dd..d1615d2b 100644 --- a/src/somef/header_analysis.py +++ b/src/somef/header_analysis.py @@ -147,7 +147,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]: # df['Content'].replace('', np.nan, inplace=True) df['Content'] = df['Content'].replace('', np.nan) - df.dropna(subset=['Content'], inplace=True) + df = df.dropna(subset=['Content']) return df, none_header_content @@ -375,7 +375,7 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res df['ParentGroup'] = 
df['ParentHeader'].fillna('').map(label_text) df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup'] - df.drop(columns=['ParentGroup'], inplace=True) + df = df.drop(columns=['ParentGroup']) if not df.iloc[0]['Group']: df.loc[df.index[0], 'Group'] = ['unknown'] @@ -384,11 +384,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res df.loc[df['Group'] == 'unknown', 'Group'] = np.nan valid = df[df['Group'].notna()].copy() - valid.rename(columns={ + valid = valid.rename(columns={ 'Content': constants.PROP_VALUE, 'Header': constants.PROP_ORIGINAL_HEADER, 'ParentHeader': constants.PROP_PARENT_HEADER, - }, inplace=True) + }) source = None if constants.CAT_README_URL in repository_metadata.results: From b1fcde8541c19caeb7afc4e7f754c28a3863bcd8 Mon Sep 17 00:00:00 2001 From: priyanka <21082240+priya-gitTest@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:47:11 +0100 Subject: [PATCH 5/5] fix: guard None returns from rate_limit_get and remove debug leftovers - Add None check in get_all_paginated_results before accessing response.status_code - Add None check for languages_raw before calling .json() in load_online_repository_metadata - Add timeout to requests.head() in rate_limit_get to prevent indefinite hangs - Remove leftover `import pdb` from extract_software_type.py - Replace debug print() calls in check_static_websites with logging.warning() These fixes address crashes when rate_limit_get returns None (e.g. size limit exceeded or network error) and eliminate stdout pollution from debug prints that could corrupt JSON output when SOMEF is called programmatically. 
Co-Authored-By: Claude Sonnet 4.6 --- src/somef/extract_software_type.py | 8 ++------ src/somef/process_repository.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/somef/extract_software_type.py b/src/somef/extract_software_type.py index 43dc695d..9fa335b2 100644 --- a/src/somef/extract_software_type.py +++ b/src/somef/extract_software_type.py @@ -1,3 +1,4 @@ +import logging import os from pathlib import Path import nbformat @@ -9,8 +10,6 @@ from .utils import constants from .extract_ontologies import is_file_ontology -import pdb - def check_repository_type(path_repo, title, metadata_result: Result): """ Function that adds the metadata result in the JSON @@ -200,18 +199,15 @@ def check_static_websites(path_repo, repo_metadata: Result): return False try: languages = repo_metadata[constants.CAT_PROGRAMMING_LANGUAGES] - print(languages) for language in languages: language_name = language[constants.PROP_RESULT][constants.PROP_NAME] - print(language_name) if language_name.lower() == "javascript": js_size += language[constants.PROP_RESULT][constants.PROP_SIZE] - print(js_size) elif language_name.lower() == "scss" or language_name.lower() == "css": css_size += language[constants.PROP_RESULT][constants.PROP_SIZE] total_size += language[constants.PROP_RESULT][constants.PROP_SIZE] except Exception as e: - print(e) + logging.warning(f"Could not retrieve programming languages for static website check: {e}") if html_file > 0: if js_size > 0 and css_size == 0: if js_size / total_size < 0.91: diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index af460f1e..91f0a0e0 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -59,7 +59,8 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const # TCP connection and starts the response stream but never closes it — # leaking a socket for every archive we inspect. 
HEAD is the correct # tool here: it retrieves headers without downloading the body. - head_response = requests.head(url, allow_redirects=True, **kwargs) + head_response = requests.head(url, allow_redirects=True, + timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs) head_response.close() # release the connection back to the pool immediately content_length = head_response.headers.get("Content-Length") if content_length is not None: @@ -612,8 +613,12 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, # get languages if not ignore_api_metadata: languages_raw, date = rate_limit_get(filtered_resp['languages_url'], headers=header) - - languages = languages_raw.json() + + if languages_raw is None: + logging.warning("Skipping languages: rate_limit_get returned None (size limit or network error)") + languages = {} + else: + languages = languages_raw.json() if "message" in languages: logging.error("Error while retrieving languages: " + languages["message"]) else: @@ -980,6 +985,10 @@ def get_all_paginated_results(base_url, headers, per_page=100): url = f"{base_url}?per_page={per_page}&page={page}" response, _ = rate_limit_get(url, headers=headers) + if response is None: + logging.warning(f"Skipping page {page}: rate_limit_get returned None (size limit or network error)") + break + if response.status_code != 200: logging.warning(f"GitHub API error on page {page}: {response.status_code}") break