From 8b3b77342fce626ce49cafed4e039982c750fb8c Mon Sep 17 00:00:00 2001
From: priyanka <21082240+priya-gitTest@users.noreply.github.com>
Date: Thu, 5 Mar 2026 20:48:09 +0100
Subject: [PATCH 1/5] fix: gracefully handle GitHub archive download failure
 instead of sys.exit()

When neither the primary archive URL nor the main.zip fallback returns HTTP 200,
download_github_files() was calling sys.exit() which crashed the entire SOMEF process.
Replace with logging.error() + return None, consistent with how download_readme() handles
the same situation. Fixes #909.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/somef/process_repository.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
index 2258e008..81d75c10 100644
--- a/src/somef/process_repository.py
+++ b/src/somef/process_repository.py
@@ -763,7 +763,8 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
             return None
         
     if repo_download.status_code != 200:
-        sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
+        logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
+        return None
 
     repo_zip = repo_download.content
 

From de4de6e33e20aeb4c6bc8d15b5061de2114680a5 Mon Sep 17 00:00:00 2001
From: priyanka <21082240+priya-gitTest@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:04:29 +0100
Subject: [PATCH 2/5] fix: smart archive fallback for HTTP 300 + fix socket
 leak in rate_limit_get
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Root cause of issue #909 (balaje/icefem)

GitHub's short-form archive URL `/archive/{ref}.zip` returns HTTP 300
(Multiple Choices) when the ref name is ambiguous — i.e. when a branch and
a tag share the same name (e.g. default branch `v2.0` and a tag also named
`v2.0`).  The previous code only handled HTTP 404 with a single `main.zip`
fallback, so a 300 response bypassed the fallback and hit `sys.exit()`.

## Changes

### download_github_files — smarter fallback chain
Instead of one hard-coded `main.zip` fallback, we now try four candidate
URLs in order until one returns HTTP 200:
  1. /archive/{ref}.zip              — short form, works for unambiguous refs
  2. /archive/refs/heads/{ref}.zip   — explicit branch (resolves HTTP 300)
  3. /archive/refs/tags/{ref}.zip    — explicit tag    (resolves HTTP 300)
  4. /archive/main.zip               — legacy fallback for renamed branches

Any non-200 status (300, 404, …) moves to the next candidate.  If all four
fail, we log an error and return None instead of calling sys.exit().

### rate_limit_get — fix streaming socket leak
The size-check before download was using `requests.get(..., stream=True)` to
fetch just the Content-Length header.  A streaming GET opens a full TCP
connection and begins transferring the response body; if the stream is never
read to completion and never explicitly closed, the underlying socket is held
open (leaked).  Replaced with `requests.head(...)` + `response.close()` —
HEAD retrieves only headers without a body, which is exactly what is needed
here.  Also removed a stray `print()` debug statement.

Fixes #909

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/somef/process_repository.py | 87 +++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
index 81d75c10..af460f1e 100644
--- a/src/somef/process_repository.py
+++ b/src/somef/process_repository.py
@@ -50,25 +50,29 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
     parsed = urlparse(url)
     is_api_request = "api.github.com" in parsed.netloc
     content_length = None
-    # just verify size if NOT is a request to api.github.com
+    # Check file size before downloading the full body (skip for GitHub API requests,
+    # which are always small JSON payloads).
     if not is_api_request:
         try:
-            head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs)
+            # Use a proper HEAD request to read only the response headers.
+            # Previously this used requests.get(..., stream=True) which opens a full
+            # TCP connection and starts the response stream but never closes it —
+            # leaking a socket for every archive we inspect.  HEAD is the correct
+            # tool here: it retrieves headers without downloading the body.
+            head_response = requests.head(url, allow_redirects=True, **kwargs)
+            head_response.close()  # release the connection back to the pool immediately
             content_length = head_response.headers.get("Content-Length")
             if content_length is not None:
                 size_bytes = int(content_length)
-                print(f"HEAD Content-Length: {size_bytes}")
                 if size_bytes > size_limit_bytes:
                     logging.warning(
                         f"Download size {size_bytes} bytes exceeds limit of {size_limit_bytes} bytes. Skipping download."
                     )
                     return None, None
             else:
-                # logging.warning(f"Could not determine file size for {url}. Skipping download.")
-                # return None, None
                 logging.warning(f"No Content-Length header for {url}. Proceeding with download anyway (unable to estimate size).")
         except Exception as e:
-            logging.warning(f"HEAD/stream request failed: {e}. Continuing with GET...")
+            logging.warning(f"HEAD request failed: {e}. Continuing with GET...")
 
     rate_limited = True
     date = ""
@@ -731,10 +735,28 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe
 
 def download_github_files(directory, owner, repo_name, repo_ref, authorization):
     """
-    Download all repository files from a GitHub repository
+    Download all repository files from a GitHub repository.
+
+    GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple
+    Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the
+    same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called
+    ``v2.0``).  In that case we must use the fully-qualified ref URLs:
+      - ``/archive/refs/heads/{ref}.zip``  (explicit branch)
+      - ``/archive/refs/tags/{ref}.zip``   (explicit tag)
+
+    We also keep the legacy ``main.zip`` fallback for repositories that renamed their
+    default branch to ``main`` after being created with ``master`` (or vice-versa) so
+    that the GitHub API default_branch value is momentarily stale.
+
+    Fallback order tried in sequence until one returns HTTP 200:
+      1. ``/archive/{ref}.zip``              — short form, works for unambiguous refs
+      2. ``/archive/refs/heads/{ref}.zip``   — unambiguous branch (fixes HTTP 300)
+      3. ``/archive/refs/tags/{ref}.zip``    — unambiguous tag  (fixes HTTP 300)
+      4. ``/archive/main.zip``               — legacy branch-rename fallback
+
     Parameters
     ----------
-    repo_ref: link to branch of the repo
+    repo_ref: default branch (or tag) returned by the GitHub API
     repo_name: name of the repo
     owner: GitHub owner
     directory: directory where to extract all downloaded files
@@ -742,28 +764,43 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
 
     Returns
     -------
-    path to the folder where all files have been downloaded
+    Path to the folder where all files have been downloaded, or None on failure.
     """
-    # download the repo at the selected branch with the link
-    repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
-    logging.info(f"Downloading {repo_archive_url}")
-    repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
+    # Candidate archive URLs tried in order.  We start with the short form because it
+    # works for the vast majority of repos and avoids an extra HTTP round-trip.  When
+    # that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the
+    # fully-qualified refs/heads/ and refs/tags/ forms before falling back to main.
+    candidate_urls = [
+        f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip",
+        f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip",
+        f"https://github.com/{owner}/{repo_name}/archive/refs/tags/{repo_ref}.zip",
+        f"https://github.com/{owner}/{repo_name}/archive/main.zip",
+    ]
 
-    if repo_download is None:
-        logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
-        return None
-    
-    if repo_download.status_code == 404:
-        logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
-        repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
-        logging.info(f"Trying to download {repo_archive_url}")
+    repo_download = None
+    repo_archive_url = None
+    for repo_archive_url in candidate_urls:
+        logging.info(f"Downloading {repo_archive_url}")
         repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
         if repo_download is None:
-            logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
+            # Size limit exceeded or streaming error — no point trying other URLs
+            logging.warning(
+                f"Repository archive skipped due to size limit: "
+                f"{constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length."
+            )
             return None
-        
-    if repo_download.status_code != 200:
-        logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
+        if repo_download.status_code == 200:
+            break
+        logging.warning(
+            f"Archive URL {repo_archive_url} returned HTTP {repo_download.status_code}, "
+            f"trying next fallback..."
+        )
+
+    if repo_download is None or repo_download.status_code != 200:
+        logging.error(
+            f"All archive download attempts failed for {owner}/{repo_name} "
+            f"(last status: {getattr(repo_download, 'status_code', 'N/A')})"
+        )
         return None
 
     repo_zip = repo_download.content

From aa1dcb91faa642072078b4373a8bf6c58fc78738 Mon Sep 17 00:00:00 2001
From: priyanka <21082240+priya-gitTest@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:08:47 +0100
Subject: [PATCH 3/5] test: add unit tests for issue #909 archive fallback and
 socket leak fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TestIssue909ArchiveFallback:
- HTTP 300 on short URL falls back to refs/heads/ (the balaje/icefem case)
- HTTP 300 → refs/heads/ 404 → falls back to refs/tags/
- HTTP 404 on all ref URLs reaches the legacy main.zip fallback
- All fallbacks failing returns None instead of calling sys.exit()
- Size-limit None response stops the fallback loop immediately

TestRateLimitGetHeadRequest:
- Verifies requests.head() is now used (not streaming GET) for size check
- Verifies HEAD response is explicitly closed even on size-exceeded early return

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/somef/test/test_process_repository.py | 173 +++++++++++++++++++++-
 1 file changed, 172 insertions(+), 1 deletion(-)

diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py
index dbe477a9..f2eba791 100644
--- a/src/somef/test/test_process_repository.py
+++ b/src/somef/test/test_process_repository.py
@@ -1,7 +1,10 @@
+import io
 import os
 import tempfile
 import unittest
+import zipfile
 from pathlib import Path
+from unittest.mock import MagicMock, patch, call
 
 from .. import process_repository, process_files, somef_cli
 from ..utils import constants
@@ -207,4 +210,172 @@ def test_issue_611(self):
         github_data = Result()
         text, github_data = process_files.process_repository_files(test_data_repositories + "termlex-main", github_data,
                                                                    constants.RepositoryType.LOCAL)
-        assert len(github_data.results[constants.CAT_ONTOLOGIES]) >= 1
\ No newline at end of file
+        assert len(github_data.results[constants.CAT_ONTOLOGIES]) >= 1
+
+
+def _make_mock_response(status_code, content=b""):
+    """Helper: create a minimal mock requests.Response."""
+    resp = MagicMock()
+    resp.status_code = status_code
+    resp.content = content
+    resp.headers = {}
+    return resp
+
+
+def _make_zip_bytes(inner_dir="owner_repo"):
+    """Helper: build a minimal valid zip archive containing one file."""
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr(f"{inner_dir}/README.md", "# Test")
+    return buf.getvalue()
+
+
+class TestIssue909ArchiveFallback(unittest.TestCase):
+    """
+    Tests for download_github_files HTTP-300 fallback chain (issue #909).
+
+    GitHub returns HTTP 300 (Multiple Choices) when the short-form archive URL
+    /archive/{ref}.zip is ambiguous — i.e. a branch and a tag share the same
+    name.  The fix adds a cascade of unambiguous fallback URLs so SOMEF can
+    still download the archive instead of crashing.
+    """
+
+    @patch("somef.process_repository.rate_limit_get")
+    def test_http_300_falls_back_to_refs_heads(self, mock_rlg):
+        """
+        HTTP 300 on the short-form URL must trigger the refs/heads/ fallback.
+        Scenario: balaje/icefem whose default branch 'v2.0' is also a tag name.
+        """
+        zip_bytes = _make_zip_bytes("balaje_icefem")
+        mock_rlg.side_effect = [
+            (_make_mock_response(300), ""),                  # /archive/v2.0.zip       → 300
+            (_make_mock_response(200, zip_bytes), ""),       # refs/heads/v2.0.zip     → 200
+        ]
+        with tempfile.TemporaryDirectory() as tmp:
+            result = process_repository.download_github_files(tmp, "balaje", "icefem", "v2.0", None)
+
+        self.assertIsNotNone(result, "Should succeed via refs/heads/ fallback")
+        urls_tried = [c[0][0] for c in mock_rlg.call_args_list]
+        self.assertIn("archive/v2.0.zip", urls_tried[0])
+        self.assertIn("refs/heads/v2.0.zip", urls_tried[1])
+
+    @patch("somef.process_repository.rate_limit_get")
+    def test_http_300_falls_back_to_refs_tags(self, mock_rlg):
+        """
+        When refs/heads/ also fails, refs/tags/ must be tried next.
+        """
+        zip_bytes = _make_zip_bytes("owner_repo")
+        mock_rlg.side_effect = [
+            (_make_mock_response(300), ""),                  # short form        → 300
+            (_make_mock_response(404), ""),                  # refs/heads/       → 404
+            (_make_mock_response(200, zip_bytes), ""),       # refs/tags/        → 200
+        ]
+        with tempfile.TemporaryDirectory() as tmp:
+            result = process_repository.download_github_files(tmp, "owner", "repo", "v1.0", None)
+
+        self.assertIsNotNone(result, "Should succeed via refs/tags/ fallback")
+        urls_tried = [c[0][0] for c in mock_rlg.call_args_list]
+        self.assertIn("refs/tags/v1.0.zip", urls_tried[2])
+
+    @patch("somef.process_repository.rate_limit_get")
+    def test_http_404_falls_back_to_main(self, mock_rlg):
+        """
+        HTTP 404 on all ref-specific URLs must reach the legacy main.zip fallback.
+        """
+        zip_bytes = _make_zip_bytes("owner_repo")
+        mock_rlg.side_effect = [
+            (_make_mock_response(404), ""),  # short form
+            (_make_mock_response(404), ""),  # refs/heads/
+            (_make_mock_response(404), ""),  # refs/tags/
+            (_make_mock_response(200, zip_bytes), ""),  # main.zip → 200
+        ]
+        with tempfile.TemporaryDirectory() as tmp:
+            result = process_repository.download_github_files(tmp, "owner", "repo", "oldmaster", None)
+
+        self.assertIsNotNone(result, "Should succeed via main.zip fallback")
+        urls_tried = [c[0][0] for c in mock_rlg.call_args_list]
+        self.assertIn("archive/main.zip", urls_tried[-1])
+
+    @patch("somef.process_repository.rate_limit_get")
+    def test_all_fallbacks_fail_returns_none_not_exit(self, mock_rlg):
+        """
+        When all four candidate URLs fail, download_github_files must return None
+        instead of calling sys.exit() (which would crash the whole process).
+        """
+        mock_rlg.return_value = (_make_mock_response(404), "")
+        with tempfile.TemporaryDirectory() as tmp:
+            result = process_repository.download_github_files(tmp, "owner", "repo", "branch", None)
+
+        self.assertIsNone(result)
+        # All four candidates should have been attempted
+        self.assertEqual(mock_rlg.call_count, 4)
+
+    @patch("somef.process_repository.rate_limit_get")
+    def test_size_limit_stops_loop_immediately(self, mock_rlg):
+        """
+        When rate_limit_get returns None (size limit exceeded), the fallback loop
+        must stop immediately — there is no point retrying other URLs for the same
+        oversized archive.
+        """
+        mock_rlg.return_value = (None, None)
+        with tempfile.TemporaryDirectory() as tmp:
+            result = process_repository.download_github_files(tmp, "owner", "repo", "main", None)
+
+        self.assertIsNone(result)
+        self.assertEqual(mock_rlg.call_count, 1, "Should stop after first None response")
+
+
+class TestRateLimitGetHeadRequest(unittest.TestCase):
+    """
+    Tests for the socket-leak fix in rate_limit_get (issue #909 follow-up).
+
+    The previous implementation used requests.get(..., stream=True) to inspect
+    the Content-Length header before downloading.  A streaming GET opens a full
+    TCP connection whose socket is never released if the stream body is never
+    read and the response is never closed.  The fix uses requests.head() instead,
+    which retrieves headers only and whose connection is explicitly closed.
+    """
+
+    @patch("somef.process_repository.requests.get")
+    @patch("somef.process_repository.requests.head")
+    def test_head_used_instead_of_streaming_get(self, mock_head, mock_get):
+        """rate_limit_get must call requests.head() (not streaming GET) for size check."""
+        head_resp = MagicMock()
+        head_resp.headers = {"Content-Length": "1024"}
+        head_resp.close = MagicMock()
+        mock_head.return_value = head_resp
+
+        get_resp = MagicMock()
+        get_resp.status_code = 200
+        get_resp.headers = {}
+        # Simulate a small non-streaming response
+        get_resp.iter_content = MagicMock(return_value=iter([b"data"]))
+        mock_get.return_value = get_resp
+
+        process_repository.rate_limit_get(
+            "https://github.com/owner/repo/archive/main.zip"
+        )
+
+        mock_head.assert_called_once()
+        head_resp.close.assert_called_once()
+
+    @patch("somef.process_repository.requests.get")
+    @patch("somef.process_repository.requests.head")
+    def test_head_response_closed_on_size_exceeded(self, mock_head, mock_get):
+        """
+        The HEAD response must be closed even when the size check triggers an
+        early return — otherwise the connection stays open in the pool indefinitely.
+        """
+        oversized = (constants.SIZE_DOWNLOAD_LIMIT_MB + 1) * 1024 * 1024
+        head_resp = MagicMock()
+        head_resp.headers = {"Content-Length": str(oversized)}
+        head_resp.close = MagicMock()
+        mock_head.return_value = head_resp
+
+        result, _ = process_repository.rate_limit_get(
+            "https://github.com/owner/repo/archive/main.zip"
+        )
+
+        self.assertIsNone(result)
+        head_resp.close.assert_called_once()
+        mock_get.assert_not_called()  # full GET should never be issued
\ No newline at end of file

From a6fa2806bfa73179a20b4dcd01580c62269b1048 Mon Sep 17 00:00:00 2001
From: priyanka <21082240+priya-gitTest@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:10:28 +0100
Subject: [PATCH 4/5] fix: replace inplace=True pandas calls in header_analysis
 to silence FutureWarnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pandas 3.0 will remove support for inplace= on chained operations.
Three active call sites were still using the deprecated pattern:

  df.dropna(subset=['Content'], inplace=True)
      → df = df.dropna(subset=['Content'])

  df.drop(columns=['ParentGroup'], inplace=True)
      → df = df.drop(columns=['ParentGroup'])

  valid.rename(columns={...}, inplace=True)
      → valid = valid.rename(columns={...})

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/somef/header_analysis.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
index 4a4088dd..d1615d2b 100644
--- a/src/somef/header_analysis.py
+++ b/src/somef/header_analysis.py
@@ -147,7 +147,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:
 
     # df['Content'].replace('', np.nan, inplace=True)
     df['Content'] = df['Content'].replace('', np.nan)
-    df.dropna(subset=['Content'], inplace=True)
+    df = df.dropna(subset=['Content'])
 
     return df, none_header_content
 
@@ -375,7 +375,7 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
         df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)
 
         df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
-        df.drop(columns=['ParentGroup'], inplace=True)
+        df = df.drop(columns=['ParentGroup'])
 
         if not df.iloc[0]['Group']:
             df.loc[df.index[0], 'Group'] = ['unknown']
@@ -384,11 +384,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
         df.loc[df['Group'] == 'unknown', 'Group'] = np.nan
 
         valid = df[df['Group'].notna()].copy()
-        valid.rename(columns={
+        valid = valid.rename(columns={
             'Content': constants.PROP_VALUE,
             'Header': constants.PROP_ORIGINAL_HEADER,
             'ParentHeader': constants.PROP_PARENT_HEADER,
-        }, inplace=True)
+        })
 
         source = None
         if constants.CAT_README_URL in repository_metadata.results:

From b1fcde8541c19caeb7afc4e7f754c28a3863bcd8 Mon Sep 17 00:00:00 2001
From: priyanka <21082240+priya-gitTest@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:47:11 +0100
Subject: [PATCH 5/5] fix: guard None returns from rate_limit_get and remove
 debug leftovers

- Add None check in get_all_paginated_results before accessing response.status_code
- Add None check for languages_raw before calling .json() in load_online_repository_metadata
- Add timeout to requests.head() in rate_limit_get to prevent indefinite hangs
- Remove leftover `import pdb` from extract_software_type.py
- Replace debug print() calls in check_static_websites with logging.warning()

These fixes address crashes when rate_limit_get returns None (e.g. size limit
exceeded or network error) and eliminate stdout pollution from debug prints
that could corrupt JSON output when SOMEF is called programmatically.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/somef/extract_software_type.py |  8 ++------
 src/somef/process_repository.py    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/somef/extract_software_type.py b/src/somef/extract_software_type.py
index 43dc695d..9fa335b2 100644
--- a/src/somef/extract_software_type.py
+++ b/src/somef/extract_software_type.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from pathlib import Path
 import nbformat
@@ -9,8 +10,6 @@
 from .utils import constants
 from .extract_ontologies import is_file_ontology
 
-import pdb
-
 
 def check_repository_type(path_repo, title, metadata_result: Result):
     """ Function that adds the metadata result in the JSON 
@@ -200,18 +199,15 @@ def check_static_websites(path_repo, repo_metadata: Result):
                     return False
     try:
         languages = repo_metadata[constants.CAT_PROGRAMMING_LANGUAGES]
-        print(languages)
         for language in languages:
             language_name = language[constants.PROP_RESULT][constants.PROP_NAME]
-            print(language_name)
             if language_name.lower() == "javascript":
                 js_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
-                print(js_size)
             elif language_name.lower() == "scss" or language_name.lower() == "css":
                 css_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
             total_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
     except Exception as e:
-        print(e)
+        logging.warning(f"Could not retrieve programming languages for static website check: {e}")
     if html_file > 0:
         if js_size > 0 and css_size == 0:
             if js_size / total_size < 0.91:
diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
index af460f1e..91f0a0e0 100644
--- a/src/somef/process_repository.py
+++ b/src/somef/process_repository.py
@@ -59,7 +59,8 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
             # TCP connection and starts the response stream but never closes it —
             # leaking a socket for every archive we inspect.  HEAD is the correct
             # tool here: it retrieves headers without downloading the body.
-            head_response = requests.head(url, allow_redirects=True, **kwargs)
+            head_response = requests.head(url, allow_redirects=True,
+                                           timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs)
             head_response.close()  # release the connection back to the pool immediately
             content_length = head_response.headers.get("Content-Length")
             if content_length is not None:
@@ -612,8 +613,12 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
     # get languages
     if not ignore_api_metadata:
         languages_raw, date = rate_limit_get(filtered_resp['languages_url'], headers=header)
-        
-        languages = languages_raw.json()
+
+        if languages_raw is None:
+            logging.warning("Skipping languages: rate_limit_get returned None (size limit or network error)")
+            languages = {}
+        else:
+            languages = languages_raw.json()
         if "message" in languages:
             logging.error("Error while retrieving languages: " + languages["message"])
         else:
@@ -980,6 +985,10 @@ def get_all_paginated_results(base_url, headers, per_page=100):
         url = f"{base_url}?per_page={per_page}&page={page}"
         response, _ = rate_limit_get(url, headers=headers)
 
+        if response is None:
+            logging.warning(f"Skipping page {page}: rate_limit_get returned None (size limit or network error)")
+            break
+
         if response.status_code != 200:
             logging.warning(f"GitHub API error on page {page}: {response.status_code}")
             break