Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions src/somef/extract_software_type.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
from pathlib import Path
import nbformat
Expand All @@ -9,8 +10,6 @@
from .utils import constants
from .extract_ontologies import is_file_ontology

import pdb


def check_repository_type(path_repo, title, metadata_result: Result):
""" Function that adds the metadata result in the JSON
Expand Down Expand Up @@ -200,18 +199,15 @@ def check_static_websites(path_repo, repo_metadata: Result):
return False
try:
languages = repo_metadata[constants.CAT_PROGRAMMING_LANGUAGES]
print(languages)
for language in languages:
language_name = language[constants.PROP_RESULT][constants.PROP_NAME]
print(language_name)
if language_name.lower() == "javascript":
js_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
print(js_size)
elif language_name.lower() == "scss" or language_name.lower() == "css":
css_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
total_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
except Exception as e:
print(e)
logging.warning(f"Could not retrieve programming languages for static website check: {e}")
if html_file > 0:
if js_size > 0 and css_size == 0:
if js_size / total_size < 0.91:
Expand Down
8 changes: 4 additions & 4 deletions src/somef/header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:

# df['Content'].replace('', np.nan, inplace=True)
df['Content'] = df['Content'].replace('', np.nan)
df.dropna(subset=['Content'], inplace=True)
df = df.dropna(subset=['Content'])

return df, none_header_content

Expand Down Expand Up @@ -375,7 +375,7 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)

df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
df.drop(columns=['ParentGroup'], inplace=True)
df = df.drop(columns=['ParentGroup'])

if not df.iloc[0]['Group']:
df.loc[df.index[0], 'Group'] = ['unknown']
Expand All @@ -384,11 +384,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
df.loc[df['Group'] == 'unknown', 'Group'] = np.nan

valid = df[df['Group'].notna()].copy()
valid.rename(columns={
valid = valid.rename(columns={
'Content': constants.PROP_VALUE,
'Header': constants.PROP_ORIGINAL_HEADER,
'ParentHeader': constants.PROP_PARENT_HEADER,
}, inplace=True)
})

source = None
if constants.CAT_README_URL in repository_metadata.results:
Expand Down
101 changes: 74 additions & 27 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,30 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
parsed = urlparse(url)
is_api_request = "api.github.com" in parsed.netloc
content_length = None
# just verify size if NOT is a request to api.github.com
# Check file size before downloading the full body (skip for GitHub API requests,
# which are always small JSON payloads).
if not is_api_request:
try:
head_response = requests.get(url, stream=True, allow_redirects=True, **kwargs)
# Use a proper HEAD request to read only the response headers.
# Previously this used requests.get(..., stream=True) which opens a full
# TCP connection and starts the response stream but never closes it —
# leaking a socket for every archive we inspect. HEAD is the correct
# tool here: it retrieves headers without downloading the body.
head_response = requests.head(url, allow_redirects=True,
timeout=constants.DOWNLOAD_TIMEOUT_SECONDS, **kwargs)
head_response.close() # release the connection back to the pool immediately
content_length = head_response.headers.get("Content-Length")
if content_length is not None:
size_bytes = int(content_length)
print(f"HEAD Content-Length: {size_bytes}")
if size_bytes > size_limit_bytes:
logging.warning(
f"Download size {size_bytes} bytes exceeds limit of {size_limit_bytes} bytes. Skipping download."
)
return None, None
else:
# logging.warning(f"Could not determine file size for {url}. Skipping download.")
# return None, None
logging.warning(f"No Content-Length header for {url}. Proceeding with download anyway (unable to estimate size).")
except Exception as e:
logging.warning(f"HEAD/stream request failed: {e}. Continuing with GET...")
logging.warning(f"HEAD request failed: {e}. Continuing with GET...")

rate_limited = True
date = ""
Expand Down Expand Up @@ -608,8 +613,12 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
# get languages
if not ignore_api_metadata:
languages_raw, date = rate_limit_get(filtered_resp['languages_url'], headers=header)

languages = languages_raw.json()

if languages_raw is None:
logging.warning("Skipping languages: rate_limit_get returned None (size limit or network error)")
languages = {}
else:
languages = languages_raw.json()
if "message" in languages:
logging.error("Error while retrieving languages: " + languages["message"])
else:
Expand Down Expand Up @@ -731,39 +740,73 @@ def download_repository_files(owner, repo_name, default_branch, repo_type, targe

def download_github_files(directory, owner, repo_name, repo_ref, authorization):
"""
Download all repository files from a GitHub repository
Download all repository files from a GitHub repository.

GitHub's short-form archive URL ``/archive/{ref}.zip`` returns HTTP 300 (Multiple
Choices) when the ref name is **ambiguous** — i.e. a branch and a tag share the
same name (e.g. a repo whose default branch is ``v2.0`` and also has a tag called
``v2.0``). In that case we must use the fully-qualified ref URLs:
- ``/archive/refs/heads/{ref}.zip`` (explicit branch)
- ``/archive/refs/tags/{ref}.zip`` (explicit tag)

We also keep the legacy ``main.zip`` fallback for repositories that renamed their
default branch to ``main`` after being created with ``master`` (or vice-versa) so
that the GitHub API default_branch value is momentarily stale.

Fallback order tried in sequence until one returns HTTP 200:
1. ``/archive/{ref}.zip`` — short form, works for unambiguous refs
2. ``/archive/refs/heads/{ref}.zip`` — unambiguous branch (fixes HTTP 300)
3. ``/archive/refs/tags/{ref}.zip`` — unambiguous tag (fixes HTTP 300)
4. ``/archive/main.zip`` — legacy branch-rename fallback

Parameters
----------
repo_ref: link to branch of the repo
repo_ref: default branch (or tag) returned by the GitHub API
repo_name: name of the repo
owner: GitHub owner
directory: directory where to extract all downloaded files
authorization: GitHub authorization token

Returns
-------
path to the folder where all files have been downloaded
Path to the folder where all files have been downloaded, or None on failure.
"""
# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
logging.info(f"Downloading {repo_archive_url}")
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
# Candidate archive URLs tried in order. We start with the short form because it
# works for the vast majority of repos and avoids an extra HTTP round-trip. When
# that returns 300 (ambiguous ref) or 404 (ref not found), we escalate to the
# fully-qualified refs/heads/ and refs/tags/ forms before falling back to main.
candidate_urls = [
f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip",
f"https://github.com/{owner}/{repo_name}/archive/refs/heads/{repo_ref}.zip",
f"https://github.com/{owner}/{repo_name}/archive/refs/tags/{repo_ref}.zip",
f"https://github.com/{owner}/{repo_name}/archive/main.zip",
]

if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
return None

if repo_download.status_code == 404:
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
logging.info(f"Trying to download {repo_archive_url}")
repo_download = None
repo_archive_url = None
for repo_archive_url in candidate_urls:
logging.info(f"Downloading {repo_archive_url}")
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
# Size limit exceeded or streaming error — no point trying other URLs
logging.warning(
f"Repository archive skipped due to size limit: "
f"{constants.SIZE_DOWNLOAD_LIMIT_MB} MB or no content-length."
)
return None

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
if repo_download.status_code == 200:
break
logging.warning(
f"Archive URL {repo_archive_url} returned HTTP {repo_download.status_code}, "
f"trying next fallback..."
)

if repo_download is None or repo_download.status_code != 200:
logging.error(
f"All archive download attempts failed for {owner}/{repo_name} "
f"(last status: {getattr(repo_download, 'status_code', 'N/A')})"
)
return None

repo_zip = repo_download.content

Expand Down Expand Up @@ -942,6 +985,10 @@ def get_all_paginated_results(base_url, headers, per_page=100):
url = f"{base_url}?per_page={per_page}&page={page}"
response, _ = rate_limit_get(url, headers=headers)

if response is None:
logging.warning(f"Skipping page {page}: rate_limit_get returned None (size limit or network error)")
break

if response.status_code != 200:
logging.warning(f"GitHub API error on page {page}: {response.status_code}")
break
Expand Down
Loading