Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,11 @@ Options:
requests and increase execution time

-h, --help Show this message and exit.

Repository versions [mutually_exclusive]:
-b, --branch name branch Branch of the repository to analyze. Overrides the default branch.

--tag text Tag of the repository to analyze. Cannot be used together with --branch.
```

## Usage example:
Expand Down
13 changes: 13 additions & 0 deletions src/somef/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,19 @@ def configure(auto, base_uri):
default=False,
help="""SOMEF will extract additional information from certain files like CODEOWNERS, etc."""
)
@click.option(
"--branch",
"-b",
type=str,
default=None,
help="Branch of the repository to analyze. Overrides the default branch."
)
@click.option(
"--tag",
type=str,
default=None,
help="Tag of the repository to analyze. Incompatible with --branch"
)
def describe(requirements_v, requirements_all, **kwargs):
# import so missing packages get installed when appropriate
if requirements_v:
Expand Down
113 changes: 71 additions & 42 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,41 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):

path_components = url.path.split('/')

if len(path_components) < 3:
logging.error("Gitlab link is not correct.")
# if len(path_components) < 3:
# logging.error("Gitlab link is not correct.")
# return " ", {}

# owner = path_components[1]
# repo_name = path_components[2]
# if len(path_components) == 4:
# repo_name = repo_name + '/' + path_components[3]

# new code to support complex gitlab urls. Before this we just accepted urls in the format https://gitlab.com/{owner}/{repo_name}
# clean path components
path_components = [p for p in url.path.split('/') if p]

# GitLab requires at least owner + repo
if len(path_components) < 2:
logging.error("GitLab link is not correct. Expected https://gitlab.com/<owner>/<repo>")
return " ", {}

owner = path_components[1]
repo_name = path_components[2]
if len(path_components) == 4:
repo_name = repo_name + '/' + path_components[3]
# the owner is always the first path component
owner = path_components[0]

# and repo name is the last
routing_markers = {"-", "tree", "blob", "issues", "merge_requests"}

# If the last component is a routing marker, the repo is the previous one
if path_components[-1] in routing_markers:
repo_name = path_components[-2]
else:
repo_name = path_components[-1]

default_branch = None
if "tree" in path_components:
idx = path_components.index("tree")
if idx + 1 < len(path_components):
default_branch = "/".join(path_components[idx+1:])

# could be gitlab.com or some gitlab self-hosted GitLab servers like gitlab.in2p3.fr
if repository_url.rfind("gitlab.com") > 0:
Expand Down Expand Up @@ -255,20 +282,6 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):

repo_metadata.add_result(constants.CAT_RELEASES, release_obj, 1, constants.TECHNIQUE_GITLAB_API)


default_branch = None

if len(path_components) >= 5:
if not path_components[4] == "tree":
logging.error(
"GitLab link is not correct. \nThe correct format is https://gitlab.com/{owner}/{repo_name}.")

return " ", {}

# we must join all after 4, as sometimes tags have "/" in them.
default_branch = "/".join(path_components[5:])
ref_param = {"ref": default_branch}

if 'defaultBranch' in project_details.keys():
general_resp = {'defaultBranch': project_details['defaultBranch']}
elif 'default_branch' in project_details.keys():
Expand All @@ -289,17 +302,19 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):
if default_branch is None:
default_branch = general_resp['defaultBranch']

project_path = "/".join(path_components)

# {constants.PROP_VALUE: f"https://{url.netloc}/{owner}/{repo_name}/",
repo_metadata.add_result(constants.CAT_CODE_REPOSITORY,
{constants.PROP_VALUE: f"https://{url.netloc}/{owner}/{repo_name}/",
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_GITLAB_API)
{constants.PROP_VALUE: f"https://{url.netloc}/{project_path}/",
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_GITLAB_API)

# filtered_resp = do_crosswalk(general_resp, github_crosswalk_table)
# filtered_resp = {"downloadUrl": f"https://gitlab.com/{owner}/{repo_name}/-/branches"}
# {constants.PROP_VALUE: f"https://{url.netloc}/{owner}/{repo_name}/-/branches",
repo_metadata.add_result(constants.CAT_DOWNLOAD_URL,
{constants.PROP_VALUE: f"https://{url.netloc}/{owner}/{repo_name}/-/branches",
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_GITLAB_API)
{constants.PROP_VALUE: f"https://{url.netloc}/{project_path}/-/branches",
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_GITLAB_API)

# condense license information
license_result = {constants.PROP_TYPE: constants.URL}
Expand Down Expand Up @@ -330,11 +345,6 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):
if constants.PROP_VALUE in license_result:
repo_metadata.add_result(constants.CAT_LICENSE, license_result, 1, constants.TECHNIQUE_GITLAB_API)

# get keywords / topics
# topics_headers = header
# topics_headers['accept'] = 'application/vnd.github.mercy-preview+json'
# topics_resp, date = rate_limit_get(repo_api_base_url + "/topics",
# headers=topics_headers)
topics_resp = {}

keywords = []
Expand Down Expand Up @@ -388,7 +398,9 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url):
}, 1, constants.TECHNIQUE_GITLAB_API)

logging.info("Repository information successfully loaded. \n")
return repo_metadata, owner, repo_name, default_branch
# return repo_metadata, owner, repo_name, default_branch
return repo_metadata, owner, repo_name, default_branch, project_path



def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref):
Expand All @@ -409,9 +421,15 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref):
url = urlparse(repo_ref)
path_components = url.path.split('/')

repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{repo_name}-{repo_branch}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{path_components[3]}.zip"
path_components = [p for p in path_components if p]
project_path = "/".join(path_components)

# repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{repo_name}-{repo_branch}.zip"
# if len(path_components) == 4:
# repo_archive_url = f"https://{url.netloc}/{owner}/{repo_name}/-/archive/{repo_branch}/{path_components[3]}.zip"
repo_archive_url = (
f"https://{url.netloc}/{project_path}/-/archive/{repo_branch}/{repo_name}-{repo_branch}.zip"
)

logging.info(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
Expand All @@ -435,7 +453,7 @@ def download_gitlab_files(directory, owner, repo_name, repo_branch, repo_ref):
return None


def download_readme(owner, repo_name, default_branch, repo_type, authorization):
def download_readme(owner, repo_name, default_branch, repo_type, authorization, project_path = None):
"""
Method that given a repository owner, name and default branch, it downloads the readme content only.
The readme is assumed to be README.md
Expand All @@ -451,8 +469,11 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization):
@return: text with the contents of the readme file
"""
if repo_type is constants.RepositoryType.GITLAB:
primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md"
secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md"
base = f"https://gitlab.com/{project_path}" if project_path else f"https://gitlab.com/{owner}/{repo_name}"
primary_url = f"{base}/-/raw/{default_branch}/README.md"
secondary_url = f"{base}/-/raw/master/README.md"
# primary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/{default_branch}/README.md"
# secondary_url = f"https://gitlab.com/{owner}/{repo_name}/-/raw/master/README.md"
elif repo_type is constants.RepositoryType.GITHUB:
primary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/README.md"
secondary_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/README.md"
Expand Down Expand Up @@ -482,7 +503,8 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization):


def load_online_repository_metadata(repository_metadata: Result, repository_url, ignore_api_metadata=False,
repo_type=constants.RepositoryType.GITHUB, authorization=None, reconcile_authors=False):
repo_type=constants.RepositoryType.GITHUB, authorization=None, reconcile_authors=False,
branch=None,tag=None):
"""
Function uses the repository_url provided to load required information from GitHub or Gitlab.
Information kept from the repository is written in keep_keys.
Expand All @@ -494,6 +516,8 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
@param repository_url: target repository URL.
@param authorization: GitHub authorization token
@param reconcile_authors: flag to indicate if additional information should be extracted from certain files such as CODEOWNERS. This consumes more API requests.
@param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata.
@param tag: tag of the repository to analyze. Cannot be used together with the branch parameter.

Returns
-------
Expand Down Expand Up @@ -568,6 +592,11 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
elif default_branch is None:
default_branch = general_resp['default_branch']

if branch:
default_branch = branch
if tag:
default_branch = tag

# filter the general response with only the fields we are interested in, mapping them to our keys
filtered_resp = {}
if not ignore_api_metadata:
Expand Down Expand Up @@ -705,7 +734,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
repository_metadata.add_result(constants.CAT_RELEASES, release_obj, 1,
constants.TECHNIQUE_GITHUB_API)
logging.info("Repository information successfully loaded.\n")
return repository_metadata, owner, repo_name, default_branch
return repository_metadata, owner, repo_name, default_branch, None


def get_path(obj, path):
Expand Down
35 changes: 26 additions & 9 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None,
ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None,
ignore_test_folder=True,requirements_mode='all', reconcile_authors=False) -> Result:
ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None) -> Result:
"""
Main function to get the data through the command line
Parameters
Expand All @@ -38,6 +38,8 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
@param ignore_test_folder: Ignore contents of test folders
@param requirements_mode: flag to indicate which requirements to show in codemeta
@param reconcile_authors: flag to indicate if additional information should be extracted from certain files such as CODEOWNERS. Bear in mind that using this flag consumes more requests to the GitHub API.
@param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata.
@param tag: tag of the repository to analyze. Cannot be used together with the branch parameter.
Returns
-------
@return: Dictionary with the results found by SOMEF, formatted as a Result object.
Expand All @@ -54,6 +56,11 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
repo_type = constants.RepositoryType.GITHUB
repository_metadata = Result()
def_branch = "main"

if branch and tag:
logging.error("You cannot use --branch and --tag at the same time. Mutually exclusive")
sys.exit()

if repo_url is not None:
try:

Expand Down Expand Up @@ -82,19 +89,23 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
if bGitLab:
repo_type = constants.RepositoryType.GITLAB

repository_metadata, owner, repo_name, def_branch = process_repository.load_online_repository_metadata(
logging.info("Processing repository metadata.")
repository_metadata, owner, repo_name, def_branch, project_path = process_repository.load_online_repository_metadata(
repository_metadata,
repo_url,
ignore_github_metadata,
repo_type,
authorization,
reconcile_authors
reconcile_authors,
branch=branch,
tag=tag
)

# download files and obtain path to download folder
if readme_only:
logging.info("Downloading README only...")
# download readme only with the information above
readme_text = process_repository.download_readme(owner, repo_name, def_branch, repo_type, authorization)
readme_text = process_repository.download_readme(owner, repo_name, def_branch, repo_type, authorization, project_path)

elif keep_tmp is not None: # save downloaded files locally
os.makedirs(keep_tmp, exist_ok=True)
Expand Down Expand Up @@ -251,7 +262,9 @@ def run_cli(*,
keep_tmp=None,
ignore_test_folder=True,
requirements_mode="all",
reconcile_authors=False
reconcile_authors=False,
branch=None,
tag=None
):
"""Function to run all the required components of the cli for a repository"""
# check if it is a valid url
Expand Down Expand Up @@ -285,7 +298,8 @@ def run_cli(*,
encoded_url = encoded_url.replace(".","") #removing dots just in case
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only,
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors)
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)

if hasattr(repo_data, "get_json"):
repo_data = repo_data.get_json()
Expand Down Expand Up @@ -318,13 +332,16 @@ def run_cli(*,
if repo_url:
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only,
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors)
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
elif local_repo:
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers,
local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors)
local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)
else:
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers,
doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors)
doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors,
branch=branch, tag=tag)

if hasattr(repo_data, "get_json"):
repo_data = repo_data.get_json()
Expand Down
37 changes: 37 additions & 0 deletions src/somef/test/test_JSON_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,4 +625,41 @@ def test_issue_gitlab_enrich_authors(self):
self.assertEqual(second.get("username"), "FrNecas")
self.assertEqual(second.get("name"), "František Nečas")

os.remove(output_file)


@unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally")
def test_issue_914(self):
    """
    Check that somef handles a GitLab repository nested under groups/subgroups.

    Runs the CLI against a GitLab URL with several path levels
    (gitlab-org/frontend/playground/batch-issue-creator), then verifies the
    exported JSON contains the expected full title and the version of the
    first extracted requirement. Skipped when the CI environment variable
    is "true".
    """
    output_file = test_data_path + "test-issue-914.json"

    try:
        somef_cli.run_cli(threshold=0.8,
                          ignore_classifiers=False,
                          repo_url="https://gitlab.com/gitlab-org/frontend/playground/batch-issue-creator",
                          local_repo=None,
                          doc_src=None,
                          in_file=None,
                          output=output_file,
                          graph_out=None,
                          graph_format="turtle",
                          codemeta_out=None,
                          pretty=True,
                          missing=True,
                          readme_only=False,
                          reconcile_authors=True
                          )

        with open(output_file, "r") as f:
            json_content = json.load(f)

        # Assert presence first so a missing category fails with a clear
        # message instead of an IndexError on [0].
        title = json_content.get(constants.CAT_FULL_TITLE, [])
        self.assertTrue(title, "no full title was extracted")
        self.assertEqual(title[0].get("result", {}).get("value"), "batch-issue-creator")

        requirements = json_content.get(constants.CAT_REQUIREMENTS, [])
        self.assertTrue(requirements, "no requirements were extracted")
        self.assertEqual(requirements[0].get("result", {}).get("version"), "3.20.3")
    finally:
        # Remove the export even when an assertion fails, so a failed run
        # does not leave a stale file in the test data folder.
        if os.path.exists(output_file):
            os.remove(output_file)
Loading
Loading