From c91b035c13ac77b4ee112a7909350b9b9a472a16 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:37:43 -0300 Subject: [PATCH 1/7] Add Forked repos stream --- tap_github/client.py | 4 +- tap_github/schemas/repo_forked_compares.json | 74 ++++++ tap_github/schemas/repo_forked_parents.json | 260 +++++++++++++++++++ tap_github/schemas/repos.json | 248 ++++++++++++++++++ tap_github/schemas/shared/repos.json | 248 ++++++++++++++++++ tap_github/streams.py | 140 +++++++++- tap_github/sync.py | 2 +- 7 files changed, 969 insertions(+), 7 deletions(-) create mode 100644 tap_github/schemas/repo_forked_compares.json create mode 100644 tap_github/schemas/repo_forked_parents.json create mode 100644 tap_github/schemas/repos.json create mode 100644 tap_github/schemas/shared/repos.json diff --git a/tap_github/client.py b/tap_github/client.py index 92c02f35..7e524a16 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -222,7 +222,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=True): """ Fetch all pages of records and return them. """ @@ -231,6 +231,8 @@ def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_4 response = self.authed_get(source, next_url, headers, stream, should_skip_404) yield response + if skip_pagination: + break next_url = response.links.get('next', {}).get('url', None) def prepare_url(self, url): diff --git a/tap_github/schemas/repo_forked_compares.json b/tap_github/schemas/repo_forked_compares.json new file mode 100644 index 00000000..0b756fbf --- /dev/null +++ b/tap_github/schemas/repo_forked_compares.json @@ -0,0 +1,74 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "owner_login": { + "type": ["null", "string"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "permalink_url": { + "type": ["null", "string"] + }, + "diff_url": { + "type": ["null", "string"] + }, + "patch_url": { + "type": ["null", "string"] + }, + "base_commit": { + "type": ["null", "object"] + }, + "merge_base_commit": { + "type": ["null", "object"] + }, + "status": { + "type": ["null", "string"] + }, + "ahead_by": { + "type": ["null", "number"] + }, + "behind_by": { + "type": ["null", "number"] + }, + "total_commits": { + "type": ["null", "number"] + }, + "commits": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + }, + "files": { + "type": ["null", "array"], + "items": { + "type": ["null", "object"] + } + } + } +} diff --git a/tap_github/schemas/repo_forked_parents.json b/tap_github/schemas/repo_forked_parents.json new file mode 100644 index 00000000..31ff6129 --- /dev/null +++ b/tap_github/schemas/repo_forked_parents.json @@ -0,0 +1,260 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "fork_name": { + "type": ["null", "string"] + }, + "fork_owner_login": { + "type": ["null", "string"] + }, + "fork_default_branch": { + "type": ["null", "string"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + }, + "parent": { + "$ref": "shared/repos.json#/" + } + } +} diff --git a/tap_github/schemas/repos.json b/tap_github/schemas/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/schemas/shared/repos.json b/tap_github/schemas/shared/repos.json new file mode 100644 index 00000000..7954ed2e --- /dev/null +++ b/tap_github/schemas/shared/repos.json @@ -0,0 +1,248 @@ +{ + "type": ["null", "object"], + "properties": { + "_sdc_repository": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "number"] + }, + "node_id": { + "type": ["null", "string"] + }, + "url": { + "type": ["null", "string"] + }, + "html_url": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "full_name": { + "type": ["null", "string"] + }, + "private": { + "type": ["null", "boolean"] + }, + "owner": { + "$ref": "shared/user.json#/" + }, + "description": { + "type": ["null", "string"] + }, + "fork": { + "type": ["null", "boolean"] + }, + "keys_url": { + "type": ["null", "string"] + }, + "collaborators_url": { + "type": ["null", "string"] + }, + "teams_url": { + "type": ["null", "string"] + }, + "hooks_url": { + "type": ["null", "string"] + }, + "issue_events_url": { + "type": ["null", "string"] + }, + "events_url": { + "type": ["null", "string"] + }, + "assignees_url": { + "type": ["null", "string"] + }, + "branches_url": { + "type": ["null", "string"] + }, + "tags_url": { + "type": ["null", "string"] + }, + "blobs_url": { + "type": ["null", "string"] + }, + "git_tags_url": { + "type": ["null", "string"] + }, + "git_refs_url": { + "type": ["null", "string"] + }, + "trees_url": { + "type": ["null", "string"] + }, + "statuses_url": { + "type": ["null", "string"] + }, + "languages_url": { + "type": ["null", "string"] + }, + "stargazers_url": { + "type": ["null", "string"] + }, + "contributors_url": { + "type": ["null", "string"] + }, + "subscribers_url": { + "type": ["null", "string"] + }, + "subscription_url": { + "type": ["null", "string"] + }, + "commits_url": { + "type": ["null", "string"] + }, + "git_commits_url": { + "type": ["null", "string"] + }, + "comments_url": { + "type": ["null", "string"] + }, + "issue_comment_url": { + "type": ["null", "string"] + }, + "contents_url": { + "type": ["null", "string"] + }, + "compare_url": { + "type": ["null", "string"] + }, + "merges_url": { + "type": ["null", "string"] + }, + "archive_url": { + "type": ["null", "string"] + }, + "downloads_url": { + "type": ["null", "string"] + }, + "issues_url": { + "type": ["null", "string"] + }, + "pulls_url": { + "type": ["null", "string"] + }, + "milestones_url": { + "type": ["null", "string"] + }, + "notifications_url": { + "type": ["null", "string"] + }, + "labels_url": { + "type": ["null", "string"] + }, + "releases_url": { + "type": ["null", "string"] + }, + "deployments_url": { + "type": ["null", "string"] + }, + "created_at": { + "type": ["null", "string"] + }, + "updated_at": { + "type": ["null", "string"] + }, + "pushed_at": { + "type": ["null", "string"] + }, + "git_url": { + "type": ["null", "string"] + }, + "ssh_url": { + "type": ["null", "string"] + }, + "clone_url": { + "type": ["null", "string"] + }, + "svn_url": { + "type": ["null", "string"] + }, + "homepage": { + "type": ["null", "string"] + }, + "size": { + "type": ["null", "number"] + }, + "stargazers_count": { + "type": ["null", "number"] + }, + "watchers_count": { + "type": ["null", "number"] + }, + "language": { + "type": ["null", "string"] + }, + "has_issues": { + "type": ["null", "boolean"] + }, + "has_projects": { + "type": ["null", "boolean"] + }, + "has_downloads": { + "type": ["null", "boolean"] + }, + "has_wiki": { + "type": ["null", "boolean"] + }, + "has_pages": { + "type": ["null", "boolean"] + }, + "has_discussions": { + "type": ["null", "boolean"] + }, + "forks_count": { + "type": ["null", "number"] + }, + "mirror_url": { + "type": ["null", "string"] + }, + "archived": { + "type": ["null", "boolean"] + }, + "disabled": { + "type": ["null", "boolean"] + }, + "open_issues_count": { + "type": ["null", "number"] + }, + "license": { + "type": ["null", "object"] + }, + "allow_forking": { + "type": ["null", "boolean"] + }, + "is_template": { + "type": ["null", "boolean"] + }, + "web_commit_signoff_required": { + "type": ["null", "boolean"] + }, + "topics": { + "type": ["null", "array"], + "items": { + "type": ["string"] + } + }, + "visibility": { + "type": ["null", "string"] + }, + "forks": { + "type": ["null", "number"] + }, + "open_issues": { + "type": ["null", "number"] + }, + "watchers": { + "type": ["null", "number"] + }, + "default_branch": { + "type": ["null", "string"] + }, + "permissions": { + "$ref": "shared/pull_permissions.json#/" + } + } +} diff --git a/tap_github/streams.py b/tap_github/streams.py index 3a75a003..828b556f 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -2,6 +2,8 @@ import singer from singer import (metrics, bookmarks, metadata) +from tap_github.client import UnprocessableError + LOGGER = singer.get_logger() DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' @@ -68,6 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None + skip_pagination = True def build_url(self, base_url, repo_path, bookmark): """ @@ -156,7 +159,8 @@ def get_child_records(self, for response in client.authed_get_all_pages( child_object.tap_stream_id, child_full_url, - stream = child_object.tap_stream_id + stream = child_object.tap_stream_id, + skip_pagination = child_object.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -196,6 +200,17 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) + # Loop thru each child and nested child in the parent and fetch all the child records. + for nested_child in child_object.children: + if nested_child in stream_to_sync: + # Collect id of child record to pass in the API of its sub-child. + child_id = tuple(records.get(key) for key in STREAMS[nested_child]().id_keys) + # Here, grand_parent_id is the id of 1st level parent(main parent) which is required to + # pass in the API of the current child's sub-child. + child_object.get_child_records(client, catalog, nested_child, child_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, grand_parent_id, + records) + # pylint: disable=unnecessary-pass def add_fields_at_1st_level(self, record, parent_record = None): """ @@ -227,7 +242,8 @@ def sync_endpoint(self, self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -298,7 +314,8 @@ def sync_endpoint(self, self.tap_stream_id, full_url, self.headers, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -384,7 +401,8 @@ def sync_endpoint(self, for response in client.authed_get_all_pages( self.tap_stream_id, full_url, - stream = self.tap_stream_id + stream = self.tap_stream_id, + skip_pagination = self.skip_pagination ): records = response.json() extraction_time = singer.utils.now() @@ -753,6 +771,115 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['user_id'] = record['user']['id'] +class Repos(FullTableStream): + ''' + https://docs.github.com/en/rest/repos/repos#list-organization-repositories + ''' + tap_stream_id = "repos" + replication_method = "FULL_TABLE" + key_properties = ["id"] + use_organization = True + path = "orgs/{}/repos?per_page=100" + has_children = True + children= ["repo_forked_parents"] + pk_child_fields = ['_sdc_repository'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + if child_stream == 'repo_forked_parents' and parent_record and not parent_record.get('fork'): + # Skip fetching child records for non-forked repositories. + return + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + + +class RepoForkedParents(FullTableStream): + ''' + Get parent repositories of a forked repository. + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#get-a-repository + ''' + tap_stream_id = "repo_forked_parents" + replication_method = "FULL_TABLE" + key_properties = ["_sdc_repository"] + use_organization = True + path = "repos/{}/{}" + parent = 'repos' + id_keys = ['name', 'default_branch'] + has_children = True + children = ["repo_forked_compares"] + pk_child_fields = ['_sdc_repository'] + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['fork_name'] = record['parent']['owner']['login'] + record['fork_owner_login'] = record['parent']['owner']['login'] + record['fork_default_branch'] = record['parent']['default_branch'] + + def get_child_records(self, + client, + catalog, + child_stream, + grand_parent_id, + repo_path, + state, + start_date, + bookmark_dttm, + stream_to_sync, + selected_stream_ids, + parent_id=None, + parent_record=None): + try: + super().get_child_records(client, catalog, child_stream, grand_parent_id, repo_path, state, start_date, + bookmark_dttm, stream_to_sync, selected_stream_ids, parent_id, parent_record) + except UnprocessableError as e: + error_message = str(e) + if "HTTP-error-code: 422" in error_message and "Sorry, this diff is taking too long to generate" in error_message: + LOGGER.warning(f'Can\'t compare the forked repository ({selected_stream_ids}) with the parent repository. Error: {error_message}') + else: + raise e + + + +class RepoForkedCompares(FullTableStream): + ''' + Compare the repo with the default branch of the parent repository. + https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#compare-two-commits + ''' + tap_stream_id = "repo_forked_compares" + replication_method = "FULL_TABLE" + key_properties = ["_sdc_repository"] + use_organization = True + path = "repos/{}/{}/compare/{}...{}:{}" + parent = 'repo_forked_parents' + id_keys = ['fork_owner_login', 'fork_default_branch'] + skip_pagination = False # No pagination required for this stream. + + def add_fields_at_1st_level(self, record, parent_record = None): + """ + Add fields in the record explicitly at the 1st level of JSON. + """ + record['name'] = parent_record['name'] + record['owner_login'] = parent_record['owner']['login'] + record['full_name'] = parent_record['full_name'] + record['default_branch'] = parent_record['default_branch'] + record['fork_owner_login'] = parent_record['fork_owner_login'] + record['fork_name'] = parent_record['fork_name'] + record['fork_default_branch'] = parent_record['fork_default_branch'] + + # Dictionary of the stream classes STREAMS = { "commits": Commits, @@ -777,5 +904,8 @@ def add_fields_at_1st_level(self, record, parent_record = None): "team_members": TeamMembers, "team_memberships": TeamMemberships, "collaborators": Collaborators, - "stargazers": StarGazers + "stargazers": StarGazers, + "repos": Repos, + "repo_forked_parents": RepoForkedParents, + "repo_forked_compares": RepoForkedCompares } diff --git a/tap_github/sync.py b/tap_github/sync.py index a83610ad..d0e4936b 100644 --- a/tap_github/sync.py +++ b/tap_github/sync.py @@ -4,7 +4,7 @@ from tap_github.streams import STREAMS LOGGER = singer.get_logger() -STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships'] +STREAM_TO_SYNC_FOR_ORGS = ['teams', 'team_members', 'team_memberships', 'repos', 'repo_forked_parents', 'repo_forked_compares'] def get_selected_streams(catalog): ''' From 55bbe381543152c83750bb3417324156a91ddbeb Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:02:37 -0300 Subject: [PATCH 2/7] Fix --- tap_github/client.py | 2 +- tap_github/streams.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 7e524a16..370fea9c 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -222,7 +222,7 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=True): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=False): """ Fetch all pages of records and return them. """ diff --git a/tap_github/streams.py b/tap_github/streams.py index 828b556f..b1b00073 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -70,7 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None - skip_pagination = True + skip_pagination = False def build_url(self, base_url, repo_path, bookmark): """ @@ -782,7 +782,7 @@ class Repos(FullTableStream): path = "orgs/{}/repos?per_page=100" has_children = True children= ["repo_forked_parents"] - pk_child_fields = ['_sdc_repository'] + pk_child_fields = ['id'] def get_child_records(self, client, @@ -811,20 +811,20 @@ class RepoForkedParents(FullTableStream): ''' tap_stream_id = "repo_forked_parents" replication_method = "FULL_TABLE" - key_properties = ["_sdc_repository"] + key_properties = ["id"] use_organization = True path = "repos/{}/{}" parent = 'repos' id_keys = ['name', 'default_branch'] has_children = True children = ["repo_forked_compares"] - pk_child_fields = ['_sdc_repository'] + pk_child_fields = ['id'] def add_fields_at_1st_level(self, record, parent_record = None): """ Add fields in the record explicitly at the 1st level of JSON. """ - record['fork_name'] = record['parent']['owner']['login'] + record['fork_name'] = record['parent']['name'] record['fork_owner_login'] = record['parent']['owner']['login'] record['fork_default_branch'] = record['parent']['default_branch'] @@ -860,12 +860,12 @@ class RepoForkedCompares(FullTableStream): ''' tap_stream_id = "repo_forked_compares" replication_method = "FULL_TABLE" - key_properties = ["_sdc_repository"] + key_properties = ["full_name"] use_organization = True path = "repos/{}/{}/compare/{}...{}:{}" parent = 'repo_forked_parents' id_keys = ['fork_owner_login', 'fork_default_branch'] - skip_pagination = False # No pagination required for this stream. + skip_pagination = True # No pagination required for this stream. def add_fields_at_1st_level(self, record, parent_record = None): """ From 5464ab15f6241c2d90b98265ed780414e726e9b0 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:27:12 -0300 Subject: [PATCH 3/7] Fix tests --- tests/unittests/test_sync_endpoint.py | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 338d9ea4..86bc4348 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -37,7 +37,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc self.assertEqual(final_state, expected_state) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) # Verify `write_records` call count self.assertEqual(mock_write_records.call_count, 4) @@ -65,7 +65,7 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v self.assertEqual(mock_write_records.call_count, 3) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events') + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) @@ -91,7 +91,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers', skip_pagination=False) # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream self.assertFalse(mock_get_child_records.called) @@ -110,7 +110,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) # Verify that the get_child_records() is called self.assertTrue(mock_get_child_records.called) @@ -136,9 +136,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ self.assertEqual(mock_authed_get_all_pages.call_count, 4) # Verify that the authed_get_all_pages() is called with the expected url - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members', skip_pagination=False) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships', skip_pagination=False) self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) @@ -166,7 +166,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits', skip_pagination=False) # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) @@ -184,8 +184,8 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + # Verify that the get_child_records() is called as thw Projects stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -208,9 +208,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ # Verify that the authed_get_all_pages() is called expected number of times self.assertEqual(mock_authed_get_all_pages.call_count, 4) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns') - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns', skip_pagination=False) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards', skip_pagination=False) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) @@ -240,7 +240,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) @mock.patch("tap_github.streams.Stream.get_child_records") @@ -257,7 +257,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) # Verify that the get_child_records() is called as the PullRequests stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -281,8 +281,8 @@ def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_a self.assertEqual(mock_authed_get_all_pages.call_count, 2) print(mock_authed_get_all_pages.mock_calls) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests') - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments') + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments', skip_pagination=False) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) From 0d4347cecab1c2575325a93eaa0b276bee3ef17c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:20:41 -0300 Subject: [PATCH 4/7] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1236da87..fe4ece57 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.8', + version='2.0.9', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', From 3a94da032ac416c537e2890f476203a68661c396 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:42:22 -0300 Subject: [PATCH 5/7] Update tap_github/streams.py Co-authored-by: Igor Khrol --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index b1b00073..06f7dfa5 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -200,7 +200,7 @@ def get_child_records(self, singer.write_record(child_object.tap_stream_id, rec, time_extracted=extraction_time) - # Loop thru each child and nested child in the parent and fetch all the child records. + # Loop through each child and nested child in the parent and fetch all the child records. for nested_child in child_object.children: if nested_child in stream_to_sync: # Collect id of child record to pass in the API of its sub-child. From 1183f661d9082e4cc34f87c178232522fda5a051 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:01:09 -0300 Subject: [PATCH 6/7] Refactor auth get pages --- tap_github/client.py | 16 +++++++---- tap_github/streams.py | 20 +++++++------- tests/unittests/test_exception_handling.py | 4 +-- tests/unittests/test_get_all_repos.py | 20 +++++++------- tests/unittests/test_sync_endpoint.py | 32 +++++++++++----------- tests/unittests/test_timeout.py | 4 +-- 6 files changed, 50 insertions(+), 46 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 370fea9c..932d1b86 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -206,7 +206,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) - def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True): + def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. """ @@ -222,19 +222,23 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) resp._content = b'{}' # pylint: disable=protected-access return resp - def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True, skip_pagination=False): + def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): """ Fetch all pages of records and return them. """ next_url = self.prepare_url(url) while next_url: - response = self.authed_get(source, next_url, headers, stream, should_skip_404) + response = self.authed_get_single_page(source, next_url, headers, stream, should_skip_404) yield response - if skip_pagination: - break next_url = response.links.get('next', {}).get('url', None) + def authed_get(self, source, url, headers={}, stream="", should_skip_404=True, single_page=False): + if single_page: + yield self.authed_get_single_page(source, url, headers, stream, should_skip_404) + else: + yield from self.authed_get_all_pages(source, url, headers, stream, should_skip_404) + def prepare_url(self, url): """ Prepare the URL with some additional parameters @@ -249,7 +253,7 @@ def verify_repo_access(self, url_for_repo, repo): Call rest API to verify that the user has sufficient permissions to access this repository. """ try: - self.authed_get("verifying repository access", url_for_repo, should_skip_404=False) + self.authed_get_single_page("verifying repository access", url_for_repo, should_skip_404=False) except NotFoundException: # Throwing user-friendly error message as it checks token access message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo) diff --git a/tap_github/streams.py b/tap_github/streams.py index b1b00073..16090de8 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -70,7 +70,7 @@ class Stream: use_repository = False headers = {'Accept': '*/*'} parent = None - skip_pagination = False + single_page = False def build_url(self, base_url, repo_path, bookmark): """ @@ -156,11 +156,11 @@ def get_child_records(self, stream_catalog = get_schema(catalog, child_object.tap_stream_id) with metrics.record_counter(child_object.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( child_object.tap_stream_id, child_full_url, stream = child_object.tap_stream_id, - skip_pagination = child_object.skip_pagination + single_page = child_object.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -238,12 +238,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -310,12 +310,12 @@ def sync_endpoint(self, stream_catalog = get_schema(catalog, self.tap_stream_id) with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, self.headers, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -398,11 +398,11 @@ def sync_endpoint(self, parent_bookmark_value = bookmark_value record_counter = 0 with metrics.record_counter(self.tap_stream_id) as counter: - for response in client.authed_get_all_pages( + for response in client.authed_get( self.tap_stream_id, full_url, stream = self.tap_stream_id, - skip_pagination = self.skip_pagination + single_page = self.single_page ): records = response.json() extraction_time = singer.utils.now() @@ -865,7 +865,7 @@ class RepoForkedCompares(FullTableStream): path = "repos/{}/{}/compare/{}...{}:{}" parent = 'repo_forked_parents' id_keys = ['fork_owner_login', 'fork_default_branch'] - skip_pagination = True # No pagination required for this stream. + single_page = True # No pagination required for this stream. def add_fields_at_1st_level(self, record, parent_record = None): """ diff --git a/tests/unittests/test_exception_handling.py b/tests/unittests/test_exception_handling.py index 05e48851..27f85be3 100644 --- a/tests/unittests/test_exception_handling.py +++ b/tests/unittests/test_exception_handling.py @@ -83,7 +83,7 @@ def test_error_message_and_call_count(self, mocked_parse_args, mocked_request, m expected_error_message = "HTTP-error-code: {}, Error: {}".format(erro_code, error_msg) with self.assertRaises(error_class) as e: - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(str(e.exception), expected_error_message) @@ -101,7 +101,7 @@ def test_skip_404_error(self, mock_logger, mocked_parse_args, mocked_request, m expected_message = "HTTP-error-code: 404, Error: The resource you have specified cannot be found. Alternatively the access_token is not valid for the resource. Please refer '{}' for more details.".format(json.get("documentation_url")) test_client = GithubClient(self.config) - test_client.authed_get("", "") + test_client.authed_get_single_page("", "") # Verifying the message formed for the custom exception self.assertEqual(mock_logger.mock_calls[0], mock.call(expected_message)) diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index 5db16fef..fd332381 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -90,39 +90,39 @@ def test_multiple_organizations(self, mocked_authed_get_all_pages, mocked_verify self.assertListEqual(expected_repositories, side_effect) @mock.patch('tap_github.client.GithubClient.verify_repo_access') -@mock.patch('tap_github.client.GithubClient.authed_get') +@mock.patch('tap_github.client.GithubClient.authed_get_single_page') class TestAuthedGetAllPages(unittest.TestCase): """ Test `authed_get_all_pages` method from client. """ config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100} - def test_for_one_page(self, mock_auth_get, mock_verify_access): + def test_for_one_page(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called only once if one page is available.""" test_client = GithubClient(self.config) - mock_auth_get.return_value = MockResponse({}) + mock_auth_get_single_page.return_value = MockResponse({}) list(test_client.authed_get_all_pages("", "http://mock_url", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 1) + self.assertEqual(mock_auth_get_single_page.call_count, 1) - def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): + def test_for_multiple_pages(self, mock_auth_get_single_page, mock_verify_access): """Verify `authed_get` is called equal number times as pages available.""" test_client = GithubClient(self.config) - mock_auth_get.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), + mock_auth_get_single_page.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})] list(test_client.authed_get_all_pages("", "http://mock_url_1", {})) # Verify `auth_get` call count - self.assertEqual(mock_auth_get.call_count, 3) + self.assertEqual(mock_auth_get_single_page.call_count, 3) # Verify `auth_get` calls with expected url - self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get_single_page.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) diff --git a/tests/unittests/test_sync_endpoint.py b/tests/unittests/test_sync_endpoint.py index 86bc4348..4ba60e47 100644 --- a/tests/unittests/test_sync_endpoint.py +++ b/tests/unittests/test_sync_endpoint.py @@ -37,7 +37,7 @@ def test_sync_without_state(self, mock_write_records, mock_authed_all_pages, moc self.assertEqual(final_state, expected_state) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) # Verify `write_records` call count self.assertEqual(mock_write_records.call_count, 4) @@ -65,7 +65,7 @@ def test_sync_with_state(self, mock_write_records, mock_authed_all_pages, mock_v self.assertEqual(mock_write_records.call_count, 3) # Verify `get_auth_all_pages` called with expected url - mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, stream='events', skip_pagination=False) + mock_authed_all_pages.assert_called_with(mock.ANY, 'https://api.github.com/repos/tap-github/events', mock.ANY, 'events', True) mock_write_records.assert_called_with(mock.ANY, {'id': 4, 'created_at': '2019-01-02T00:00:00Z', '_sdc_repository': 'tap-github'},time_extracted = mock.ANY) @@ -91,7 +91,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["stargazers"], ["stargazers"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, stream='stargazers', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/stargazers", mock.ANY, 'stargazers', True) # Verify that the get_child_records() is not called as Stargazers doesn't have a child stream self.assertFalse(mock_get_child_records.called) @@ -110,7 +110,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["teams", "team_members"], ["teams","team_members"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) # Verify that the get_child_records() is called self.assertTrue(mock_get_child_records.called) @@ -136,9 +136,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ self.assertEqual(mock_authed_get_all_pages.call_count, 4) # Verify that the authed_get_all_pages() is called with the expected url - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, stream='teams', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", stream='team_members', skip_pagination=False) - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", stream='team_memberships', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams", mock.ANY, 'teams', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/members", {}, 'team_members', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/orgs/tap-github/teams/stitch-dev/memberships/log1", {}, 'team_memberships', True) self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) self.assertEqual(mock_authed_get_all_pages.mock_calls[1], exp_call_2) @@ -166,7 +166,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_authed_get_all_ test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["commits"], ["commits"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, stream='commits', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/commits?since=", mock.ANY, 'commits', True) # Verify that the get_child_records() is not called as Commits does not contain any child stream. self.assertFalse(mock_get_child_records.called) @@ -184,7 +184,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_authed_get_all_p test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["projects", "project_columns"], ["projects","project_columns"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) # Verify that the get_child_records() is called as thw Projects stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -208,9 +208,9 @@ def test_with_nested_child_streams(self, mock_authed_get_all_pages, mock_verify_ # Verify that the authed_get_all_pages() is called expected number of times self.assertEqual(mock_authed_get_all_pages.call_count, 4) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, stream='projects', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", stream='project_columns', skip_pagination=False) - exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", stream='project_cards', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/projects?state=all", mock.ANY, 'projects', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/projects/1/columns", {}, 'project_columns', True) + exp_call_3 = mock.call(mock.ANY, "https://api.github.com/projects/columns/1/cards", {}, 'project_cards', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) @@ -240,7 +240,7 @@ def test_without_child_stream(self, mock_get_child_records, mock_strptime_to_utc test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests"], ["pull_requests"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) @mock.patch("tap_github.streams.Stream.get_child_records") @@ -257,7 +257,7 @@ def test_with_child_streams(self, mock_get_child_records, mock_strptime_to_utc, test_stream.sync_endpoint(test_client, {}, self.catalog, "tap-github", "", ["pull_requests", "review_comments"], ["pull_requests","review_comments"]) # Verify that the authed_get_all_pages() is called with the expected url - mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) + mock_authed_get_all_pages.assert_called_with(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) # Verify that the get_child_records() is called as the PullRequests stream has a child stream self.assertTrue(mock_get_child_records.called) @@ -281,8 +281,8 @@ def test_with_nested_child_streams(self, mock_strptime_to_utc, mock_authed_get_a self.assertEqual(mock_authed_get_all_pages.call_count, 2) print(mock_authed_get_all_pages.mock_calls) - exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", stream='pull_requests', skip_pagination=False) - exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", stream='review_comments', skip_pagination=False) + exp_call_1 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls?state=all&sort=updated&direction=desc", {}, 'pull_requests', True) + exp_call_2 = mock.call(mock.ANY, "https://api.github.com/repos/tap-github/pulls/1/comments?sort=updated_at&direction=desc", {}, 'review_comments', True) # Verify that the API calls are done as expected with the correct url self.assertEqual(mock_authed_get_all_pages.mock_calls[0], exp_call_1) diff --git a/tests/unittests/test_timeout.py b/tests/unittests/test_timeout.py index a3f6ca53..09129684 100644 --- a/tests/unittests/test_timeout.py +++ b/tests/unittests/test_timeout.py @@ -73,7 +73,7 @@ def test_timeout_value_in_config(self, mocked_parse_args, mocked_request, mocked # get the timeout value for assertion timeout = test_client.get_request_timeout() # function call - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we got expected timeout value self.assertEqual(expected_value, timeout) @@ -107,7 +107,7 @@ def test_backoff(self, mocked_parse_args, mocked_request, mocked_sleep, mock_ver test_client = GithubClient(mock_config) with self.assertRaises(error_class): - test_client.authed_get("test_source", "") + test_client.authed_get_single_page("test_source", "") # verify that we backoff 5 times self.assertEqual(5, mocked_request.call_count) From 9db65db175d569645f3d2d1276d5caff34b6adc4 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 17 Jan 2025 13:05:29 -0300 Subject: [PATCH 7/7] Add error_message --- tap_github/schemas/repo_forked_compares.json | 3 +++ tap_github/streams.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/tap_github/schemas/repo_forked_compares.json b/tap_github/schemas/repo_forked_compares.json index 0b756fbf..70ad5d02 100644 --- a/tap_github/schemas/repo_forked_compares.json +++ b/tap_github/schemas/repo_forked_compares.json @@ -69,6 +69,9 @@ "items": { "type": ["null", "object"] } + }, + "error_message": { + "type": ["null", "string"] } } } diff --git a/tap_github/streams.py b/tap_github/streams.py index a61fde6c..627eb45e 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -847,6 +847,13 @@ def get_child_records(self, except UnprocessableError as e: error_message = str(e) if "HTTP-error-code: 422" in error_message and "Sorry, this diff is taking too long to generate" in error_message: + child_object = STREAMS[child_stream]() + record = {'_sdc_repository': repo_path, 'error_message': error_message} + child_object.add_fields_at_1st_level(record=record, parent_record=parent_record) + with singer.Transformer() as transformer: + if child_object.tap_stream_id in selected_stream_ids: + singer.write_record(child_object.tap_stream_id, record, time_extracted=singer.utils.now()) + LOGGER.warning(f'Can\'t compare the forked repository ({selected_stream_ids}) with the parent repository. Error: {error_message}') else: raise e @@ -878,6 +885,10 @@ def add_fields_at_1st_level(self, record, parent_record = None): record['fork_owner_login'] = parent_record['fork_owner_login'] record['fork_name'] = parent_record['fork_name'] record['fork_default_branch'] = parent_record['fork_default_branch'] + if 'status' not in record and 'error_message' not in record: + record['error_message'] = ('Cannot retrieve ahead/behind information for this branch. ' + 'It may happen if the parent repository has been deleted or ' + 'no common ancestor between the default branches.') # Dictionary of the stream classes