diff --git a/README.md b/README.md index e8c4df01..7b8e8845 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,10 @@ This tap: "base_url": "https://api.github.com" } ``` + +> Note: The max results per page is configurable with the parameter `max_per_page`, +> as default it will return 100 (that is the max of most of the endpoints) + 4. Run the tap in discovery mode to get properties.json file ```bash diff --git a/tap_github/client.py b/tap_github/client.py index e8a11d4c..b38286a1 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -1,5 +1,6 @@ import time import requests +from requests.models import PreparedRequest import backoff from simplejson import JSONDecodeError import singer @@ -9,11 +10,14 @@ LOGGER = singer.get_logger() DEFAULT_SLEEP_SECONDS = 600 DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 +DEFAULT_MAX_PER_PAGE = 100 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' + class GithubException(Exception): pass @@ -118,6 +122,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None + if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): + message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ + f"Please refer '{response_json.get('documentation_url')}' for more details." \ + "This is a known issue when the results exceed 40k and the last page is not full" \ + " (it will trim the results to get only the available by the API)." + LOGGER.warning(message) + return None + message = "HTTP-error-code: {}, Error: {}".format( error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) @@ -132,7 +144,7 @@ def calculate_seconds(epoch): Calculate the seconds to sleep before making a new request. """ current = time.time() - return int(ceil(epoch - current)) + return max(0, int(ceil(epoch - current))) def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ @@ -165,6 +177,7 @@ def __init__(self, config): self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT) self.set_auth_in_session() self.not_accessible_repos = set() + self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE) def get_request_timeout(self): """ @@ -195,32 +208,37 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) """ Call rest API and return the response in case of status code 200. """ - with metrics.http_request_timer(source) as timer: + with metrics.http_request_timer(url) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit) - if resp.status_code == 404: + if resp.status_code == 404 or resp.status_code == 422: # Return an empty response body since we're not raising a NotFoundException - resp._content = b'{}' # pylint: disable=protected-access + resp._content = b'{}' # pylint: disable=protected-access return resp def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): """ Fetch all pages of records and return them. """ - while True: - r = self.authed_get(source, url, headers, stream, should_skip_404) - yield r + next_url = self.prepare_url(url) + while next_url: + response = self.authed_get(source, next_url, headers, stream, should_skip_404) + yield response - # Fetch the next page if next found in the response. - if 'next' in r.links: - url = r.links['next']['url'] - else: - # Break the loop if all pages are fetched. - break + next_url = response.links.get('next', {}).get('url', None) + + def prepare_url(self, url): + """ + Prepare the URL with some additional parameters + """ + prepared_request = PreparedRequest() + # Including max per page param + prepared_request.prepare_url(url, {'per_page': self.max_per_page}) + return prepared_request.url def verify_repo_access(self, url_for_repo, repo): """ diff --git a/tests/unittests/test_custom_domain.py b/tests/unittests/test_custom_domain.py index 139b2426..3a517786 100644 --- a/tests/unittests/test_custom_domain.py +++ b/tests/unittests/test_custom_domain.py @@ -12,8 +12,8 @@ def test_config_without_domain(self, mock_verify_access): """ Test if the domain is not given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is default self.assertEqual(test_client.base_url, DEFAULT_DOMAIN) @@ -22,8 +22,24 @@ def test_config_with_domain(self, mock_verify_access): """ Test if the domain is given in the config """ - mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} - test_client = GithubClient(mock_config) + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) # Verify domain in client is from config - self.assertEqual(test_client.base_url, mock_config["base_url"]) + self.assertEqual(test_client.base_url, config["base_url"]) + + def test_prepare_url(self, mock_verify_access): + """ + Test if the correct params are added to url + """ + config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""} + test_client = GithubClient(config) + + # Verify if per_page param was added with default value + self.assertEqual(test_client.prepare_url(test_client.base_url), "http://custom-git.com/?per_page=100") + self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=100') + + # Verify if per_page param was added as expected + config["max_per_page"] = 35 + test_client2 = GithubClient(config) + self.assertEqual(test_client2.prepare_url(test_client2.base_url), "http://custom-git.com/?per_page=35") diff --git a/tests/unittests/test_get_all_repos.py b/tests/unittests/test_get_all_repos.py index 9235acad..5db16fef 100644 --- a/tests/unittests/test_get_all_repos.py +++ b/tests/unittests/test_get_all_repos.py @@ -95,7 +95,7 @@ class TestAuthedGetAllPages(unittest.TestCase): """ Test `authed_get_all_pages` method from client. """ - config = {"access_token": "", "repository": "test-org/repo1"} + config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100} def test_for_one_page(self, mock_auth_get, mock_verify_access): @@ -104,7 +104,7 @@ def test_for_one_page(self, mock_auth_get, mock_verify_access): test_client = GithubClient(self.config) mock_auth_get.return_value = MockResponse({}) - list(test_client.authed_get_all_pages("", "mock_url", {})) + list(test_client.authed_get_all_pages("", "http://mock_url", {})) # Verify `auth_get` call count self.assertEqual(mock_auth_get.call_count, 1) @@ -114,14 +114,15 @@ def test_for_multiple_pages(self, mock_auth_get, mock_verify_access): """Verify `authed_get` is called equal number times as pages available.""" test_client = GithubClient(self.config) - mock_auth_get.side_effect = [MockResponse({"next": {"url": "mock_url_2"}}),MockResponse({"next": {"url": "mock_url_3"}}),MockResponse({})] + mock_auth_get.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}), + MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})] - list(test_client.authed_get_all_pages("", "mock_url_1", {})) + list(test_client.authed_get_all_pages("", "http://mock_url_1", {})) # Verify `auth_get` call count self.assertEqual(mock_auth_get.call_count, 3) # Verify `auth_get` calls with expected url - self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "mock_url_1", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "mock_url_2", {}, '', True)) - self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "mock_url_3", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True)) + self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True)) diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 4de4ef98..770acd29 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -36,7 +36,6 @@ def test_rate_limt_wait(self, mocked_sleep): mocked_sleep.assert_called_with(121) self.assertTrue(mocked_sleep.called) - def test_rate_limit_exception(self, mocked_sleep): """ Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` @@ -53,7 +52,6 @@ def test_rate_limit_exception(self, mocked_sleep): rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT) self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.") - def test_rate_limit_not_exceeded(self, mocked_sleep): """ Test `rate_throttling` if sleep time does not exceed limit