-
Notifications
You must be signed in to change notification settings - Fork 0
Adding last page limit and include config to set the max results per page #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6be3a2a
ac14f78
7c9fdae
2d361d3
8c2d281
a67f788
077f8f4
0ad5a28
7864389
d780571
3d46ebc
5b53c97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import time | ||
| import requests | ||
| from requests.models import PreparedRequest | ||
| import backoff | ||
| from simplejson import JSONDecodeError | ||
| import singer | ||
|
|
@@ -9,11 +10,14 @@ | |
| LOGGER = singer.get_logger() | ||
| DEFAULT_SLEEP_SECONDS = 600 | ||
| DEFAULT_MIN_REMAIN_RATE_LIMIT = 0 | ||
| DEFAULT_MAX_PER_PAGE = 100 | ||
| DEFAULT_DOMAIN = "https://api.github.com" | ||
|
|
||
| # Set default timeout of 300 seconds | ||
| REQUEST_TIMEOUT = 300 | ||
|
|
||
| PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' | ||
|
|
||
| class GithubException(Exception): | ||
| pass | ||
|
|
||
|
|
@@ -118,6 +122,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404): | |
| # Don't raise a NotFoundException | ||
| return None | ||
|
|
||
| if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): | ||
| message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ | ||
| f"Please refer '{response_json.get('documentation_url')}' for more details." \ | ||
| "This is a known issue when the results exceed 40k and the last page is not full" \ | ||
| " (it will trim the results to get only the available by the API)." | ||
| LOGGER.warning(message) | ||
| return None | ||
|
|
||
| message = "HTTP-error-code: {}, Error: {}".format( | ||
| error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json) | ||
|
|
||
|
|
@@ -132,7 +144,7 @@ def calculate_seconds(epoch): | |
| Calculate the seconds to sleep before making a new request. | ||
| """ | ||
| current = time.time() | ||
| return int(ceil(epoch - current)) | ||
| return max(0, int(ceil(epoch - current))) | ||
|
|
||
| def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): | ||
| """ | ||
|
|
@@ -165,6 +177,7 @@ def __init__(self, config): | |
| self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT) | ||
| self.set_auth_in_session() | ||
| self.not_accessible_repos = set() | ||
| self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE) | ||
|
|
||
| def get_request_timeout(self): | ||
| """ | ||
|
|
@@ -195,32 +208,37 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) | |
| """ | ||
| Call rest API and return the response in case of status code 200. | ||
| """ | ||
| with metrics.http_request_timer(source) as timer: | ||
| with metrics.http_request_timer(url) as timer: | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is already expecting the endpoint, it makes it easier to identify the request in logs. |
||
| self.session.headers.update(headers) | ||
| resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) | ||
| if resp.status_code != 200: | ||
| raise_for_error(resp, source, stream, self, should_skip_404) | ||
| timer.tags[metrics.Tag.http_status_code] = resp.status_code | ||
| rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit) | ||
| if resp.status_code == 404: | ||
| if resp.status_code == 404 or resp.status_code == 422: | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to check the response message again because if the error code is 422 and it's not a pagination error, the |
||
| # Return an empty response body since we're not raising a NotFoundException | ||
| resp._content = b'{}' # pylint: disable=protected-access | ||
| resp._content = b'{}' # pylint: disable=protected-access | ||
| return resp | ||
|
|
||
| def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True): | ||
| """ | ||
| Fetch all pages of records and return them. | ||
| """ | ||
| while True: | ||
| r = self.authed_get(source, url, headers, stream, should_skip_404) | ||
| yield r | ||
| next_url = self.prepare_url(url) | ||
| while next_url: | ||
| response = self.authed_get(source, next_url, headers, stream, should_skip_404) | ||
| yield response | ||
|
|
||
| # Fetch the next page if next found in the response. | ||
| if 'next' in r.links: | ||
| url = r.links['next']['url'] | ||
| else: | ||
| # Break the loop if all pages are fetched. | ||
| break | ||
| next_url = response.links.get('next', {}).get('url', None) | ||
|
|
||
| def prepare_url(self, url): | ||
| """ | ||
| Prepare the URL with some additional parameters | ||
| """ | ||
| prepared_request = PreparedRequest() | ||
| # Including max per page param | ||
| prepared_request.prepare_url(url, {'per_page': self.max_per_page}) | ||
| return prepared_request.url | ||
|
|
||
| def verify_repo_access(self, url_for_repo, repo): | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error 422 can be raised for other reasons, that is why we are comparing the message to check if it's related to pagination.