diff --git a/setup.py b/setup.py index 0c350267..c4e19738 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.11', + version='2.0.12', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index 932d1b86..e182462f 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,7 +16,11 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +# How many total seconds to retry when getting rate limit error from API. The limit resets every hour. +RATE_LIMIT_RETRY_MAX_TIME = 3600 + PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' +RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' class GithubException(Exception): pass @@ -51,9 +55,15 @@ class MovedPermanentlyError(GithubException): class ConflictError(GithubException): pass +# Thrown when we receive 403 Rate Limit Exceeded from GitHub API class RateLimitExceeded(GithubException): pass +# Thrown when we're expected to sleep for longer than the max_sleep_seconds limit +class RateLimitSleepExceeded(GithubException): + pass + +# Thrown when 429 is received from GitHub API class TooManyRequests(GithubException): pass @@ -111,6 +121,13 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} + response_message = response_json.get('message', '') + + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 403, Error: {response_message}" + LOGGER.warning(message) + raise RateLimitExceeded(message) from None + if error_code == 404 and should_skip_404: # Add not accessible stream into list. 
client.not_accessible_repos.add(stream) @@ -122,8 +139,8 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None - if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): - message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ + if error_code == 422 and PAGINATION_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 422, Error: {response_message}. " \ f"Please refer '{response_json.get('documentation_url')}' for more details." \ "This is a known issue when the results exceed 40k and the last page is not full" \ " (it will trim the results to get only the available by the API)." @@ -150,13 +167,18 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ + if "Retry-After" in response.headers: + # handles the secondary rate limit + seconds_to_sleep = int(response.headers['Retry-After']) + LOGGER.info("Retry-After header found in response. Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: - seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']) + 15) if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None + raise RateLimitSleepExceeded(message) from None LOGGER.info("API rate limit exceeded. 
Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) @@ -206,6 +228,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), interval=60, jitter=None, max_time=RATE_LIMIT_RETRY_MAX_TIME) def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200.