Skip to content
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ This tap:
"base_url": "https://api.github.com"
}
```

> Note: The max results per page is configurable with the parameter `max_per_page`,
> as default it will return 100 (that is the max of most of the endpoints)

4. Run the tap in discovery mode to get properties.json file

```bash
Expand Down
44 changes: 31 additions & 13 deletions tap_github/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import time
import requests
from requests.models import PreparedRequest
import backoff
from simplejson import JSONDecodeError
import singer
Expand All @@ -9,11 +10,14 @@
LOGGER = singer.get_logger()
DEFAULT_SLEEP_SECONDS = 600
DEFAULT_MIN_REMAIN_RATE_LIMIT = 0
DEFAULT_MAX_PER_PAGE = 100
DEFAULT_DOMAIN = "https://api.github.com"

# Set default timeout of 300 seconds
REQUEST_TIMEOUT = 300

PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.'

class GithubException(Exception):
pass

Expand Down Expand Up @@ -118,6 +122,14 @@ def raise_for_error(resp, source, stream, client, should_skip_404):
# Don't raise a NotFoundException
return None

if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''):
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error 422 can be raised for other reasons, that is why we are comparing the message to check if it's related to pagination.

message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \
f"Please refer '{response_json.get('documentation_url')}' for more details." \
"This is a known issue when the results exceed 40k and the last page is not full" \
" (it will trim the results to get only the available by the API)."
LOGGER.warning(message)
return None

message = "HTTP-error-code: {}, Error: {}".format(
error_code, ERROR_CODE_EXCEPTION_MAPPING.get(error_code, {}).get("message", "Unknown Error") if response_json == {} else response_json)

Expand All @@ -132,7 +144,7 @@ def calculate_seconds(epoch):
Calculate the seconds to sleep before making a new request.
"""
current = time.time()
return int(ceil(epoch - current))
return max(0, int(ceil(epoch - current)))

def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit):
"""
Expand Down Expand Up @@ -165,6 +177,7 @@ def __init__(self, config):
self.min_remain_rate_limit = self.config.get('min_remain_rate_limit', DEFAULT_MIN_REMAIN_RATE_LIMIT)
self.set_auth_in_session()
self.not_accessible_repos = set()
self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE)

def get_request_timeout(self):
"""
Expand Down Expand Up @@ -195,32 +208,37 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True)
"""
Call rest API and return the response in case of status code 200.
"""
with metrics.http_request_timer(source) as timer:
with metrics.http_request_timer(url) as timer:
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already expecting the endpoint, it makes it easier to identify the request in logs.

self.session.headers.update(headers)
resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout())
if resp.status_code != 200:
raise_for_error(resp, source, stream, self, should_skip_404)
timer.tags[metrics.Tag.http_status_code] = resp.status_code
rate_throttling(resp, self.max_sleep_seconds, self.min_remain_rate_limit)
if resp.status_code == 404:
if resp.status_code == 404 or resp.status_code == 422:
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to check the response message again because if the error code is 422 and it's not a pagination error, the raise_for_error method will raise an exception.

# Return an empty response body since we're not raising a NotFoundException
resp._content = b'{}' # pylint: disable=protected-access
resp._content = b'{}' # pylint: disable=protected-access
return resp

def authed_get_all_pages(self, source, url, headers={}, stream="", should_skip_404 = True):
"""
Fetch all pages of records and return them.
"""
while True:
r = self.authed_get(source, url, headers, stream, should_skip_404)
yield r
next_url = self.prepare_url(url)
while next_url:
response = self.authed_get(source, next_url, headers, stream, should_skip_404)
yield response

# Fetch the next page if next found in the response.
if 'next' in r.links:
url = r.links['next']['url']
else:
# Break the loop if all pages are fetched.
break
next_url = response.links.get('next', {}).get('url', None)

def prepare_url(self, url):
"""
Prepare the URL with some additional parameters
"""
prepared_request = PreparedRequest()
# Including max per page param
prepared_request.prepare_url(url, {'per_page': self.max_per_page})
return prepared_request.url

def verify_repo_access(self, url_for_repo, repo):
"""
Expand Down
26 changes: 21 additions & 5 deletions tests/unittests/test_custom_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_config_without_domain(self, mock_verify_access):
"""
Test if the domain is not given in the config
"""
mock_config = {'repository': 'singer-io/test-repo', "access_token": ""}
test_client = GithubClient(mock_config)
config = {'repository': 'singer-io/test-repo', "access_token": ""}
test_client = GithubClient(config)

# Verify domain in client is default
self.assertEqual(test_client.base_url, DEFAULT_DOMAIN)
Expand All @@ -22,8 +22,24 @@ def test_config_with_domain(self, mock_verify_access):
"""
Test if the domain is given in the config
"""
mock_config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""}
test_client = GithubClient(mock_config)
config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""}
test_client = GithubClient(config)

# Verify domain in client is from config
self.assertEqual(test_client.base_url, mock_config["base_url"])
self.assertEqual(test_client.base_url, config["base_url"])

def test_prepare_url(self, mock_verify_access):
"""
Test if the correct params are added to url
"""
config = {'repository': 'singer-io/test-repo', "base_url": "http://CUSTOM-git.com", "access_token": ""}
test_client = GithubClient(config)

# Verify if per_page param was added with default value
self.assertEqual(test_client.prepare_url(test_client.base_url), "http://custom-git.com/?per_page=100")
self.assertEqual(test_client.prepare_url('http://CUSTOM-git.com/?q=query'), 'http://custom-git.com/?q=query&per_page=100')

# Verify if per_page param was added as expected
config["max_per_page"] = 35
test_client2 = GithubClient(config)
self.assertEqual(test_client2.prepare_url(test_client2.base_url), "http://custom-git.com/?per_page=35")
15 changes: 8 additions & 7 deletions tests/unittests/test_get_all_repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class TestAuthedGetAllPages(unittest.TestCase):
"""
Test `authed_get_all_pages` method from client.
"""
config = {"access_token": "", "repository": "test-org/repo1"}
config = {"access_token": "", "repository": "test-org/repo1", "max_per_page": 100}

def test_for_one_page(self, mock_auth_get, mock_verify_access):

Expand All @@ -104,7 +104,7 @@ def test_for_one_page(self, mock_auth_get, mock_verify_access):
test_client = GithubClient(self.config)
mock_auth_get.return_value = MockResponse({})

list(test_client.authed_get_all_pages("", "mock_url", {}))
list(test_client.authed_get_all_pages("", "http://mock_url", {}))

# Verify `auth_get` call count
self.assertEqual(mock_auth_get.call_count, 1)
Expand All @@ -114,14 +114,15 @@ def test_for_multiple_pages(self, mock_auth_get, mock_verify_access):
"""Verify `authed_get` is called equal number times as pages available."""

test_client = GithubClient(self.config)
mock_auth_get.side_effect = [MockResponse({"next": {"url": "mock_url_2"}}),MockResponse({"next": {"url": "mock_url_3"}}),MockResponse({})]
mock_auth_get.side_effect = [MockResponse({"next": {"url": "http://mock_url_2/?per_page=100"}}),
MockResponse({"next": {"url": "http://mock_url_3/?per_page=100"}}),MockResponse({})]

list(test_client.authed_get_all_pages("", "mock_url_1", {}))
list(test_client.authed_get_all_pages("", "http://mock_url_1", {}))

# Verify `auth_get` call count
self.assertEqual(mock_auth_get.call_count, 3)

# Verify `auth_get` calls with expected url
self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "mock_url_1", {}, '', True))
self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "mock_url_2", {}, '', True))
self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "mock_url_3", {}, '', True))
self.assertEqual(mock_auth_get.mock_calls[0], mock.call("", "http://mock_url_1/?per_page=100", {}, '', True))
self.assertEqual(mock_auth_get.mock_calls[1], mock.call("", "http://mock_url_2/?per_page=100", {}, '', True))
self.assertEqual(mock_auth_get.mock_calls[2], mock.call("", "http://mock_url_3/?per_page=100", {}, '', True))
2 changes: 0 additions & 2 deletions tests/unittests/test_rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def test_rate_limt_wait(self, mocked_sleep):
mocked_sleep.assert_called_with(121)
self.assertTrue(mocked_sleep.called)


def test_rate_limit_exception(self, mocked_sleep):
"""
Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS`
Expand All @@ -53,7 +52,6 @@ def test_rate_limit_exception(self, mocked_sleep):
rate_throttling(resp, DEFAULT_SLEEP_SECONDS, DEFAULT_MIN_REMAIN_RATE_LIMIT)
self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 602 seconds.")


def test_rate_limit_not_exceeded(self, mocked_sleep):
"""
Test `rate_throttling` if sleep time does not exceed limit
Expand Down