Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 2
jobs:
build:
docker:
- image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:tap-tester-v4
- image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester
steps:
- checkout
- run:
Expand All @@ -21,21 +21,21 @@ jobs:
name: 'pylint'
command: |
source /usr/local/share/virtualenvs/tap-github/bin/activate
pylint tap_github --disable 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-class-docstring,missing-function-docstring,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,wrong-spelling-in-comment,wrong-spelling-in-docstring,bad-whitespace'
pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-locals'
- run:
name: 'Unit Tests'
command: |
source /usr/local/share/virtualenvs/tap-github/bin/activate
python -m unittest discover -s tests/unittests
- run:
when: always
name: 'Integration Tests'
command: |
aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh
source dev_env.sh
source /usr/local/share/virtualenvs/tap-tester/bin/activate
run-test --tap=tap-github \
--target=target-stitch \
--orchestrator=stitch-orchestrator \
--email=harrison+sandboxtest@stitchdata.com \
--password=$SANDBOX_PASSWORD \
--client-id=50 \
--token=$STITCH_API_TOKEN \
tests
run-test --tap=tap-github tests

workflows:
version: 2
commit:
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,8 @@ properties.json

# Jetbrains IDE
.idea

# macOS
*.DS_Store
.AppleDouble
.LSOverride
12 changes: 5 additions & 7 deletions tap_github/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import argparse
import os
import json
import collections
import time
import requests
import singer
import singer.bookmarks as bookmarks
import singer.metrics as metrics

from singer import metadata
from singer import (bookmarks, metrics, metadata)
from simplejson import JSONDecodeError

session = requests.Session()
logger = singer.get_logger()
Expand Down Expand Up @@ -168,7 +166,7 @@ def raise_for_error(resp, source):
error_code = resp.status_code
try:
response_json = resp.json()
except Exception:
except JSONDecodeError:
response_json = {}

if error_code == 404:
Expand Down Expand Up @@ -242,7 +240,7 @@ def load_schemas():
for filename in os.listdir(get_abs_path('schemas')):
path = get_abs_path('schemas') + '/' + filename
file_raw = filename.replace('.json', '')
with open(path) as file:
with open(path, encoding='utf-8') as file:
schemas[file_raw] = json.load(file)

schemas['pr_commits'] = generate_pr_commit_schema(schemas['commits'])
Expand Down Expand Up @@ -873,7 +871,7 @@ def get_all_collaborators(schema, repo_path, state, mdata, _start_date):
with singer.Transformer() as transformer:
rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata))
singer.write_record('collaborators', rec, time_extracted=extraction_time)
singer.write_bookmark(state, repo_path, 'collaborator', {'since': singer.utils.strftime(extraction_time)})
singer.write_bookmark(state, repo_path, 'collaborators', {'since': singer.utils.strftime(extraction_time)})
counter.increment()

return state
Expand Down
37 changes: 9 additions & 28 deletions tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def get_properties(self, original: bool = True):
:param original: set to false to change the start_date or end_date
"""
return_value = {
'start_date' : dt.strftime(dt.utcnow()-timedelta(days=5), self.START_DATE_FORMAT),
'repository': 'singer-io/tap-github'
'start_date' : '2021-10-01T00:00:00Z',
'repository': 'singer-io/test-repo'
}
if original:
return return_value
Expand All @@ -60,32 +60,13 @@ def get_credentials(self):
'access_token': os.getenv("TAP_GITHUB_TOKEN")
}

@staticmethod
def expected_check_streams():
return {
'assignees',
'collaborators',
'comments',
'commit_comments',
'commits',
'events',
'issue_labels',
'issue_milestones',
'issue_events',
'issues',
'pr_commits',
'project_cards',
'project_columns',
'projects',
'pull_requests',
'releases',
'review_comments',
'reviews',
'stargazers',
'team_members',
'team_memberships',
'teams'
}
def expected_check_streams(self):
"""The expected streams without any streams that are not passing due to tap issues with those streams"""
excluded_streams = {
'team_memberships'
}

return self.expected_streams() - excluded_streams

def expected_metadata(self):
"""The expected streams and metadata about the streams"""
Expand Down
75 changes: 75 additions & 0 deletions tests/test_github_all_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os

from tap_tester import runner, connections, menagerie

from base import TestGithubBase


class TestGithubAllFields(TestGithubBase):
"""Test that with all fields selected for a stream automatic and available fields are replicated"""

@staticmethod
def name():
return "tap_tester_github_all_fields"

def test_run(self):
"""
Ensure running the tap with all streams and fields selected results in the
replication of all fields.
- Verify no unexpected streams were replicated
- Verify that more than just the automatic fields are replicated for each stream.
"""

expected_streams = self.expected_streams()

# instantiate connection
conn_id = connections.ensure_connection(self)

# run check mode
found_catalogs = self.run_and_verify_check_mode(conn_id)

# table and field selection
test_catalogs_all_fields = [catalog for catalog in found_catalogs
if catalog.get('stream_name') in expected_streams]
self.perform_and_verify_table_and_field_selection(
conn_id, test_catalogs_all_fields, select_all_fields=True,
)

# grab metadata after performing table-and-field selection to set expectations
stream_to_all_catalog_fields = dict() # used for asserting all fields are replicated
for catalog in test_catalogs_all_fields:
stream_id, stream_name = catalog['stream_id'], catalog['stream_name']
catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id)
fields_from_field_level_md = [md_entry['breadcrumb'][1]
for md_entry in catalog_entry['metadata']
if md_entry['breadcrumb'] != []]
stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md)

# run initial sync
record_count_by_stream = self.run_and_verify_sync(conn_id)
synced_records = runner.get_records_from_target_output()

# Verify no unexpected streams were replicated
synced_stream_names = set(synced_records.keys())
self.assertSetEqual(expected_streams, synced_stream_names)

for stream in expected_streams:
with self.subTest(stream=stream):
# expected values
expected_automatic_keys = self.expected_primary_keys().get(stream)

# get all expected keys
expected_all_keys = stream_to_all_catalog_fields[stream]

# collect actual values
messages = synced_records.get(stream)
actual_all_keys = [set(message['data'].keys()) for message in messages['messages']
if message['action'] == 'upsert'][0]

# Verify that you get some records for each stream
self.assertGreater(record_count_by_stream.get(stream, -1), 0)

# verify all fields for a stream were replicated
self.assertGreater(len(expected_all_keys), len(expected_automatic_keys))
self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"')
self.assertSetEqual(expected_all_keys, actual_all_keys)
72 changes: 72 additions & 0 deletions tests/test_github_automatic_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Test that with no fields selected for a stream automatic fields are still replicated
"""
from tap_tester import runner, connections

from base import TestGithubBase


class TestGithubAutomaticFields(TestGithubBase):
"""Test that with no fields selected for a stream automatic fields are still replicated"""

@staticmethod
def name():
return "tap_tester_github_automatic_fields"

def test_run(self):
"""
- Verify that for each stream you can get multiple pages of data
when no fields are selected.
- Verify that only the automatic fields are sent to the target.
- Verify that all replicated records have unique primary key values.
"""
Comment thread
loeakaodas marked this conversation as resolved.

# BUG TDL-16137 `team_memberships` stream is not passing run_and_verify_sync()
expected_streams = self.expected_check_streams()

# instantiate connection
conn_id = connections.ensure_connection(self)

# run check mode
found_catalogs = self.run_and_verify_check_mode(conn_id)

# table and field selection
test_catalogs_automatic_fields = [catalog for catalog in found_catalogs
if catalog.get('stream_name') in expected_streams]

self.perform_and_verify_table_and_field_selection(
conn_id, test_catalogs_automatic_fields, select_all_fields=False,
)

# run initial sync
record_count_by_stream = self.run_and_verify_sync(conn_id)
synced_records = runner.get_records_from_target_output()

for stream in expected_streams:
with self.subTest(stream=stream):
# expected values
expected_keys = self.expected_primary_keys().get(stream)

# collect actual values
data = synced_records.get(stream, {})
record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})]
primary_keys_list = [
tuple(message.get('data').get(expected_pk) for expected_pk in expected_keys)
for message in data.get('messages')
if message.get('action') == 'upsert']
unique_primary_keys_list = set(primary_keys_list)

# Verify that you get some records for each stream
self.assertGreater(
record_count_by_stream.get(stream, -1), 0,
msg="The number of records is not over the stream max limit for the {} stream".format(stream))

# Verify that only the automatic fields are sent to the target
for actual_keys in record_messages_keys:
self.assertSetEqual(expected_keys, actual_keys)

# Verify that all replicated records have unique primary key values.
self.assertEqual(
len(primary_keys_list),
len(unique_primary_keys_list),
msg="Replicated record does not have unique primary key values.")
Loading