singer-io · loeakaodas · Oct 15, 2021 · Oct 15, 2021 · Oct 15, 2021 · Oct 15, 2021
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,7 +2,7 @@ version: 2
 jobs:
   build:
     docker:
-      - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:tap-tester-v4
+      - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester
     steps:
       - checkout
       - run:
@@ -21,21 +21,21 @@ jobs:
           name: 'pylint'
           command: |
             source /usr/local/share/virtualenvs/tap-github/bin/activate
-            pylint tap_github --disable 'broad-except,chained-comparison,empty-docstring,fixme,invalid-name,line-too-long,missing-class-docstring,missing-function-docstring,missing-module-docstring,no-else-raise,no-else-return,too-few-public-methods,too-many-arguments,too-many-branches,too-many-lines,too-many-locals,ungrouped-imports,wrong-spelling-in-comment,wrong-spelling-in-docstring,bad-whitespace'
+            pylint tap_github --disable 'missing-module-docstring,missing-function-docstring,missing-class-docstring,line-too-long,invalid-name,too-many-lines,consider-using-f-string,too-many-arguments,too-many-locals'
       - run:
+          name: 'Unit Tests'
+          command: |
+            source /usr/local/share/virtualenvs/tap-github/bin/activate
+            python -m unittest discover -s tests/unittests
+      - run:
+          when: always
           name: 'Integration Tests'
           command: |
             aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh
             source dev_env.sh
             source /usr/local/share/virtualenvs/tap-tester/bin/activate
-            run-test --tap=tap-github \
-                     --target=target-stitch \
-                     --orchestrator=stitch-orchestrator \
-                     --email=harrison+sandboxtest@stitchdata.com \
-                     --password=$SANDBOX_PASSWORD \
-                     --client-id=50 \
-                     --token=$STITCH_API_TOKEN \
-                     tests
+            run-test --tap=tap-github tests
+
 workflows:
   version: 2
   commit:

diff --git a/.gitignore b/.gitignore
@@ -97,3 +97,8 @@ properties.json
 
 # Jetbrains IDE
 .idea
+
+# macOS
+*.DS_Store
+.AppleDouble
+.LSOverride
diff --git a/tap_github/__init__.py b/tap_github/__init__.py
@@ -1,14 +1,12 @@
-import argparse
 import os
 import json
 import collections
 import time
 import requests
 import singer
-import singer.bookmarks as bookmarks
-import singer.metrics as metrics
 
-from singer import metadata
+from singer import (bookmarks, metrics, metadata)
+from simplejson import JSONDecodeError
 
 session = requests.Session()
 logger = singer.get_logger()
@@ -168,7 +166,7 @@ def raise_for_error(resp, source):
     error_code = resp.status_code
     try:
         response_json = resp.json()
-    except Exception:
+    except JSONDecodeError:
         response_json = {}
 
     if error_code == 404:
@@ -242,7 +240,7 @@ def load_schemas():
     for filename in os.listdir(get_abs_path('schemas')):
         path = get_abs_path('schemas') + '/' + filename
         file_raw = filename.replace('.json', '')
-        with open(path) as file:
+        with open(path, encoding='utf-8') as file:
             schemas[file_raw] = json.load(file)
 
     schemas['pr_commits'] = generate_pr_commit_schema(schemas['commits'])
@@ -873,7 +871,7 @@ def get_all_collaborators(schema, repo_path, state, mdata, _start_date):
                 with singer.Transformer() as transformer:
                     rec = transformer.transform(collaborator, schema, metadata=metadata.to_map(mdata))
                 singer.write_record('collaborators', rec, time_extracted=extraction_time)
-                singer.write_bookmark(state, repo_path, 'collaborator', {'since': singer.utils.strftime(extraction_time)})
+                singer.write_bookmark(state, repo_path, 'collaborators', {'since': singer.utils.strftime(extraction_time)})
                 counter.increment()
 
     return state

diff --git a/tests/base.py b/tests/base.py
@@ -45,8 +45,8 @@ def get_properties(self, original: bool = True):
         :param original: set to false to change the start_date or end_date
         """
         return_value = {
-            'start_date' : dt.strftime(dt.utcnow()-timedelta(days=5), self.START_DATE_FORMAT),
-            'repository': 'singer-io/tap-github'
+            'start_date' : '2021-10-01T00:00:00Z',
+            'repository': 'singer-io/test-repo'
         }
         if original:
             return return_value
@@ -60,32 +60,13 @@ def get_credentials(self):
             'access_token': os.getenv("TAP_GITHUB_TOKEN")
         }
 
-    @staticmethod
-    def expected_check_streams():
-        return {
-            'assignees',
-            'collaborators',
-            'comments',
-            'commit_comments',
-            'commits',
-            'events',
-            'issue_labels',
-            'issue_milestones',
-            'issue_events',
-            'issues',
-            'pr_commits',
-            'project_cards',
-            'project_columns',
-            'projects',
-            'pull_requests',
-            'releases',
-            'review_comments',
-            'reviews',
-            'stargazers',
-            'team_members',
-            'team_memberships',
-            'teams'
-        }
+    def expected_check_streams(self):
+        """The expected streams without any streams that are not passing due to tap issues with those streams"""
+        excluded_streams = {
+            'team_memberships'
+            }
+
+        return self.expected_streams() - excluded_streams
 
     def expected_metadata(self):
         """The expected streams and metadata about the streams"""

diff --git a/tests/test_github_all_fields.py b/tests/test_github_all_fields.py
@@ -0,0 +1,75 @@
+import os
+
+from tap_tester import runner, connections, menagerie
+
+from base import TestGithubBase
+
+
+class TestGithubAllFields(TestGithubBase):
+    """Test that with all fields selected for a stream automatic and available fields are  replicated"""
+
+    @staticmethod
+    def name():
+        return "tap_tester_github_all_fields"
+
+    def test_run(self):
+        """
+        Ensure running the tap with all streams and fields selected results in the
+        replication of all fields.
+        - Verify no unexpected streams were replicated
+        - Verify that more than just the automatic fields are replicated for each stream.
+        """
+
+        expected_streams = self.expected_streams()
+
+        # instantiate connection
+        conn_id = connections.ensure_connection(self)
+
+        # run check mode
+        found_catalogs = self.run_and_verify_check_mode(conn_id)
+
+        # table and field selection
+        test_catalogs_all_fields = [catalog for catalog in found_catalogs
+                                    if catalog.get('stream_name') in expected_streams]
+        self.perform_and_verify_table_and_field_selection(
+            conn_id, test_catalogs_all_fields, select_all_fields=True,
+        )
+
+        # grab metadata after performing table-and-field selection to set expectations
+        stream_to_all_catalog_fields = dict() # used for asserting all fields are replicated
+        for catalog in test_catalogs_all_fields:
+            stream_id, stream_name = catalog['stream_id'], catalog['stream_name']
+            catalog_entry = menagerie.get_annotated_schema(conn_id, stream_id)
+            fields_from_field_level_md = [md_entry['breadcrumb'][1]
+                                          for md_entry in catalog_entry['metadata']
+                                          if md_entry['breadcrumb'] != []]
+            stream_to_all_catalog_fields[stream_name] = set(fields_from_field_level_md)
+
+        # run initial sync
+        record_count_by_stream = self.run_and_verify_sync(conn_id)
+        synced_records = runner.get_records_from_target_output()
+
+        # Verify no unexpected streams were replicated
+        synced_stream_names = set(synced_records.keys())
+        self.assertSetEqual(expected_streams, synced_stream_names)
+
+        for stream in expected_streams:
+            with self.subTest(stream=stream):
+                # expected values
+                expected_automatic_keys = self.expected_primary_keys().get(stream)
+
+                # get all expected keys
+                expected_all_keys = stream_to_all_catalog_fields[stream]
+
+                # collect actual values
+                messages = synced_records.get(stream)
+                actual_all_keys = [set(message['data'].keys()) for message in messages['messages']
+                                   if message['action'] == 'upsert'][0]
+
+                # Verify that you get some records for each stream
+                self.assertGreater(record_count_by_stream.get(stream, -1), 0)
+
+                # verify all fields for a stream were replicated
+                self.assertGreater(len(expected_all_keys), len(expected_automatic_keys))
+                self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"')
+                self.assertSetEqual(expected_all_keys, actual_all_keys)
diff --git a/tests/test_github_automatic_fields.py b/tests/test_github_automatic_fields.py
@@ -0,0 +1,72 @@
+"""
+Test that with no fields selected for a stream automatic fields are still replicated
+"""
+from tap_tester import runner, connections
+
+from base import TestGithubBase
+
+
+class TestGithubAutomaticFields(TestGithubBase):
+    """Test that with no fields selected for a stream automatic fields are still replicated"""
+
+    @staticmethod
+    def name():
+        return "tap_tester_github_automatic_fields"
+
+    def test_run(self):
+        """
+        - Verify that for each stream you can get multiple pages of data
+            when no fields are selected.
+        - Verify that only the automatic fields are sent to the target.
+        - Verify that all replicated records have unique primary key values.
+        """
+
+        # BUG TDL-16137 `team_memberships` stream is not passing run_and_verify_sync()
+        expected_streams = self.expected_check_streams()
+
+        # instantiate connection
+        conn_id = connections.ensure_connection(self)
+
+        # run check mode
+        found_catalogs = self.run_and_verify_check_mode(conn_id)
+
+        # table and field selection
+        test_catalogs_automatic_fields = [catalog for catalog in found_catalogs
+                                          if catalog.get('stream_name') in expected_streams]
+
+        self.perform_and_verify_table_and_field_selection(
+            conn_id, test_catalogs_automatic_fields, select_all_fields=False,
+        )
+
+        # run initial sync
+        record_count_by_stream = self.run_and_verify_sync(conn_id)
+        synced_records = runner.get_records_from_target_output()
+
+        for stream in expected_streams:
+            with self.subTest(stream=stream):
+                # expected values
+                expected_keys = self.expected_primary_keys().get(stream)
+
+                # collect actual values
+                data = synced_records.get(stream, {})
+                record_messages_keys = [set(row.get('data').keys()) for row in data.get('messages', {})]
+                primary_keys_list = [
+                    tuple(message.get('data').get(expected_pk) for expected_pk in expected_keys)
+                    for message in data.get('messages')
+                    if message.get('action') == 'upsert']
+                unique_primary_keys_list = set(primary_keys_list)
+
+                # Verify that you get some records for each stream
+                self.assertGreater(
+                    record_count_by_stream.get(stream, -1), 0,
+                    msg="The number of records is not over the stream max limit for the {} stream".format(stream))
+
+                # Verify that only the automatic fields are sent to the target
+                for actual_keys in record_messages_keys:
+                    self.assertSetEqual(expected_keys, actual_keys)
+
+                # Verify that all replicated records have unique primary key values.
+                self.assertEqual(
+                    len(primary_keys_list),
+                    len(unique_primary_keys_list),
+                    msg="Replicated record does not have unique primary key values.")