Skip to content

Commit e643dd2

Browse files
authored
Handle multiple crowdtangle dashboards in same pipeline, fetch and process in parallel (#200)
* get map of dashboard name -> id before pipeline start. attached dashboard id to encapsulated post instead of doing dashboard name -> id lookup in insert_post_dashboards * fix crowdtangle db_functions import * log Dashboard Names -> IDs * write_crowdtangle_results_to_database.py from master * post merge cleanup * Handle multiple dashboards in FetchCrowdTangle PTransform, to config multiple dashboards add config var DASHBOARD_CONFIG_SECTION_NAMES which should be a comma separated list of config sections to read dashboard config data from * var reference and other lint cleanups * bump minet dep version due to second-level dep issue (quenouille) * fix minet version * minet now returns objects as casanova.namedrecord, so convert it to dict before adding annotation. * Lots of changes to handle new post datastructure format from minet v0.52.8 CrowdTangleAPIClient * fix keyword arg for ExpandedLinkRecord * add necessary PostRecord namedtuple arg * fix make_statistics_record keyword arg * fix arg reference in make_statistics_record * import itertools * add default False for account_verified * check media and links values instead of just key presence * log ProcessCrowdTanglePosts.process input args for debugging * move crowdtangle fetch out of PTransform into own DoFn * move crowdtangle fetch out of PTransform into own DoFn * see if parallelism works with FlatMap * undo accidental uncapitalization of code block * remove unneeded code * remove more unneeded code * move date parsing out of dashboard parsing loop, move max_results_to_fetch to be dashboard_specific, simplify ref to config_section with new var instead of lookup each time * more cleanup * go back to version of minet currently in use in prod, and undo changes to post datastruct processing * undo unrelated changes in write_crowdtangle_results_to_database
1 parent 7cbcb75 commit e643dd2

File tree

4 files changed

+73
-56
lines changed

4 files changed

+73
-56
lines changed

crowdtangle/EXAMPLE.cfg

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,23 @@ PORT=<database port>
1313
# END_DATE=2021-01-02
1414
# If used, API query uses Today - DAYS_IN_PAST_TO_SYNC for start_date
1515
DAYS_IN_PAST_TO_SYNC=7
16+
# Comma separated list of config section names to read dashboard configurations from.
17+
DASHBOARD_CONFIG_SECTION_NAMES=DASHBOARD_1,DASHBOARD_2
18+
19+
[DASHBOARD_1]
1620
API_TOKEN=<crowdtangle API token>
1721
# Dashboard name can be any string. This is used to track which dashboards a post comes from (potentially multiple)
18-
DASHBOARD_NAME=<dashboard name,
22+
DASHBOARD_NAME=<dashboard name>
1923
# Limit on number of results to fetch from API. If not specified no limit used.
2024
# MAX_RESULTS_TO_FETCH=10000000
2125
# Comma separated list of crowdtangle list ID(s). Leave empty to get posts from all lists (associated to the API token)
2226
# LIST_IDS=
27+
28+
[DASHBOARD_2]
29+
API_TOKEN=<crowdtangle API token>
30+
# Dashboard name can be any string. This is used to track which dashboards a post comes from (potentially multiple)
31+
DASHBOARD_NAME=<dashboard name>
32+
# Limit on number of results to fetch from API. If not specified no limit used.
33+
# MAX_RESULTS_TO_FETCH=10000000
34+
# Comma separated list of crowdtangle list ID(s). Leave empty to get posts from all lists (associated to the API token)
35+
LIST_IDS=2,3

crowdtangle/fetch_crowdtangle.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,16 @@
77
from minet.crowdtangle.exceptions import CrowdTangleError
88

99

10-
FetchCrowdTangleArgs = namedtuple('FetchCrowdTangleArgs', ['start_date',
10+
FetchCrowdTangleArgs = namedtuple('FetchCrowdTangleArgs', ['api_token',
11+
'start_date',
1112
'end_date',
1213
'list_ids',
1314
'dashboard_id',
1415
'max_results_to_fetch'])
1516

16-
1717
class FetchCrowdTangle(PTransform):
18-
def __init__(self, *args, api_token=None, crowdtangle_client=None, **kwargs):
19-
super().__init__(*args, **kwargs)
20-
if api_token and crowdtangle_client:
21-
raise ValueError('api_token and crowdtangle_client args are mutually exclusive.')
22-
self._api_token = api_token
23-
self._crowdtangle_client = crowdtangle_client
24-
25-
def get_crowdtangle_client(self):
26-
"""Returns the CrowdTangleAPIClient provided in the constructor, or creates a new client
27-
from API token stores in GCP secrets manager.
28-
29-
This is neccessary because CrowdTangleAPIClient hangs when pickled and then depickled (which
30-
Apache Beam does sometimes for PTransform objects)
31-
"""
32-
if self._crowdtangle_client:
33-
return self._crowdtangle_client
34-
35-
return CrowdTangleAPIClient(token=self._api_token)
36-
3718
def fetch(self, input_args):
19+
logging.info('in FetchCrowdTangle.fetch input_args: %s', input_args)
3820
try:
3921
start_date = input_args.start_date
4022
except KeyError as e:
@@ -59,7 +41,7 @@ def fetch(self, input_args):
5941
logging.info('Querying CrowdTangle. %s', query_info_message)
6042
num_posts = 0
6143
try:
62-
crowdtangle_client = self.get_crowdtangle_client()
44+
crowdtangle_client = CrowdTangleAPIClient(token=input_args.api_token)
6345
for post in crowdtangle_client.posts(start_date=start_date, end_date=end_date,
6446
partition_strategy=partition_strategy,
6547
sort_by=sort_by, format=format_val,
@@ -82,5 +64,5 @@ def expand(self, p):
8264
(if encountered)
8365
"""
8466
return (
85-
p | "Fetch CrowdTangle results" >> beam.FlatMap(self.fetch).with_outputs('api_results',
67+
p | "Fetch CrowdTangle results" >> beam.ParDo(self.fetch).with_outputs('api_results',
8668
'errors'))

crowdtangle/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
setuptools
22
apache-beam>=2.28.0
33
minet==0.47.0
4-
psycopg2==2.8.6
4+
quenouille==0.6.6 # required due to interface change that breaks minet
5+
psycopg2<3,>=2.8.6

crowdtangle/run_fetch_crowdtangle.py

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,64 @@
11
import argparse
2+
import configparser
23
import datetime
34
import logging
45

6+
from typing import Sequence
7+
58
import apache_beam as beam
69
from apache_beam.options.pipeline_options import PipelineOptions
710
from apache_beam.options.pipeline_options import SetupOptions
811

12+
import config_utils
913
from crowdtangle import fetch_crowdtangle
1014
from crowdtangle import process_crowdtangle_posts
1115
from crowdtangle import write_crowdtangle_results_to_database
12-
13-
import config_utils
1416
from crowdtangle import db_functions
1517

18+
19+
def get_dashboards_fetch_args(config: configparser.ConfigParser,
20+
database_connection_params: config_utils.DatabaseConnectionParams) -> Sequence[fetch_crowdtangle.FetchCrowdTangleArgs]:
21+
"""Gets list of config section names from ['CROWDTANGLE']['DASHBOARD_CONFIG_SECTION_NAMES'],
22+
parses API_TOKEN, DASHBOARD_NAME, LIST_IDS from those named config sections, and returns
23+
FetchCrowdTangleArgs for each named config section.
24+
"""
25+
dashboard_config_section_names = (
26+
config['CROWDTANGLE']['DASHBOARD_CONFIG_SECTION_NAMES'].split(','))
27+
28+
with config_utils.get_database_connection(database_connection_params) as db_connection:
29+
db_interface = db_functions.CrowdTangleDBInterface(db_connection)
30+
dashboard_name_to_id = db_interface.all_dashboards_name_to_id()
31+
logging.info('Dashboard Names -> IDs: %s', dashboard_name_to_id)
32+
33+
if 'DAYS_IN_PAST_TO_SYNC' in config['CROWDTANGLE']:
34+
start_date = (datetime.date.today() -
35+
datetime.timedelta(days=config['CROWDTANGLE'].getint('DAYS_IN_PAST_TO_SYNC'))
36+
).isoformat()
37+
end_date = None
38+
else:
39+
start_date = config['CROWDTANGLE'].get('START_DATE')
40+
end_date = config['CROWDTANGLE'].get('END_DATE', None)
41+
42+
fetch_args_list = []
43+
for config_section_name in dashboard_config_section_names:
44+
config_section = config[config_section_name]
45+
api_token = config_section.get('API_TOKEN')
46+
dashboard_name = config_section.get('DASHBOARD_NAME')
47+
max_results_to_fetch = config_section.getint('MAX_RESULTS_TO_FETCH', None)
48+
list_ids = config_section.get('LIST_IDS', None)
49+
if list_ids:
50+
list_ids = list_ids.split(',')
51+
52+
fetch_args_list.append(fetch_crowdtangle.FetchCrowdTangleArgs(
53+
api_token=api_token,
54+
list_ids=list_ids,
55+
start_date=start_date,
56+
end_date=end_date,
57+
dashboard_id=dashboard_name_to_id[dashboard_name],
58+
max_results_to_fetch=max_results_to_fetch))
59+
return fetch_args_list
60+
61+
1662
def run(argv=None, save_main_session=True):
1763
"""Main entry point; defines and runs the wordcount pipeline."""
1864
parser = argparse.ArgumentParser()
@@ -29,39 +75,14 @@ def run(argv=None, save_main_session=True):
2975
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
3076

3177
config = config_utils.get_config(known_args.config_path)
32-
max_results_to_fetch = config['CROWDTANGLE'].getint('MAX_RESULTS_TO_FETCH', None)
33-
if 'DAYS_IN_PAST_TO_SYNC' in config['CROWDTANGLE']:
34-
start_date = (datetime.date.today() -
35-
datetime.timedelta(days=config['CROWDTANGLE'].getint('DAYS_IN_PAST_TO_SYNC'))
36-
).isoformat()
37-
end_date = None
38-
else:
39-
start_date = config['CROWDTANGLE'].get('START_DATE')
40-
end_date = config['CROWDTANGLE'].get('END_DATE', None)
41-
api_token = config['CROWDTANGLE'].get('API_TOKEN')
42-
list_ids = config['CROWDTANGLE'].get('LIST_IDS', None)
43-
dashboard_name = config['CROWDTANGLE'].get('DASHBOARD_NAME')
44-
if list_ids:
45-
list_ids = list_ids.split(',')
46-
4778
database_connection_params = config_utils.get_database_connection_params_from_config(config)
48-
with config_utils.get_database_connection(database_connection_params) as db_connection:
49-
db_interface = db_functions.CrowdTangleDBInterface(db_connection)
50-
dashboard_name_to_id = db_interface.all_dashboards_name_to_id()
51-
logging.info('Dashboard Names -> IDs: %s', dashboard_name_to_id)
52-
53-
fetch_crowdtangle_args = fetch_crowdtangle.FetchCrowdTangleArgs(
54-
list_ids=list_ids,
55-
start_date=start_date,
56-
end_date=end_date,
57-
dashboard_id=dashboard_name_to_id[dashboard_name],
58-
max_results_to_fetch=max_results_to_fetch)
79+
fetch_args_list = get_dashboards_fetch_args(config, database_connection_params)
5980

60-
logging.info('About to start crowdtangle fetch pipline with args: %s', fetch_crowdtangle_args)
81+
logging.info('About to start crowdtangle fetch pipline with args: %s', fetch_args_list)
6182
with beam.Pipeline(options=pipeline_options) as pipeline:
6283
results, errors = (
63-
pipeline | beam.Create([fetch_crowdtangle_args])
64-
| 'Fetch CrowdTangle results' >> fetch_crowdtangle.FetchCrowdTangle(api_token=api_token)
84+
pipeline | beam.Create(fetch_args_list)
85+
| 'Fetch CrowdTangle results' >> fetch_crowdtangle.FetchCrowdTangle()
6586
)
6687

6788
processed_results = (

0 commit comments

Comments
 (0)