import gzip
import io
from typing import List, Optional

import boto3
import jsbeautifier
import pyarrow.parquet as pq
import pyspark.sql.functions as F
import s3fs
from botocore.exceptions import ClientError
from pyarrow.filesystem import S3FSWrapper  # noqa
from pyspark import SparkContext
from pyspark.sql import DataFrame, SQLContext

from openwpm_utils.crawlhistory import get_worst_status_per_visit_id
from openwpm_utils.dataquality import TableFilter
17+
18+ class S3Dataset (object ):
19+ def __init__ (self , s3_directory , s3_bucket = "openwpm-crawls" ):
1320 """Helper class to load OpenWPM datasets from S3 using pandas
1421
1522 This dataset wrapper is safe to use by spark worker processes, as it
@@ -38,30 +45,33 @@ def read_table(self, table_name, columns=None):
3845 columns : list of strings
3946 The set of columns to filter the parquet dataset by
4047 """
41- return pq .ParquetDataset (
42- self ._s3_table_loc % table_name ,
43- filesystem = self ._s3fs ,
44- metadata_nthreads = 4
45- ).read (use_pandas_metadata = True , columns = columns ).to_pandas ()
48+ return (
49+ pq .ParquetDataset (
50+ self ._s3_table_loc % table_name ,
51+ filesystem = self ._s3fs ,
52+ metadata_nthreads = 4 ,
53+ )
54+ .read (use_pandas_metadata = True , columns = columns )
55+ .to_pandas ()
56+ )
4657
4758 def collect_content (self , content_hash , beautify = False ):
4859 """Collect content by directly connecting to S3 via boto3"""
49- s3 = boto3 .client ('s3' )
60+ s3 = boto3 .client ("s3" )
5061 try :
5162 obj = s3 .get_object (
52- Bucket = self ._s3_bucket ,
53- Key = self ._content_key % content_hash
63+ Bucket = self ._s3_bucket , Key = self ._content_key % content_hash
5464 )
5565 body = obj ["Body" ]
5666 compressed_content = body .read ()
5767 body .close ()
5868 except ClientError as e :
59- if e .response [' Error' ][ ' Code' ] != ' NoSuchKey' :
69+ if e .response [" Error" ][ " Code" ] != " NoSuchKey" :
6070 raise
6171 else :
6272 return None
6373
64- with gzip .GzipFile (fileobj = compressed_content , mode = 'r' ) as f :
74+ with gzip .GzipFile (fileobj = compressed_content , mode = "r" ) as f :
6575 content = f .read ()
6676
6777 if content is None or content == "" :
@@ -74,9 +84,11 @@ def collect_content(self, content_hash, beautify=False):
7484 pass
7585 return content
7686
87+
class PySparkS3Dataset(S3Dataset):
    def __init__(
        self, spark_context, s3_directory: str, s3_bucket: str = "openwpm-crawls"
    ):
        """Helper class to load OpenWPM datasets from S3 using PySpark

        Parameters
        ----------
        spark_context
            The SparkContext of the active Spark session.
        s3_directory : string
            Directory within the S3 bucket in which the dataset is saved.
        s3_bucket : string, optional
            The bucket name on S3. Defaults to `openwpm-crawls`.
        """
        super().__init__(s3_directory, s3_bucket)
        self._spark_context = spark_context
        self._sql_context = SQLContext(spark_context)
        # The parent class stores a bare "<bucket>/<dir>/..." template;
        # Spark needs the s3a:// scheme prefix to read from S3.
        self._s3_table_loc = f"s3a://{self._s3_table_loc}"
        # Load the bookkeeping tables with mode="all" — the filter does not
        # exist yet, and only the non-"all" modes consult it.
        incomplete_visits = self.read_table("incomplete_visits", mode="all")
        crawl_history = self.read_table("crawl_history", mode="all")
        self._filter = TableFilter(incomplete_visits, crawl_history)

    def read_table(
        self,
        table_name: str,
        columns: Optional[List[str]] = None,
        mode: str = "successful",
    ):
        """Read `table_name` from OpenWPM dataset into a pyspark dataframe.

        Parameters
        ----------
        table_name : string
            OpenWPM table to read
        columns : list of strings
            The set of columns to filter the parquet dataset by
        mode : string
            One of "successful", "failed" or "all".
            Success is determined per visit_id. A visit_id is failed
            if one of its commands failed or if it's in the interrupted table.

        Raises
        ------
        AssertionError
            If `mode` is not one of the allowed values. (Kept as
            AssertionError for backward compatibility with existing callers.)
        """
        table = self._sql_context.read.parquet(self._s3_table_loc % table_name)

        # mode == "all" intentionally leaves the table unfiltered.
        if mode == "failed":
            table = self._filter.dirty_table(table)
        elif mode == "successful":
            table = self._filter.clean_table(table)
        elif mode != "all":
            # BUG FIX: the message previously used ${mode}, which renders a
            # literal "$" in an f-string, and lacked a separating space
            # between the two concatenated fragments.
            raise AssertionError(
                f"Mode was {mode}, "
                "allowed modes are 'all', 'failed' and 'successful'"
            )

        if columns is not None:
            table = table.select(columns)

        return table