1515from openwpm_utils .dataquality import TableFilter
1616
1717
18- class S3Dataset ( object ) :
19- def __init__ (self , s3_directory , s3_bucket = "openwpm-crawls" ):
18+ class S3Dataset :
19+ def __init__ (self , s3_directory : str , s3_bucket : str = "openwpm-crawls" ):
2020 """Helper class to load OpenWPM datasets from S3 using pandas
2121
2222 This dataset wrapper is safe to use by spark worker processes, as it
@@ -87,8 +87,11 @@ def collect_content(self, content_hash, beautify=False):
8787
8888class PySparkS3Dataset (S3Dataset ):
8989 def __init__ (
90- self , spark_context , s3_directory : str , s3_bucket : str = "openwpm-crawls"
91- ):
90+ self ,
91+ spark_context : SparkContext ,
92+ s3_directory : str ,
93+ s3_bucket : str = "openwpm-crawls" ,
94+ ) -> None :
9295 """Helper class to load OpenWPM datasets from S3 using PySpark
9396
9497 Parameters
@@ -111,7 +114,7 @@ def __init__(
111114
112115 def read_table (
113116 self , table_name : str , columns : List [str ] = None , mode : str = "successful"
114- ):
117+ ) -> DataFrame :
115118 """Read `table_name` from OpenWPM dataset into a pyspark dataframe.
116119
117120 Parameters
@@ -126,7 +129,6 @@ def read_table(
126129 if one of its commands failed or if it's in the interrupted table
127130 """
128131 table = self ._sql_context .read .parquet (self ._s3_table_loc % table_name )
129-
130132 if mode == "all" :
131133 table = table
132134 elif mode == "failed" :
0 commit comments