Skip to content

Commit 247adea

Browse files
author
Stefan Zabka
committed
Backporting from next
1 parent cb19511 commit 247adea

File tree

1 file changed

+8
-6
lines changed

1 file changed

+8
-6
lines changed

openwpm_utils/s3.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@
1515
from openwpm_utils.dataquality import TableFilter
1616

1717

18-
class S3Dataset(object):
19-
def __init__(self, s3_directory, s3_bucket="openwpm-crawls"):
18+
class S3Dataset:
19+
def __init__(self, s3_directory: str, s3_bucket: str = "openwpm-crawls"):
2020
"""Helper class to load OpenWPM datasets from S3 using pandas
2121
2222
This dataset wrapper is safe to use by spark worker processes, as it
@@ -87,8 +87,11 @@ def collect_content(self, content_hash, beautify=False):
8787

8888
class PySparkS3Dataset(S3Dataset):
8989
def __init__(
90-
self, spark_context, s3_directory: str, s3_bucket: str = "openwpm-crawls"
91-
):
90+
self,
91+
spark_context: SparkContext,
92+
s3_directory: str,
93+
s3_bucket: str = "openwpm-crawls",
94+
) -> None:
9295
"""Helper class to load OpenWPM datasets from S3 using PySpark
9396
9497
Parameters
@@ -111,7 +114,7 @@ def __init__(
111114

112115
def read_table(
113116
self, table_name: str, columns: List[str] = None, mode: str = "successful"
114-
):
117+
) -> DataFrame:
115118
"""Read `table_name` from OpenWPM dataset into a pyspark dataframe.
116119
117120
Parameters
@@ -126,7 +129,6 @@ def read_table(
126129
if one of its commands failed or if it's in the interrupted table
127130
"""
128131
table = self._sql_context.read.parquet(self._s3_table_loc % table_name)
129-
130132
if mode == "all":
131133
table = table
132134
elif mode == "failed":

0 commit comments

Comments (0)