# check_platform_urls.py
# End-to-end example: extract outgoing links from Common Crawl WAT files and
# probe them with yt-dlp to build training data for a platform-URL classifier.
import multiprocessing.pool as pool
import random
import warnings
from urllib.parse import urljoin

import fsspec
import numpy as np
import pandas as pd
import simdjson
from fastwarc import ArchiveIterator, WarcRecordType
from tqdm import tqdm
from yt_dlp import YoutubeDL

warnings.filterwarnings("ignore")
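

# yt-dlp is noisy by default; this no-op logger silences extraction errors. It is
# passed as the class itself, since yt-dlp only calls .error/.warning/.debug on it.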
class loggerOutputs:
    def error(msg):
        pass

    def warning(msg):
        pass

    def debug(msg):
        pass
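

# Labelling heuristic: if yt-dlp can extract a media format URL without
# downloading, the page is treated as a platform/video URL (positive); any
# failure or missing format counts as negative.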
def check_url(v_url):
    yt_args = {
        "quiet": True,
        "no_warnings": True,
        "ignoreerrors": True,
        "logger": loggerOutputs,
        "socket_timeout": 1,
    }
    try:
        with YoutubeDL(yt_args) as ydl:
            info_dict = ydl.extract_info(v_url, download=False, process=False)
            return v_url, info_dict["formats"][0]["url"]  # if it yields a format link, it's a video
    except Exception:
        return v_url, None
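

# check() processes one batch of URLs sequentially (with a tqdm progress bar);
# the thread pool below runs one batch per worker, and flatten() merges the
# per-batch results back into a single list.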
def flatten(l):
    return [item for sublist in l for item in sublist]


def check(urls):
    extracted_urls = []
    for url in tqdm(urls):
        extracted_urls.append(check_url(url))
    return extracted_urls
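

# WAT "Links" entries are often relative (e.g. href="/watch?v=..."), so they are
# resolved against the page URL, or against <base href> when the page sets one.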
def make_link_absolute(url, base_url):
    if url.startswith("http://") or url.startswith("https://"):
        return url
    try:
        return urljoin(base_url, url)
    except ValueError:
        return url


def make_links_absolute(links, base_url):
    return [make_link_absolute(link["url"], base_url) for link in links if link.get("url")]
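

# Main pipeline: sample a few WAT files from one CC-MAIN-2022-40 segment, take
# num_links_per_wat outgoing links from each, deduplicate, probe the candidates
# with yt-dlp on a thread pool, and write the labelled URLs to parquet.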
if __name__ == "__main__":
    num_wats = 10
    num_links_per_wat = 10_000
    # Pick num_wats random WAT shards out of the first 100 in the segment.
    wat_ids = random.sample(list(range(100)), num_wats)
    all_links = []
    for i in wat_ids:
        wat_links = []
        url = f"https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-40/segments/1664030331677.90/wat/CC-MAIN-20220924151538-20220924181538-{i:05d}.warc.wat.gz"
        with fsspec.open(url, mode="rb", compression="gzip") as f:
            for record in ArchiveIterator(f, record_types=WarcRecordType.metadata, parse_http=False):
                try:
                    record_data = simdjson.load(record.reader)  # type: ignore
                except:  # pylint: disable=bare-except
                    continue
                envelope = record_data["Envelope"]
                payload = envelope["Payload-Metadata"]
                if "HTTP-Response-Metadata" not in payload:
                    continue
                http_resp = payload["HTTP-Response-Metadata"]
                if "HTML-Metadata" not in http_resp:
                    continue
                metadata = http_resp["HTML-Metadata"]
                if "Links" not in metadata:
                    continue
                links = metadata["Links"]
                base_url = envelope["WARC-Header-Metadata"]["WARC-Target-URI"]
                if "Head" in metadata and "Base" in metadata["Head"]:
                    try:
                        base_url = urljoin(base_url, metadata["Head"]["Base"])
                    except ValueError:
                        pass
                links = make_links_absolute(links, base_url)
                wat_links.extend(links)
        all_links.extend(random.sample(wat_links, num_links_per_wat))
    all_links_df = pd.DataFrame(all_links, columns=["url"]).drop_duplicates("url")
    all_links_df.to_parquet("all_links.parquet")
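
    # Probe the deduplicated candidate URLs with yt-dlp; the work is split into
    # n_proc batches and run on a thread pool, since the task is network-bound.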
    n_proc = 64
    urls = np.array_split(all_links_df["url"].values, n_proc)
    with pool.ThreadPool(n_proc) as p:
        processed_urls = flatten(p.map(check, urls))
    df = pd.DataFrame(processed_urls, columns=["url", "extracted_url"])
    df["label"] = df["extracted_url"].apply(lambda x: "positive" if x else "negative")
    print(df.label.value_counts())  # quick look at the class balance
    df.to_parquet("checked_urls.parquet", engine="fastparquet")
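
    # Follow-up sketch (not in the original script): the labelled output is meant
    # to feed a platform-URL classifier, e.g.
    #   checked = pd.read_parquet("checked_urls.parquet")
    #   urls, labels = checked["url"], checked["label"]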