-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdownload_images.py
More file actions
91 lines (74 loc) · 2.84 KB
/
download_images.py
File metadata and controls
91 lines (74 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# coding: utf-8
import argparse
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
import requests
from loguru import logger
def download_image(url, filename, _id, timeout=60):
    """Download a single image to disk.

    Args:
        url: HTTP(S) URL of the image.
        filename: Destination path for the raw response bytes.
        _id: Identifier reported back to the caller on failure.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, a stalled server would hang the worker
            thread indefinitely (requests never times out by default).

    Returns:
        None on success, ``_id`` on any failure (non-200 status or
        connection/timeout error), so callers can record the failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
            return None  # Indicating success
        else:
            logger.error(f'Download failed ({response.status_code}): {url}')
    except requests.RequestException as e:
        # Covers connection errors, timeouts, and other transport failures.
        logger.error(f'Connection error for {url}: {e}')
    return _id  # Return the ID on failure
def process_chunk(urls, download_folder):
    """Download a chunk of URLs concurrently and persist failures.

    Args:
        urls: Iterable of ``(_id, url)`` pairs. URLs are rewritten from
            the 'medium' to the 'original' size variant; only .jpg/.jpeg/.png
            suffixes are downloaded, everything else is skipped silently.
        download_folder: Directory where images are saved as ``{_id}{suffix}``.

    Side effects:
        Appends the IDs of failed downloads to ``failed_downloads.json``
        in the current working directory, rewriting the file each call.
    """
    try:
        with open('failed_downloads.json', 'r') as f:
            failed_downloads = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing file on first run, or a truncated/corrupt file from an
        # interrupted previous run: start the failure list fresh either way.
        failed_downloads = []
    with ThreadPoolExecutor() as executor:
        futures = {}
        for _id, url in urls:
            url = url.replace('medium', 'original')
            suffix = Path(url).suffix.lower()
            if suffix not in ['.jpg', '.jpeg', '.png']:
                continue
            filename = os.path.join(download_folder, f'{_id}{suffix}')
            futures[executor.submit(download_image, url, filename, _id)] = _id
        for future in as_completed(futures):
            # download_image returns None on success, the ID on failure.
            result = future.result()
            if result is not None:
                failed_downloads.append(result)
    with open('failed_downloads.json', 'w') as f:
        json.dump(failed_downloads, f)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-d",
"--data",
type=str,
required=True,
help=
"Path to the data parquet file (e.g., data_filtered_by_license.parquet)"
)
parser.add_argument("-f",
"--download_folder",
type=str,
default="images",
help="Folder to save downloaded images")
args = parser.parse_args()
logger.add('download_images.log')
df = pd.read_parquet(args.data)
df = df.dropna(subset=['image_url'])
obsv_urls = list(zip(df['photo_id'], df['image_url']))
download_folder = args.download_folder
Path(download_folder).mkdir(exist_ok=True)
chunk_size = 100
total_processed = 0
for i in range(0, len(obsv_urls), chunk_size):
chunk = obsv_urls[i:i + chunk_size]
process_chunk(chunk, download_folder)
total_processed += len(chunk)
logger.info(f"Processed {total_processed} items so far")