-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_images.py
More file actions
110 lines (87 loc) · 3.36 KB
/
preprocess_images.py
File metadata and controls
110 lines (87 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# coding: utf-8
import argparse
import hashlib
import os
import cv2
from PIL import Image
from tqdm import tqdm
def process_images(base_dir):
    """Re-encode every image found under *base_dir* using OpenCV.

    The directory tree is walked recursively and files are selected by
    extension (case-insensitive). For each selected file:

    * If OpenCV cannot decode it, the file is reported and left untouched.
    * ``.png`` files are written out as ``.jpg`` (JPEG quality 95) next to
      the original, and the PNG is removed only after the JPEG was written
      successfully. If a file with the target ``.jpg`` name already exists,
      the conversion is skipped so that existing file is not overwritten.
    * Every other file is re-written in place with OpenCV's default
      encoder settings.

    Errors are printed per file; processing continues with the next file.

    Args:
        base_dir: Root directory to scan for images.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')

    file_paths = []
    for root, _, files in os.walk(base_dir):
        file_paths.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(image_extensions)
        )

    for file_path in tqdm(file_paths, desc="Processing images"):
        try:
            img = cv2.imread(file_path)
            if img is None:
                print(f"Corrupted image detected: {file_path}")
                continue

            if file_path.lower().endswith('.png'):
                new_file_path = file_path.rsplit('.', 1)[0] + '.jpg'

                # Converting would clobber an unrelated image that happens
                # to share the same stem; keep both files instead.
                if os.path.exists(new_file_path):
                    print(f"Error processing {file_path}: "
                          f"{new_file_path} already exists")
                    continue

                written = cv2.imwrite(new_file_path, img,
                                      [int(cv2.IMWRITE_JPEG_QUALITY), 95])
                # cv2.imwrite signals failure through its return value, not
                # an exception; never delete the source unless the JPEG
                # actually landed on disk.
                if not written:
                    print(f"Error processing {file_path}: "
                          f"could not write {new_file_path}")
                    continue
                os.remove(file_path)
            elif not cv2.imwrite(file_path, img):
                print(f"Error processing {file_path}: could not write file")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
def resize_images(directory, target_size=(224, 224)):
    """Resize every PNG/JPEG under *directory* to *target_size* in place.

    The directory tree is walked recursively. Files are selected by
    extension, case-insensitively, so ``photo.JPG`` is handled the same way
    as ``photo.jpg``. Images that already have the requested size are left
    untouched; all others are resampled with the Lanczos filter and saved
    back to the same path.

    Errors are printed per file; processing continues with the next file.

    Args:
        directory: Root directory to scan for images.
        target_size: ``(width, height)`` to resize to. Any two-item
            sequence is accepted.
    """
    # PIL reports ``img.size`` as a tuple; normalise so that a list argument
    # still compares equal and does not force a needless re-encode.
    target_size = tuple(target_size)
    image_extensions = ('.png', '.jpg', '.jpeg')

    image_files = []
    for root, _, files in os.walk(directory):
        image_files.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(image_extensions)
        )

    for file_path in tqdm(image_files, desc="Resizing images"):
        try:
            with Image.open(file_path) as img:
                if img.size == target_size:
                    continue
                resized = img.resize(target_size, Image.LANCZOS)
                resized.save(file_path)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
def calculate_hash(image_path):
    """Return an MD5 hex digest of the image's 224x224 grayscale pixels.

    The image is loaded as grayscale, resized to 224x224 and the resulting
    pixel buffer is hashed, so two files produce the same digest only when
    their resized grayscale pixel data is byte-for-byte identical.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising; fail with a message that
    # names the file rather than an opaque error from cv2.resize.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    img = cv2.resize(img, (224, 224))
    return hashlib.md5(img.tobytes()).hexdigest()
def find_and_remove_duplicate_images(base_dir):
    """Delete images under *base_dir* whose hash matches an earlier image.

    JPEG and PNG files (extension matched case-insensitively) are collected
    recursively, then hashed one by one with ``calculate_hash``. The first
    file seen for a given hash is kept; any later file with the same hash is
    removed from disk. Errors are printed per file and do not stop the scan.

    Args:
        base_dir: Root directory to scan for images.

    Returns:
        A list of ``(kept_path, removed_path)`` tuples, one per deleted file.
    """
    candidates = []
    for folder, _, names in os.walk(base_dir):
        for name in names:
            if name.lower().endswith(('.jpg', '.jpeg', '.png')):
                candidates.append(os.path.join(folder, name))

    first_seen = {}  # hash -> path of the first file that produced it
    removed = []
    for file_path in tqdm(candidates, desc='Checking for duplicates'):
        try:
            digest = calculate_hash(file_path)
            if digest not in first_seen:
                first_seen[digest] = file_path
                continue
            # Same hash as an earlier file: drop this copy, then record it.
            os.remove(file_path)
            removed.append((first_seen[digest], file_path))
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    return removed
def main():
    """Parse the command line and run the preprocessing pipeline.

    Requires ``-d/--directory``. The directory is passed, in order, to
    ``process_images``, ``resize_images`` and
    ``find_and_remove_duplicate_images``; the duplicates that were removed
    (if any) are then listed on standard output.
    """
    cli = argparse.ArgumentParser(
        description='Preprocess images in a directory.')
    cli.add_argument(
        '-d',
        '--directory',
        required=True,
        help='Path to the directory containing images.',
    )
    target_dir = cli.parse_args().directory

    process_images(target_dir)
    resize_images(target_dir)
    duplicates = find_and_remove_duplicate_images(target_dir)

    if not duplicates:
        print("No duplicate images found.")
        return

    print("Duplicate images found and removed:")
    for original, duplicate in duplicates:
        print(f"Original: {original}, Duplicate: {duplicate}")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()