-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoverlap_analysis.py
More file actions
69 lines (57 loc) · 2.35 KB
/
overlap_analysis.py
File metadata and controls
69 lines (57 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import json
import pandas as pd
# Load dataset splits and mapping
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes rather than being left open until garbage
    collection.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


cholec80 = _load_json("./resources/Cholec80_splits.json")
cholecT50 = _load_json("./resources/CholecT50_splits.json")
endoscapes = _load_json("./resources/Endoscapes_splits.json")
mapping = _load_json("./resources/mapping_to_endoscapes.json")

# Inverse of ``mapping`` (value -> key), used to translate IDs back when
# reporting overlaps.
# NOTE(review): assumes the values of ``mapping`` are unique; duplicated
# values would silently collapse onto the last key seen - confirm.
mapping_to_public = {v: k for k, v in mapping.items()}

# ``orig_vid_id`` -> ``public_vid_id`` lookup read from the CSV.
endoscapes_vid_id_map = (
    pd.read_csv("./resources/endoscapes_vid_id_map.csv")
    .set_index("orig_vid_id")["public_vid_id"]
    .to_dict()
)
def convert_to_endoscapes_ids(dataset_splits, id_map=None):
    """Convert the public IDs of every split to Endoscapes IDs.

    Args:
        dataset_splits: Dict of split name -> iterable of public video IDs.
            Each ID is passed through ``str()`` before the lookup, so
            integer and string IDs are treated alike.
        id_map: Optional dict whose keys are stringified public IDs and
            whose values are the corresponding Endoscapes IDs. When omitted,
            the module-level ``mapping`` is used.

    Returns:
        Dict of split name -> set of Endoscapes IDs; duplicates within a
        split collapse into a single entry.

    Raises:
        KeyError: If an ID has no entry in the lookup table.
    """
    if id_map is None:
        # Resolved at call time so the default always follows the module-level table.
        id_map = mapping
    return {
        split: {id_map[str(video_id)] for video_id in video_ids}
        for split, video_ids in dataset_splits.items()
    }
# Put every dataset in one ID space so splits can be intersected directly:
# Cholec80 and CholecT50 go through convert_to_endoscapes_ids, while the
# Endoscapes splits are used as loaded and only turned into sets.
cholec80_private = convert_to_endoscapes_ids(cholec80)
cholecT50_private = convert_to_endoscapes_ids(cholecT50)
endoscapes_private = dict(
    (split_name, set(video_ids)) for split_name, video_ids in endoscapes.items()
)

# Dataset name -> {split name -> set of IDs}.
datasets = dict(
    Cholec80=cholec80_private,
    CholecT50=cholecT50_private,
    Endoscapes=endoscapes_private,
)

# Ordered (dataset_a, dataset_b) comparisons to report on.
pairs = [
    ("Cholec80", "CholecT50"),
    ("Endoscapes", "Cholec80"),
    ("Endoscapes", "CholecT50"),
]
# For each configured dataset pair, report which videos are shared between
# every combination of their train/val/test splits.
split_names = ("train", "val", "test")
for dataset_a, dataset_b in pairs:
    banner = "=" * 50
    print(f"\n{banner}")
    print(f"Analyzing {dataset_a} vs {dataset_b}")
    print(f"{banner}\n")

    splits_a = datasets[dataset_a]
    splits_b = datasets[dataset_b]
    for split_a in split_names:
        for split_b in split_names:
            shared = splits_a[split_a] & splits_b[split_b]
            pair_label = f"{dataset_a} {split_a} and {dataset_b} {split_b}"

            if not shared:
                print(f"{pair_label}: No overlap\n")
                continue

            # Translate the shared IDs back through mapping_to_public before
            # printing anything, so a failed lookup aborts without partial
            # output for this split pair.
            public_ids = [mapping_to_public[vid] for vid in shared]
            print(f"{pair_label}:")
            print(f"Overlap count: {len(shared)}")
            print(f"Video IDs in {dataset_b}: {sorted(map(int, public_ids))}\n")

            if dataset_a == "Endoscapes":
                # Endoscapes IDs are additionally reported through the
                # CSV-derived endoscapes_vid_id_map lookup.
                endoscapes_ids = [endoscapes_vid_id_map[vid] for vid in shared]
                print(
                    f"Video IDs in {dataset_a}: {sorted(map(int, endoscapes_ids))}\n"
                )
        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the separator is assumed to close each split_a group -
        # confirm against the intended output.
        print(f"{'-'*100}\n")