@@ -154,6 +154,17 @@ def count_files_in_folder(folder: Path) -> int:
154154 return sum (1 for _ in folder .rglob ("*" ) if _ .is_file ())
155155
156156
def copy_file(src_fp, target_folder, data_folder):
    """
    Copy ``src_fp`` into ``target_folder``, mirroring its location relative to ``data_folder``.

    The destination path is ``target_folder / src_fp.relative_to(data_folder)``.
    Missing parent directories of the destination are created. If the destination
    already exists, nothing is copied and the existing file is left untouched.

    Parameters
    ----------
    src_fp : str or path-like
        File to copy; must be located under ``data_folder``.
    target_folder : str or path-like
        Root folder that receives the copy.
    data_folder : str or path-like
        Root folder that ``src_fp`` is made relative to.

    Raises
    ------
    ValueError
        If ``src_fp`` is not located under ``data_folder`` (raised by
        ``Path.relative_to``).
    """
    # Coerce once so callers may pass plain strings as well as Path objects.
    src_fp = Path(src_fp)
    target_fp = Path(target_folder) / src_fp.relative_to(Path(data_folder))

    # Never overwrite: a file that was already placed at the destination wins.
    if target_fp.exists():
        return

    target_fp.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(src_fp, target_fp)
166+
167+
157168def create_debug_data (
158169 competition : str ,
159170 dr_cls : type [DataReducer ] = UniqueIDDataReducer ,
@@ -239,13 +250,16 @@ def create_debug_data(
239250
240251 # Process non-data files
241252 subfolder_dict = {}
253+ global_groups = defaultdict (list )
242254 for file_path in files_to_process :
243255 if file_path in processed_files :
244256 continue # Already handled above
245257 rel_dir = file_path .relative_to (data_folder ).parts [0 ]
246258 subfolder_dict .setdefault (rel_dir , []).append (file_path )
259+ global_groups [file_path .stem ].append (Path (file_path ))
247260
248261 # For each subfolder, decide which files to copy
262+ selected_groups = []
249263 for rel_dir , file_list in tqdm (subfolder_dict .items (), desc = "Processing files" , unit = "file" ):
250264 used_files = []
251265 not_used_files = []
@@ -262,11 +276,7 @@ def create_debug_data(
262276
263277 # Directly copy used files
264278 for uf in used_files :
265- sampled_file_path = sample_folder / uf .relative_to (data_folder )
266- if sampled_file_path .exists ():
267- continue
268- sampled_file_path .parent .mkdir (parents = True , exist_ok = True )
269- shutil .copy (uf , sampled_file_path )
279+ copy_file (uf , sample_folder , data_folder )
270280
271281 # If no files are used, randomly sample files to keep the folder from being empty
272282 if len (used_files ) == 0 :
@@ -275,29 +285,23 @@ def create_debug_data(
275285 else :
276286 num_to_keep = max (int (len (file_list ) * min_frac ), min_num )
277287
278- # Group files by their base name (i.e., filename without extension)
279- groups = defaultdict (list )
280- for nf in not_used_files :
281- groups [nf .stem ].append (nf ) # nf.stem gives the file name without its extension
282-
283- # Convert the dictionary to a list of groups, where each group is a list of files with the same base name
284- group_list = list (groups .values ())
285-
286288 # Use a greedy strategy to select groups so that the total number of files is as close as possible to num_to_keep
287- selected_groups = []
288289 total_files = 0
289- for group in group_list :
290- # If adding the entire group does not exceed the target, add it
291- if not selected_groups or total_files + len (group ) <= num_to_keep :
292- selected_groups .append (group )
293- total_files += len (group )
294- else :
290+ for nf in not_used_files :
291+ if total_files > num_to_keep :
295292 break
293+ if nf .stem in selected_groups :
294+ total_files += 1
295+ else :
296+ selected_groups .append (nf .stem )
297+ total_files += 1
296298
297299 print (f"Sampling { num_to_keep } files without label from { total_files } files in { rel_dir } " )
298300
299301 # Flatten the selected groups into a single list of files
300- sampled_not_used = [nf for group in selected_groups for nf in group ]
302+ sampled_not_used = [
303+ nf for group , value in global_groups .items () if group in selected_groups for nf in value
304+ ]
301305
302306 # Copy the selected files to the target directory (all files with the same base name will be copied)
303307 for nf in sampled_not_used :
@@ -311,11 +315,7 @@ def create_debug_data(
311315 # Copy extra files
312316 print (f"Copying { len (extra_files )} extra files" )
313317 for uf in extra_files :
314- sampled_file_path = sample_folder / uf .relative_to (data_folder )
315- if sampled_file_path .exists ():
316- continue
317- sampled_file_path .parent .mkdir (parents = True , exist_ok = True )
318- shutil .copy (uf , sampled_file_path )
318+ copy_file (uf , sample_folder , data_folder )
319319
320320 final_files_count = count_files_in_folder (sample_folder )
321321 print (f"[INFO] After sampling, the sample folder `{ sample_folder } ` contains { final_files_count } files in total." )
0 commit comments