Skip to content

Commit 8204983

Browse files
authored
chore: print txt info (microsoft#600)
* print head for txt files * update sample
1 parent 75eaecf commit 8204983

File tree

2 files changed

+27
-27
lines changed

2 files changed

+27
-27
lines changed

rdagent/scenarios/data_science/debug/data.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,17 @@ def count_files_in_folder(folder: Path) -> int:
154154
return sum(1 for _ in folder.rglob("*") if _.is_file())
155155

156156

157+
def copy_file(src_fp, target_folder, data_folder) -> None:
    """
    Copy a file into ``target_folder``, mirroring its location relative to ``data_folder``.

    The destination path is ``target_folder / src_fp.relative_to(data_folder)``.
    If a file already exists at the destination, it is left untouched and
    nothing is copied.

    Parameters
    ----------
    src_fp : str or Path
        Path of the file to copy; must be located inside ``data_folder``.
    target_folder : str or Path
        Root folder that receives the copy.
    data_folder : str or Path
        Root folder that ``src_fp`` is resolved against to compute the relative path.

    Raises
    ------
    ValueError
        If ``src_fp`` is not located under ``data_folder`` (raised by ``Path.relative_to``).
    """
    # Coerce to Path so that plain string paths are accepted as well as Path objects.
    src_fp = Path(src_fp)
    target_fp = Path(target_folder) / src_fp.relative_to(Path(data_folder))

    # Never overwrite a file that is already present in the target folder.
    if target_fp.exists():
        return

    # Create any missing intermediate directories before copying.
    target_fp.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(src_fp, target_fp)
166+
167+
157168
def create_debug_data(
158169
competition: str,
159170
dr_cls: type[DataReducer] = UniqueIDDataReducer,
@@ -239,13 +250,16 @@ def create_debug_data(
239250

240251
# Process non-data files
241252
subfolder_dict = {}
253+
global_groups = defaultdict(list)
242254
for file_path in files_to_process:
243255
if file_path in processed_files:
244256
continue # Already handled above
245257
rel_dir = file_path.relative_to(data_folder).parts[0]
246258
subfolder_dict.setdefault(rel_dir, []).append(file_path)
259+
global_groups[file_path.stem].append(Path(file_path))
247260

248261
# For each subfolder, decide which files to copy
262+
selected_groups = []
249263
for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"):
250264
used_files = []
251265
not_used_files = []
@@ -262,11 +276,7 @@ def create_debug_data(
262276

263277
# Directly copy used files
264278
for uf in used_files:
265-
sampled_file_path = sample_folder / uf.relative_to(data_folder)
266-
if sampled_file_path.exists():
267-
continue
268-
sampled_file_path.parent.mkdir(parents=True, exist_ok=True)
269-
shutil.copy(uf, sampled_file_path)
279+
copy_file(uf, sample_folder, data_folder)
270280

271281
# If no files are used, randomly sample files to keep the folder from being empty
272282
if len(used_files) == 0:
@@ -275,29 +285,23 @@ def create_debug_data(
275285
else:
276286
num_to_keep = max(int(len(file_list) * min_frac), min_num)
277287

278-
# Group files by their base name (i.e., filename without extension)
279-
groups = defaultdict(list)
280-
for nf in not_used_files:
281-
groups[nf.stem].append(nf) # nf.stem gives the file name without its extension
282-
283-
# Convert the dictionary to a list of groups, where each group is a list of files with the same base name
284-
group_list = list(groups.values())
285-
286288
# Use a greedy strategy to select groups so that the total number of files is as close as possible to num_to_keep
287-
selected_groups = []
288289
total_files = 0
289-
for group in group_list:
290-
# If adding the entire group does not exceed the target, add it
291-
if not selected_groups or total_files + len(group) <= num_to_keep:
292-
selected_groups.append(group)
293-
total_files += len(group)
294-
else:
290+
for nf in not_used_files:
291+
if total_files > num_to_keep:
295292
break
293+
if nf.stem in selected_groups:
294+
total_files += 1
295+
else:
296+
selected_groups.append(nf.stem)
297+
total_files += 1
296298

297299
print(f"Sampling {num_to_keep} files without label from {total_files} files in {rel_dir}")
298300

299301
# Flatten the selected groups into a single list of files
300-
sampled_not_used = [nf for group in selected_groups for nf in group]
302+
sampled_not_used = [
303+
nf for group, value in global_groups.items() if group in selected_groups for nf in value
304+
]
301305

302306
# Copy the selected files to the target directory (all files with the same base name will be copied)
303307
for nf in sampled_not_used:
@@ -311,11 +315,7 @@ def create_debug_data(
311315
# Copy extra files
312316
print(f"Copying {len(extra_files)} extra files")
313317
for uf in extra_files:
314-
sampled_file_path = sample_folder / uf.relative_to(data_folder)
315-
if sampled_file_path.exists():
316-
continue
317-
sampled_file_path.parent.mkdir(parents=True, exist_ok=True)
318-
shutil.copy(uf, sampled_file_path)
318+
copy_file(uf, sample_folder, data_folder)
319319

320320
final_files_count = count_files_in_folder(sample_folder)
321321
print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.")

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf
209209
for tag, value in img.tag_v2.items():
210210
tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}")
211211
result.append(" " * (indent + 4) + f"{tag_name}: {value}")
212-
if file_type == "json":
212+
if file_type in ["json", "txt"]:
213213
result.append(" " * (indent + 2) + f"- Content of {file}:")
214214
with open(path, "r", encoding="utf-8") as f:
215215
for i, line in enumerate(f):

0 commit comments

Comments
 (0)