chore: print txt info (microsoft#600)

qew21 · web-flow · commit 8204983c491f · 2025-02-14T19:16:28.000+08:00
* print head for txt files

* update sample
diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py
@@ -154,6 +154,17 @@ def count_files_in_folder(folder: Path) -> int:
     return sum(1 for _ in folder.rglob("*") if _.is_file())
 
 
+def copy_file(src_fp, target_folder, data_folder):
+    """
+    Copy src_fp into target_folder at the same path it has relative to data_folder,
+    creating missing parent directories; an already existing target file is left untouched.
+    """
+    target_fp = target_folder / src_fp.relative_to(data_folder)  # mirror the layout under data_folder
+    if not target_fp.exists():  # skip the copy when the target path is already present
+        target_fp.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy(src_fp, target_fp)
+
+
 def create_debug_data(
     competition: str,
     dr_cls: type[DataReducer] = UniqueIDDataReducer,
@@ -239,13 +250,16 @@ def create_debug_data(
 
     # Process non-data files
     subfolder_dict = {}
+    global_groups = defaultdict(list)
     for file_path in files_to_process:
         if file_path in processed_files:
             continue  # Already handled above
         rel_dir = file_path.relative_to(data_folder).parts[0]
         subfolder_dict.setdefault(rel_dir, []).append(file_path)
+        global_groups[file_path.stem].append(Path(file_path))
 
     # For each subfolder, decide which files to copy
+    selected_groups = []
     for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"):
         used_files = []
         not_used_files = []
@@ -262,11 +276,7 @@ def create_debug_data(
 
         # Directly copy used files
         for uf in used_files:
-            sampled_file_path = sample_folder / uf.relative_to(data_folder)
-            if sampled_file_path.exists():
-                continue
-            sampled_file_path.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy(uf, sampled_file_path)
+            copy_file(uf, sample_folder, data_folder)
 
         # If no files are used, randomly sample files to keep the folder from being empty
         if len(used_files) == 0:
@@ -275,29 +285,23 @@ def create_debug_data(
             else:
                 num_to_keep = max(int(len(file_list) * min_frac), min_num)
 
-            # Group files by their base name (i.e., filename without extension)
-            groups = defaultdict(list)
-            for nf in not_used_files:
-                groups[nf.stem].append(nf)  # nf.stem gives the file name without its extension
-
-            # Convert the dictionary to a list of groups, where each group is a list of files with the same base name
-            group_list = list(groups.values())
-
             # Use a greedy strategy to select groups so that the total number of files is as close as possible to num_to_keep
-            selected_groups = []
             total_files = 0
-            for group in group_list:
-                # If adding the entire group does not exceed the target, add it
-                if not selected_groups or total_files + len(group) <= num_to_keep:
-                    selected_groups.append(group)
-                    total_files += len(group)
-                else:
+            for nf in not_used_files:
+                if total_files > num_to_keep:
                     break
+                if nf.stem in selected_groups:
+                    total_files += 1
+                else:
+                    selected_groups.append(nf.stem)
+                    total_files += 1
 
             print(f"Sampling {num_to_keep} files without label from {total_files} files in {rel_dir}")
 
             # Flatten the selected groups into a single list of files
-            sampled_not_used = [nf for group in selected_groups for nf in group]
+            sampled_not_used = [
+                nf for group, value in global_groups.items() if group in selected_groups for nf in value
+            ]
 
             # Copy the selected files to the target directory (all files with the same base name will be copied)
             for nf in sampled_not_used:
@@ -311,11 +315,7 @@ def create_debug_data(
         # Copy extra files
         print(f"Copying {len(extra_files)} extra files")
         for uf in extra_files:
-            sampled_file_path = sample_folder / uf.relative_to(data_folder)
-            if sampled_file_path.exists():
-                continue
-            sampled_file_path.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy(uf, sampled_file_path)
+            copy_file(uf, sample_folder, data_folder)
 
     final_files_count = count_files_in_folder(sample_folder)
     print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.")
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
@@ -209,7 +209,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf
                         for tag, value in img.tag_v2.items():
                             tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}")
                             result.append(" " * (indent + 4) + f"{tag_name}: {value}")
-                if file_type == "json":
+                if file_type in ["json", "txt"]:
                     result.append(" " * (indent + 2) + f"- Content of {file}:")
                     with open(path, "r", encoding="utf-8") as f:
                         for i, line in enumerate(f):