|
1 | 1 | import json |
2 | | -import os |
3 | 2 | from pathlib import Path |
4 | 3 | from typing import Dict |
5 | 4 |
|
6 | | -import pandas as pd |
7 | | -from PIL import Image, TiffTags |
8 | | - |
9 | 5 | from rdagent.app.data_science.conf import DS_RD_SETTING |
10 | 6 | from rdagent.components.coder.data_science.conf import get_ds_env |
11 | 7 | from rdagent.core.experiment import FBWorkspace |
12 | 8 | from rdagent.core.scenario import Scenario |
13 | 9 | from rdagent.log import rdagent_logger as logger |
14 | 10 | from rdagent.oai.llm_utils import APIBackend |
| 11 | +from rdagent.scenarios.data_science.scen.utils import ( |
| 12 | + describe_data_folder, |
| 13 | + describe_data_folder_v2, |
| 14 | +) |
15 | 15 | from rdagent.scenarios.kaggle.kaggle_crawler import ( |
16 | 16 | crawl_descriptions, |
17 | 17 | leaderboard_scores, |
18 | 18 | ) |
19 | 19 | from rdagent.utils.agent.tpl import T |
20 | 20 |
|
21 | 21 |
|
22 | | -def read_csv_head(file_path, indent=0, lines=5, max_col_width=100): |
23 | | - """ |
24 | | - Reads the first few rows of a CSV file and formats them with indentation and optional truncation. |
25 | | -
|
26 | | - Parameters: |
27 | | - file_path (str): Path to the CSV file. |
28 | | - indent (int): Number of spaces to prepend to each line for indentation. |
29 | | - lines (int): Number of rows to read from the CSV file. |
30 | | - max_col_width (int): Maximum width of each column's content. |
31 | | -
|
32 | | - Returns: |
33 | | - str: A formatted string of the first few rows of the CSV file. |
34 | | - """ |
35 | | - try: |
36 | | - # Read the CSV file with specified rows |
37 | | - df = pd.read_csv(file_path, nrows=lines) |
38 | | - |
39 | | - if df.empty: |
40 | | - return " " * indent + "(No data in the file)" |
41 | | - |
42 | | - # Truncate column contents to a maximum width |
43 | | - truncated_df = df.copy() |
44 | | - for col in truncated_df.columns: |
45 | | - truncated_df[col] = ( |
46 | | - truncated_df[col] |
47 | | - .astype(str) |
48 | | - .apply(lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x) |
49 | | - ) |
50 | | - |
51 | | - # Convert DataFrame to a string representation |
52 | | - df_string_lines = truncated_df.to_string(index=False).split("\n") |
53 | | - |
54 | | - # Add indentation to each line |
55 | | - indented_lines = [" " * indent + line for line in df_string_lines] |
56 | | - |
57 | | - return "\n".join(indented_lines) |
58 | | - except FileNotFoundError: |
59 | | - return f"Error: File not found at path '{file_path}'." |
60 | | - except pd.errors.EmptyDataError: |
61 | | - return f"Error: The file at '{file_path}' is empty." |
62 | | - except Exception as e: |
63 | | - return f"Error reading CSV: {e}" |
64 | | - |
65 | | - |
66 | | -def get_dir_snapshot(folder_path): |
67 | | - """ |
68 | | - [note] |
69 | | - - Returns a set of file extensions within the subfolder (excluding subfolder names) |
70 | | - - Compares only the types of files contained, not specific file names or quantities |
71 | | - """ |
72 | | - exts = set() |
73 | | - try: |
74 | | - with os.scandir(folder_path) as it: |
75 | | - for entry in it: |
76 | | - if entry.is_file(): |
77 | | - file_ext = os.path.splitext(entry.name)[1] |
78 | | - exts.add(file_ext) |
79 | | - except Exception as e: |
80 | | - logger.error(f"Error scanning directory: {e}") |
81 | | - |
82 | | - return frozenset(exts) |
83 | | - |
84 | | - |
85 | | -def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subfolders=2, is_top_level=True): |
86 | | - """ |
87 | | - folder_path : Current directory path |
88 | | - indent : Current indentation |
89 | | - max_files : Maximum number of files of the same type to display |
90 | | - partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted |
91 | | - is_top_level : Indicates if the current folder is the top-level folder |
92 | | - """ |
93 | | - result = [] |
94 | | - files_count = {} |
95 | | - files_details = {} |
96 | | - |
97 | | - for root, dirs, files in os.walk(folder_path): |
98 | | - dirs.sort() |
99 | | - files.sort() |
100 | | - if not dirs: |
101 | | - for file in files: |
102 | | - file_path = os.path.join(root, file) |
103 | | - file_type = os.path.splitext(file)[1][1:] |
104 | | - file_size = os.path.getsize(file_path) |
105 | | - |
106 | | - if file_type not in files_count: |
107 | | - files_count[file_type] = 0 |
108 | | - files_details[file_type] = [] |
109 | | - files_count[file_type] += 1 |
110 | | - |
111 | | - # At top level, collect all CSV and Markdown files without restrictions |
112 | | - # In deeper levels, follow the max_files restriction |
113 | | - if is_top_level and file_type in ["csv", "md"]: |
114 | | - files_details[file_type].append((file, file_size, file_path)) |
115 | | - elif len(files_details[file_type]) < max_files: |
116 | | - files_details[file_type].append((file, file_size, file_path)) |
117 | | - break |
118 | | - |
119 | | - # Collect "type snapshots" of subfolders |
120 | | - snapshots = [] |
121 | | - for d in dirs: |
122 | | - subfolder_path = os.path.join(root, d) |
123 | | - snapshot = get_dir_snapshot(subfolder_path) |
124 | | - snapshots.append(snapshot) |
125 | | - |
126 | | - # Determine if all subfolders have the same file type distribution |
127 | | - first_snapshot = snapshots[0] |
128 | | - all_same_structure = all(s == first_snapshot for s in snapshots) |
129 | | - |
130 | | - if all_same_structure: |
131 | | - for i, d in enumerate(dirs): |
132 | | - if i < partial_expand_subfolders: |
133 | | - result.append(" " * indent + f"- Folder: {d}") |
134 | | - subfolder_path = os.path.join(root, d) |
135 | | - result.append( |
136 | | - describe_data_folder( |
137 | | - folder_path=subfolder_path, |
138 | | - indent=indent + 2, |
139 | | - max_files=max_files, |
140 | | - partial_expand_subfolders=partial_expand_subfolders, |
141 | | - is_top_level=False, |
142 | | - ) |
143 | | - ) |
144 | | - else: |
145 | | - remaining = len(dirs) - i |
146 | | - result.append(" " * indent + f"... ({remaining} more subfolders)") |
147 | | - break |
148 | | - else: |
149 | | - for d in dirs: |
150 | | - result.append(" " * indent + f"- Folder: {d}") |
151 | | - subfolder_path = os.path.join(root, d) |
152 | | - result.append( |
153 | | - describe_data_folder( |
154 | | - folder_path=subfolder_path, |
155 | | - indent=indent + 2, |
156 | | - max_files=max_files, |
157 | | - partial_expand_subfolders=partial_expand_subfolders, |
158 | | - is_top_level=False, |
159 | | - ) |
160 | | - ) |
161 | | - |
162 | | - for file in files: |
163 | | - file_path = os.path.join(root, file) |
164 | | - file_type = os.path.splitext(file)[1][1:] |
165 | | - file_size = os.path.getsize(file_path) |
166 | | - |
167 | | - if file_type not in files_count: |
168 | | - files_count[file_type] = 0 |
169 | | - files_details[file_type] = [] |
170 | | - files_count[file_type] += 1 |
171 | | - |
172 | | - # At top level, collect all CSV and Markdown files without restrictions |
173 | | - # In deeper levels, follow the max_files restriction |
174 | | - if is_top_level and file_type in ["csv", "md"]: |
175 | | - files_details[file_type].append((file, file_size, file_path)) |
176 | | - elif not is_top_level and len(files_details[file_type]) <= max_files: |
177 | | - files_details[file_type].append((file, file_size, file_path)) |
178 | | - |
179 | | - break |
180 | | - |
181 | | - # Print the folder and its contents |
182 | | - for file_type, count in files_count.items(): |
183 | | - if count > max_files and file_type not in ["csv", "md", "txt"]: |
184 | | - result.append(" " * indent + f"{count} {file_type}s:") |
185 | | - for file, size, path in files_details[file_type]: |
186 | | - result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") |
187 | | - result.append(" " * (indent + 2) + "... (file limit reached)") |
188 | | - else: |
189 | | - for file, size, path in files_details[file_type]: |
190 | | - if file_type == "csv": |
191 | | - df = pd.read_csv(path) |
192 | | - result.append( |
193 | | - " " * indent + f"- {file} ({size} bytes, with {df.shape[0]} rows and {df.shape[1]} columns)" |
194 | | - ) |
195 | | - result.append(" " * (indent + 2) + f"- Head of {file}:") |
196 | | - csv_head = read_csv_head(path, indent + 4) |
197 | | - result.append(csv_head) |
198 | | - continue |
199 | | - result.append(" " * indent + f"- {file} ({size} bytes)") |
200 | | - if file_type == "md": |
201 | | - result.append(" " * (indent + 2) + f"- Content of {file}:") |
202 | | - if file == "description.md": |
203 | | - result.append(" " * (indent + 4) + f"Please refer to the background of the scenario context.") |
204 | | - continue |
205 | | - with open(path, "r", encoding="utf-8") as f: |
206 | | - result.append(" " * (indent + 4) + f.read()) |
207 | | - if file_type == "tif": |
208 | | - result.append(" " * (indent + 2) + f"- Metadata of {file}:") |
209 | | - with Image.open(path) as img: |
210 | | - for tag, value in img.tag_v2.items(): |
211 | | - tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") |
212 | | - result.append(" " * (indent + 4) + f"{tag_name}: {value}") |
213 | | - if file_type in ["json", "txt"]: |
214 | | - result.append(" " * (indent + 2) + f"- Content of {file}:") |
215 | | - with open(path, "r", encoding="utf-8") as f: |
216 | | - for i, line in enumerate(f): |
217 | | - if i < 2: |
218 | | - result.append( |
219 | | - " " * (indent + 4) + line.strip()[:100] + ("..." if len(line.strip()) > 100 else "") |
220 | | - ) |
221 | | - else: |
222 | | - break |
223 | | - |
224 | | - return "\n".join(result) + "\n" |
225 | | - |
226 | | - |
227 | 22 | class DataScienceScen(Scenario): |
228 | 23 | """Data Science Scenario""" |
229 | 24 |
|
@@ -333,7 +128,7 @@ def get_runtime_environment(self) -> str: |
333 | 128 | return stdout |
334 | 129 |
|
335 | 130 | def _get_data_folder_description(self) -> str: |
336 | | - return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) |
| 131 | + return describe_data_folder_v2(Path(DS_RD_SETTING.local_data_path) / self.competition) |
337 | 132 |
|
338 | 133 |
|
339 | 134 | class KaggleScen(DataScienceScen): |
@@ -364,3 +159,5 @@ def rich_style_description(self) -> str: |
364 | 159 |
|
365 | 160 | if __name__ == "__main__": |
366 | 161 | print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine")) |
| 162 | + |
| 163 | + print(describe_data_folder_v2(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine")) |
0 commit comments