Skip to content

Commit e78c3e7

Browse files
authored
feat: add describe_data_folder_v2 (microsoft#738)
* add describe_data_folder_v2
* fix CI
* fix
* add packages
1 parent f7cc65e commit e78c3e7

File tree

3 files changed

+393
-210
lines changed

3 files changed

+393
-210
lines changed
Lines changed: 7 additions & 210 deletions
Original file line numberDiff line numberDiff line change
@@ -1,229 +1,24 @@
11
import json
2-
import os
32
from pathlib import Path
43
from typing import Dict
54

6-
import pandas as pd
7-
from PIL import Image, TiffTags
8-
95
from rdagent.app.data_science.conf import DS_RD_SETTING
106
from rdagent.components.coder.data_science.conf import get_ds_env
117
from rdagent.core.experiment import FBWorkspace
128
from rdagent.core.scenario import Scenario
139
from rdagent.log import rdagent_logger as logger
1410
from rdagent.oai.llm_utils import APIBackend
11+
from rdagent.scenarios.data_science.scen.utils import (
12+
describe_data_folder,
13+
describe_data_folder_v2,
14+
)
1515
from rdagent.scenarios.kaggle.kaggle_crawler import (
1616
crawl_descriptions,
1717
leaderboard_scores,
1818
)
1919
from rdagent.utils.agent.tpl import T
2020

2121

22-
def read_csv_head(file_path, indent=0, lines=5, max_col_width=100):
23-
"""
24-
Reads the first few rows of a CSV file and formats them with indentation and optional truncation.
25-
26-
Parameters:
27-
file_path (str): Path to the CSV file.
28-
indent (int): Number of spaces to prepend to each line for indentation.
29-
lines (int): Number of rows to read from the CSV file.
30-
max_col_width (int): Maximum width of each column's content.
31-
32-
Returns:
33-
str: A formatted string of the first few rows of the CSV file.
34-
"""
35-
try:
36-
# Read the CSV file with specified rows
37-
df = pd.read_csv(file_path, nrows=lines)
38-
39-
if df.empty:
40-
return " " * indent + "(No data in the file)"
41-
42-
# Truncate column contents to a maximum width
43-
truncated_df = df.copy()
44-
for col in truncated_df.columns:
45-
truncated_df[col] = (
46-
truncated_df[col]
47-
.astype(str)
48-
.apply(lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x)
49-
)
50-
51-
# Convert DataFrame to a string representation
52-
df_string_lines = truncated_df.to_string(index=False).split("\n")
53-
54-
# Add indentation to each line
55-
indented_lines = [" " * indent + line for line in df_string_lines]
56-
57-
return "\n".join(indented_lines)
58-
except FileNotFoundError:
59-
return f"Error: File not found at path '{file_path}'."
60-
except pd.errors.EmptyDataError:
61-
return f"Error: The file at '{file_path}' is empty."
62-
except Exception as e:
63-
return f"Error reading CSV: {e}"
64-
65-
66-
def get_dir_snapshot(folder_path):
    """
    Build a hashable "type snapshot" of a folder.

    [note]
    - Returns a set of file extensions within the subfolder (excluding subfolder names)
    - Compares only the types of files contained, not specific file names or quantities
    """
    extensions = set()
    try:
        with os.scandir(folder_path) as entries:
            # Only regular files contribute; subdirectories are ignored on purpose.
            extensions.update(
                os.path.splitext(entry.name)[1] for entry in entries if entry.is_file()
            )
    except Exception as e:
        # Best-effort: an unreadable folder yields an empty snapshot.
        logger.error(f"Error scanning directory: {e}")

    return frozenset(extensions)
83-
84-
85-
def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subfolders=2, is_top_level=True):
    """
    Recursively render a textual summary of a data folder's structure and contents.

    folder_path : Current directory path
    indent : Current indentation
    max_files : Maximum number of files of the same type to display
    partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted
    is_top_level : Indicates if the current folder is the top-level folder

    Returns a newline-joined description string (always ends with a trailing newline).
    CSV files get a head preview, md/json/txt get inline content snippets, and tif
    files get their TIFF tag metadata.
    """
    result = []
    files_count = {}    # file extension -> total count seen at this level
    files_details = {}  # file extension -> [(name, size, path), ...] capped per the rules below

    # NOTE: os.walk is only ever advanced one level — both branches below end in
    # `break`, so recursion (not os.walk itself) handles deeper levels.
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()
        files.sort()
        if not dirs:
            # Leaf directory: collect files, then stop walking.
            for file in files:
                file_path = os.path.join(root, file)
                file_type = os.path.splitext(file)[1][1:]
                file_size = os.path.getsize(file_path)

                if file_type not in files_count:
                    files_count[file_type] = 0
                    files_details[file_type] = []
                files_count[file_type] += 1

                # At top level, collect all CSV and Markdown files without restrictions
                # In deeper levels, follow the max_files restriction
                if is_top_level and file_type in ["csv", "md"]:
                    files_details[file_type].append((file, file_size, file_path))
                elif len(files_details[file_type]) < max_files:
                    files_details[file_type].append((file, file_size, file_path))
            break

        # Collect "type snapshots" of subfolders
        snapshots = []
        for d in dirs:
            subfolder_path = os.path.join(root, d)
            snapshot = get_dir_snapshot(subfolder_path)
            snapshots.append(snapshot)

        # Determine if all subfolders have the same file type distribution
        # (dirs is non-empty here, so snapshots[0] is safe).
        first_snapshot = snapshots[0]
        all_same_structure = all(s == first_snapshot for s in snapshots)

        if all_same_structure:
            # Homogeneous subfolders: expand only the first few, summarize the rest.
            for i, d in enumerate(dirs):
                if i < partial_expand_subfolders:
                    result.append(" " * indent + f"- Folder: {d}")
                    subfolder_path = os.path.join(root, d)
                    result.append(
                        describe_data_folder(
                            folder_path=subfolder_path,
                            indent=indent + 2,
                            max_files=max_files,
                            partial_expand_subfolders=partial_expand_subfolders,
                            is_top_level=False,
                        )
                    )
                else:
                    remaining = len(dirs) - i
                    result.append(" " * indent + f"... ({remaining} more subfolders)")
                    break
        else:
            # Heterogeneous subfolders: expand every one of them.
            for d in dirs:
                result.append(" " * indent + f"- Folder: {d}")
                subfolder_path = os.path.join(root, d)
                result.append(
                    describe_data_folder(
                        folder_path=subfolder_path,
                        indent=indent + 2,
                        max_files=max_files,
                        partial_expand_subfolders=partial_expand_subfolders,
                        is_top_level=False,
                    )
                )

        for file in files:
            file_path = os.path.join(root, file)
            file_type = os.path.splitext(file)[1][1:]
            file_size = os.path.getsize(file_path)

            if file_type not in files_count:
                files_count[file_type] = 0
                files_details[file_type] = []
            files_count[file_type] += 1

            # At top level, collect all CSV and Markdown files without restrictions
            # In deeper levels, follow the max_files restriction
            # NOTE(review): this branch is inconsistent with the leaf-dir branch above:
            # it uses `<= max_files` (one extra entry) instead of `< max_files`, and the
            # `not is_top_level` guard means top-level non-csv/md files in a folder that
            # also has subdirs are counted but never listed — confirm whether intended.
            if is_top_level and file_type in ["csv", "md"]:
                files_details[file_type].append((file, file_size, file_path))
            elif not is_top_level and len(files_details[file_type]) <= max_files:
                files_details[file_type].append((file, file_size, file_path))

        break

    # Print the folder and its contents
    for file_type, count in files_count.items():
        if count > max_files and file_type not in ["csv", "md", "txt"]:
            # Many files of an uninteresting type: show the sample, then elide.
            result.append(" " * indent + f"{count} {file_type}s:")
            for file, size, path in files_details[file_type]:
                result.append(" " * (indent + 2) + f"- {file} ({size} bytes)")
            result.append(" " * (indent + 2) + "... (file limit reached)")
        else:
            for file, size, path in files_details[file_type]:
                if file_type == "csv":
                    # NOTE(review): reads the *entire* CSV just for df.shape — may be
                    # slow/memory-heavy on large competition files; verify acceptable.
                    df = pd.read_csv(path)
                    result.append(
                        " " * indent + f"- {file} ({size} bytes, with {df.shape[0]} rows and {df.shape[1]} columns)"
                    )
                    result.append(" " * (indent + 2) + f"- Head of {file}:")
                    csv_head = read_csv_head(path, indent + 4)
                    result.append(csv_head)
                    continue
                result.append(" " * indent + f"- {file} ({size} bytes)")
                if file_type == "md":
                    result.append(" " * (indent + 2) + f"- Content of {file}:")
                    if file == "description.md":
                        # description.md is already surfaced via the scenario context.
                        result.append(" " * (indent + 4) + f"Please refer to the background of the scenario context.")
                        continue
                    with open(path, "r", encoding="utf-8") as f:
                        result.append(" " * (indent + 4) + f.read())
                if file_type == "tif":
                    result.append(" " * (indent + 2) + f"- Metadata of {file}:")
                    with Image.open(path) as img:
                        for tag, value in img.tag_v2.items():
                            tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}")
                            result.append(" " * (indent + 4) + f"{tag_name}: {value}")
                if file_type in ["json", "txt"]:
                    # Show only the first two lines, each clipped to 100 chars.
                    result.append(" " * (indent + 2) + f"- Content of {file}:")
                    with open(path, "r", encoding="utf-8") as f:
                        for i, line in enumerate(f):
                            if i < 2:
                                result.append(
                                    " " * (indent + 4) + line.strip()[:100] + ("..." if len(line.strip()) > 100 else "")
                                )
                            else:
                                break

    return "\n".join(result) + "\n"
225-
226-
22722
class DataScienceScen(Scenario):
22823
"""Data Science Scenario"""
22924

@@ -333,7 +128,7 @@ def get_runtime_environment(self) -> str:
333128
return stdout
334129

335130
    def _get_data_folder_description(self) -> str:
        # Summarize the competition's local data directory for prompt building;
        # delegates to the v2 describer imported from scen.utils.
        return describe_data_folder_v2(Path(DS_RD_SETTING.local_data_path) / self.competition)
337132

338133

339134
class KaggleScen(DataScienceScen):
@@ -364,3 +159,5 @@ def rich_style_description(self) -> str:
364159

365160
if __name__ == "__main__":
    # Ad-hoc manual smoke check: print both the v1 and v2 folder descriptions
    # for a sample competition.
    # NOTE(review): hard-coded local path — only runs on machines with this dataset.
    print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine"))
    print(describe_data_folder_v2(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine"))

0 commit comments

Comments
 (0)