Skip to content

Commit bf2684c

Browse files
xisen-w and WinstonLiyt authored
feat: update model_experiment.py to support basic EDA (microsoft#220)
* Update model_experiment.py to support basic EDA. It looks at the data before making a proposal. * Update model_experiment.py: revised linting * Update model_experiment.py by fixing sorting order * Update model_experiment.py for black linting * Update model_experiment.py * Update model_experiment.py * Update model_experiment.py * Update model_experiment.py --------- Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
1 parent 9b95e6a commit bf2684c

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

rdagent/scenarios/kaggle/experiment/model_experiment.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
from pathlib import Path
33

4+
import pandas as pd
45
from jinja2 import Environment, StrictUndefined
56

67
from rdagent.components.coder.model_coder.model import (
@@ -77,12 +78,34 @@ def background(self) -> str:
7778
competition_features=self.competition_features,
7879
)
7980
)
80-
8181
return background_prompt
8282

8383
@property
def source_data(self) -> str:
    """Return a basic exploratory summary (EDA) of the competition's CSV data.

    All ``*.csv`` files found directly under
    ``<share_data_path>/<competition>`` are read and concatenated into one
    DataFrame. The summary contains the ``DataFrame.info()`` report, the
    shape, the first rows, and the "Data Description" entry from
    ``self.competition_descriptions``.

    Returns:
        The EDA text, or a fixed message when no CSV files are found.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    kaggle_conf = KGDockerConf()
    data_path = Path(f"{kaggle_conf.share_data_path}/{self.competition}")

    # Sort so the concatenation order (and therefore the sample rows) does
    # not depend on filesystem enumeration order.
    csv_files = sorted(data_path.glob("*.csv"))

    if not csv_files:
        return "No CSV files found in the specified path."

    dataset = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

    # DataFrame.info() writes its report to a buffer and returns None, so the
    # text has to be captured explicitly; with buf=None it goes to stdout and
    # the summary would contain the literal string "None".
    info_buffer = io.StringIO()
    dataset.info(buf=info_buffer)
    simple_eda = info_buffer.getvalue()

    data_shape = dataset.shape
    data_head = dataset.head()

    eda = (
        f"Basic Info about the data:\n{simple_eda}\n"
        f"Shape of the dataset: {data_shape}\n"
        f"Sample Data:\n{data_head}\n"
    )

    data_description = self.competition_descriptions.get("Data Description", "No description provided")
    eda += f"\nData Description:\n{data_description}"

    return eda
86109

87110
@property
88111
def output_format(self) -> str:

0 commit comments

Comments
 (0)