Skip to content

Commit ed87a48

Browse files
authored
refactor: use dynamic input path and update template loader (microsoft#792)
* refactor: use dynamic input path and update template loader
* fix: update include syntax for data source in prompts.yaml
* add customization path
* docs: update prompts for ensemble scoring and metric direction
* chore: remove obsolete data_science/share.yaml file
1 parent eafd4df commit ed87a48

File tree

23 files changed

+137
-51
lines changed

23 files changed

+137
-51
lines changed

rdagent/components/coder/data_science/ensemble/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,11 @@ def evaluate(
4747
)
4848

4949
env = get_ds_env(
50-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
50+
extra_volumes={
51+
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": T(
52+
"scenarios.data_science.share:scen.input_path"
53+
).r()
54+
}
5155
)
5256

5357
fname = "test/ensemble_test.txt"

rdagent/components/coder/data_science/feature/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ def evaluate(
4343
)
4444

4545
env = get_ds_env(
46-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
46+
extra_volumes={
47+
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": T(
48+
"scenarios.data_science.share:scen.input_path"
49+
).r()
50+
}
4751
)
4852

4953
# TODO: do we need to clean the generated temporary content?

rdagent/components/coder/data_science/model/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@ def evaluate(
5757
)
5858

5959
env = get_ds_env(
60-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
60+
extra_volumes={
61+
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": T(
62+
"scenarios.data_science.share:scen.input_path"
63+
).r()
64+
}
6165
)
6266

6367
if_model_removed = False

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,11 @@ def evaluate(
5252
)
5353

5454
env = get_ds_env(
55-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
55+
extra_volumes={
56+
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": T(
57+
"scenarios.data_science.share:scen.input_path"
58+
).r()
59+
}
5660
)
5761

5862
# Clean the scores.csv & submission.csv.
@@ -97,7 +101,7 @@ def evaluate(
97101
score_ret_code = 1
98102

99103
# Check submission file
100-
base_check_code = (DIRNAME / "eval_tests" / "submission_format_test.txt").read_text()
104+
base_check_code = T(".eval_tests.submission_format_test", ftype="txt").r()
101105
implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
102106
# stdout += "----Submission Check 1-----\n"
103107
submission_check_out, submission_ret_code = implementation.execute_ret_code(

rdagent/components/coder/data_science/pipeline/eval_tests/submission_format_test.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@ find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | g
2525
"""
2626

2727
# Find sample submission file dynamically
28-
input_dir = Path("/kaggle/input")
28+
input_dir = Path("{% include "scenarios.data_science.share:scen.input_path" %}")
2929
# Look for common variations of sample submission filenames
3030
sample_submission_files = list(input_dir.glob("*sample_submission*.csv")) + list(
3131
input_dir.glob("*sampleSubmission*.csv")
3232
)
3333

34-
assert sample_submission_files, "Error: No sample submission file found in /kaggle/input/"
34+
assert sample_submission_files, "Error: No sample submission file found in {% include "scenarios.data_science.share:scen.input_path" %}"
3535

3636
# Use first matching file
3737
sample_submission_name = sample_submission_files[0].name

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ pipeline_coder:
3737
3838
3939
## Guidelines
40-
1. Ensure that the dataset is loaded strictly from `/kaggle/input/`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
40+
1. Ensure that the dataset is loaded strictly from `{% include "scenarios.data_science.share:scen.input_path" %}`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
4141
2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
4242
4343
## Exploratory Data Analysis (EDA) part(Required):

rdagent/components/coder/data_science/raw_data_loader/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,11 @@ def develop(self, exp):
226226
new_exp = super().develop(exp)
227227

228228
env = get_ds_env(
229-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"},
229+
extra_volumes={
230+
f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": T(
231+
"scenarios.data_science.share:scen.input_path"
232+
).r()
233+
},
230234
running_timeout_period=DS_RD_SETTING.full_timeout,
231235
)
232236

rdagent/components/coder/data_science/raw_data_loader/eval.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,11 @@ def evaluate(
4646
)
4747

4848
env = get_ds_env(
49-
extra_volumes={f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"}
49+
extra_volumes={
50+
f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": T(
51+
"scenarios.data_science.share:scen.input_path"
52+
).r()
53+
}
5054
)
5155

5256
# TODO: do we need to clean the generated temporary content?

rdagent/components/coder/data_science/raw_data_loader/prompts.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ spec:
4848
- `test_ids` (DT): Identifiers for the test data.
4949
- Docstring Requirements:
5050
- Describe the purpose of the function.
51-
- Specify the data source location (`/kaggle/input/`).
51+
- Specify the data source location (`{% include "scenarios.data_science.share:scen.input_path" %}`).
5252
- Clearly define the structure and type of the output.
5353
- Inferred data shape to each input and output data variables. To uncertain dimension, use -1.
5454
2. Notes:
@@ -268,7 +268,7 @@ data_loader_coder:
268268
{% endif %}
269269
270270
## Guidelines
271-
1. Ensure that the dataset is loaded strictly from `/kaggle/input/`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
271+
1. Ensure that the dataset is loaded strictly from `{% include "scenarios.data_science.share:scen.input_path" %}`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
272272
2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
273273
3. You should use the following cache decorator to cache the results of the function:
274274
```python

rdagent/components/coder/data_science/share/eval.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ def evaluate(
4545
)
4646
env = get_ds_env()
4747
env.conf.extra_volumes = {
48-
f"{DS_RD_SETTING.local_data_path}/{'sample/' if self.data_type == 'sample' else ''}{self.scen.competition}": "/kaggle/input"
48+
f"{DS_RD_SETTING.local_data_path}/{'sample/' if self.data_type == 'sample' else ''}{self.scen.competition}": T(
49+
"scenarios.data_science.share:scen.input_path"
50+
).r()
4951
}
5052

5153
# 2) check the result and stdout after reruning the model.

0 commit comments

Comments
 (0)