@@ -77,8 +77,10 @@ def evaluate(
7777
7878 # Check score file
7979 score_fp = implementation .workspace_path / "scores.csv"
80+ score_ret_code = 0
8081 if not score_fp .exists ():
81- stdout += "\n Metrics file (scores.csv) is not generated."
82+ stdout += "\n [Error] Metrics file (scores.csv) is not generated!"
83+ score_ret_code = 1
8284 else :
8385 try :
8486 score_df = pd .read_csv (score_fp , index_col = 0 )
@@ -89,29 +91,30 @@ def evaluate(
8991 for model in model_set_in_folder :
9092 if model not in model_set_in_scores :
9193 stdout += f"\n Model { model } is not evaluated in the scores.csv. The scores.csv has { model_set_in_scores } ."
94+ score_ret_code = 1
9295 except Exception as e :
9396 stdout += f"\n Error in checking the scores.csv file: { e } \n scores.csv's content:\n -----\n { score_fp .read_text ()} \n -----"
97+ score_ret_code = 1
9498
9599 # Check submission file
96- submission_fp = implementation .workspace_path / "submission.csv"
97- if not submission_fp .exists ():
98- stdout += "\n Submission file (submission.csv) is not generated."
99- else :
100- base_check_code = (DIRNAME / "eval_tests" / "submission_format_test.txt" ).read_text ()
101- implementation .inject_files (** {"test/submission_format_test.py" : base_check_code })
102- # stdout += "----Submission Check 1-----\n"
103- stdout += implementation .execute (env = de , entry = "python test/submission_format_test.py" )
104-
105- # MLEBench Check
106- # !!! Since we are running on a sampled dataset, mlebench check is not required.
107- # mle_check_code = (
108- # (DIRNAME / "eval_tests" / "mle_submission_format_test.txt")
109- # .read_text()
110- # .replace("<competition_id>", self.scen.competition)
111- # )
112- # implementation.inject_files(**{"test/mle_submission_format_test.py": mle_check_code})
113- # stdout += "----Submission Check 2-----\n"
114- # stdout += implementation.execute(env=mde, entry=f"python test/mle_submission_format_test.py")
100+ base_check_code = (DIRNAME / "eval_tests" / "submission_format_test.txt" ).read_text ()
101+ implementation .inject_files (** {"test/submission_format_test.py" : base_check_code })
102+ # stdout += "----Submission Check 1-----\n"
103+ submission_stdout , submission_ret_code = implementation .execute_ret_code (
104+ env = de , entry = "python test/submission_format_test.py"
105+ )
106+ stdout += submission_stdout
107+
108+ # MLEBench Check
109+ # !!! Since we are running on a sampled dataset, mlebench check is not required.
110+ # mle_check_code = (
111+ # (DIRNAME / "eval_tests" / "mle_submission_format_test.txt")
112+ # .read_text()
113+ # .replace("<competition_id>", self.scen.competition)
114+ # )
115+ # implementation.inject_files(**{"test/mle_submission_format_test.py": mle_check_code})
116+ # stdout += "----Submission Check 2-----\n"
117+ # stdout += implementation.execute(env=mde, entry=f"python test/mle_submission_format_test.py")
115118
116119 system_prompt = T (".prompts:workflow_eval.system" ).r (
117120 scenario = self .scen .get_scenario_all_desc (),
@@ -122,6 +125,8 @@ def evaluate(
122125 stdout = stdout .strip (),
123126 code = implementation .file_dict ["main.py" ],
124127 )
125- return build_cls_from_json_with_retry (
128+ wfb = build_cls_from_json_with_retry (
126129 WorkflowSingleFeedback , system_prompt = system_prompt , user_prompt = user_prompt
127130 )
131+ wfb .final_decision = wfb .final_decision and submission_ret_code == 0 and score_ret_code == 0
132+ return wfb
0 commit comments