chore: more optional parameters for running benchmark analysis (microsoft#431)

qew21 · web-flow · commit 8f8afea4c4f2 · 2024-10-15T12:41:32.000+08:00
* make plot title and round label configurable

* parse decision from multiple types (bool, str, int)

* check if decision is true

* reformat

* remove unused file
diff --git a/rdagent/app/benchmark/factor/analysis.py b/rdagent/app/benchmark/factor/analysis.py
@@ -178,28 +178,32 @@ def change_fs(font_size):
         plt.rc("figure", titlesize=font_size)
 
     @staticmethod
-    def plot_data(data, file_name):
+    def plot_data(data, file_name, title):
         plt.figure(figsize=(10, 6))
         sns.barplot(x="index", y="b", hue="a", data=data)
         plt.xlabel("Method")
         plt.ylabel("Value")
-        plt.title("Comparison of Different Methods")
+        plt.title(title)
         plt.savefig(file_name)
 
 
-def main(path="git_ignore_folder/eval_results/res_promptV220240724-060037.pkl"):
+def main(
+    path="git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+    round=1,
+    title="Comparison of Different Methods",
+):
     settings = BenchmarkSettings()
     benchmark = BenchmarkAnalyzer(settings)
     results = {
-        "1 round experiment": path,
+        f"{round} round experiment": path,
     }
     final_results = benchmark.process_results(results)
     final_results_df = pd.DataFrame(final_results)
 
     Plotter.change_fs(20)
     plot_data = final_results_df.drop(["max. accuracy", "avg. accuracy"], axis=0).T
     plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
-    Plotter.plot_data(plot_data, "./comparison_plot.png")
+    Plotter.plot_data(plot_data, "./comparison_plot.png", title)
 
 
 if __name__ == "__main__":
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -195,14 +195,7 @@ def evaluate(
                     user_prompt=gen_df_info_str, system_prompt=system_prompt, json_mode=True
                 )
                 resp_dict = json.loads(resp)
-
-                if isinstance(resp_dict["output_format_decision"], str) and resp_dict[
-                    "output_format_decision"
-                ].lower() in (
-                    "true",
-                    "false",
-                ):
-                    resp_dict["output_format_decision"] = resp_dict["output_format_decision"].lower() == "true"
+                resp_dict["output_format_decision"] = str(resp_dict["output_format_decision"]).lower() in ["true", "1"]
 
                 return (
                     resp_dict["output_format_feedback"],
@@ -243,7 +236,7 @@ def evaluate(
                 False,
             )
 
-        time_diff = gen_df.index.get_level_values("datetime").to_series().diff().dropna().unique()
+        time_diff = pd.to_datetime(gen_df.index.get_level_values("datetime")).to_series().diff().dropna().unique()
         if pd.Timedelta(minutes=1) in time_diff:
             return (
                 "The generated dataframe is not daily. The implementation is definitely wrong. Please check the implementation.",
@@ -548,11 +541,7 @@ def evaluate(
                 final_decision = final_evaluation_dict["final_decision"]
                 final_feedback = final_evaluation_dict["final_feedback"]
 
-                if isinstance(final_decision, str) and final_decision.lower() in ("true", "false"):
-                    final_decision = final_decision.lower() == "true"
-                elif isinstance(final_decision, int) and final_decision in (0, 1):
-                    final_decision = bool(final_decision)
-
+                final_decision = str(final_decision).lower() in ["true", "1"]
                 return final_decision, final_feedback
 
             except json.JSONDecodeError as e: