Extract factors from financial reports loop finished

microsoft · WinstonLiyt · Aug 2, 2024 · Jul 17, 2024 · Jul 18, 2024 · Jul 18, 2024
commit 64812781805699b58aa9239036f115e6b1fe0ee6
diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py
@@ -30,5 +30,9 @@ class Config:
     py_bin: str = "/usr/bin/python"
     local_qlib_folder: Path = Path("/home/rdagent/qlib")
 
+    origin_report_path: str = "data/report_origin"
+    local_report_path: str = "data/report"
+    report_result_json_file_path: str = "git_ignore_folder/res_dict.json"
+
 
 PROP_SETTING = PropSetting()
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+import pickle
+from dotenv import load_dotenv
+from jinja2 import Environment, StrictUndefined
+import pandas as pd
+
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.core.prompts import Prompts
+from rdagent.core.scenario import Scenario
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.qlib.developer.factor_coder import QlibFactorCoSTEER
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario, QlibFactorExperiment
+from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import (
+    FactorExperimentLoaderFromPDFfiles,
+    classify_report_from_dict,
+)
+
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Hypothesis,
+    Trace,
+)
+
+from rdagent.core.exception import FactorEmptyException
+from rdagent.core.developer import Developer
+
+# Load environment variables from a .env file before any component is built.
+# The call is deliberately not wrapped in `assert`: with `python -O` assert
+# statements are stripped, which would silently skip load_dotenv() altogether.
+if not load_dotenv():
+    raise RuntimeError("load_dotenv() did not load any environment variables; check the .env file.")
+
+# Build the scenario and the pipeline components from the class paths
+# configured in PROP_SETTING.
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+
+hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+
+hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+
+qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+
+qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+
+# Report classification results: a mapping from report path to its attributes.
+# The encoding is explicit so that non-ASCII paths do not depend on the locale.
+with open(PROP_SETTING.report_result_json_file_path, "r", encoding="utf-8") as f:
+    judge_pdf_data = json.load(f)
+
+# Prompt templates stored next to this script.
+prompts_path = Path(__file__).parent / "prompts.yaml"
+prompts = Prompts(file_path=prompts_path)
+
+def generate_hypothesis(factor_result: dict, report_content: str) -> Hypothesis:
+    """Ask the LLM for a hypothesis based on extracted factors and the report text.
+
+    Args:
+        factor_result: Factor descriptions; serialized with ``json.dumps`` into
+            the user prompt.
+        report_content: Text of the report, inserted into the user prompt.
+
+    Returns:
+        A ``Hypothesis`` built from the ``"hypothesis"`` and ``"reason"`` keys of
+        the JSON response; a placeholder text is used for a missing key.
+
+    Raises:
+        json.JSONDecodeError: If the response is not valid JSON.
+    """
+    # One strict environment for both templates: an undefined template variable
+    # raises instead of rendering as an empty string.
+    env = Environment(undefined=StrictUndefined)
+    hypothesis_prompts = prompts["hypothesis_generation"]
+    system_prompt = env.from_string(hypothesis_prompts["system"]).render()
+    user_prompt = env.from_string(hypothesis_prompts["user"]).render(
+        factor_descriptions=json.dumps(factor_result),
+        report_content=report_content,
+    )
+
+    response = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=system_prompt,
+        json_mode=True,
+    )
+
+    response_json = json.loads(response)
+    return Hypothesis(
+        hypothesis=response_json.get("hypothesis", "No hypothesis generated."),
+        reason=response_json.get("reason", "No reason provided."),
+    )
+
+def extract_factors_and_implement(report_file_path: str) -> tuple:
+    """Load the factor experiment described by a report and derive a hypothesis for it.
+
+    Args:
+        report_file_path: Path of the PDF report to process.
+
+    Returns:
+        An ``(experiment, hypothesis)`` pair, or ``(None, None)`` when the loader
+        yields no experiment or an experiment without sub-tasks.
+    """
+    # NOTE(review): this instance is never used below; it is kept only in case
+    # the constructor has side effects the pipeline relies on -- confirm and remove.
+    scenario = QlibFactorScenario()
+
+    with logger.tag("extract_factors_and_implement"):
+        with logger.tag("load_factor_tasks"):
+            exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
+            # Truthiness rather than ``== []`` so that ``None`` or an empty
+            # non-list sequence is also treated as "no tasks" instead of
+            # reaching the iteration below.
+            if exp is None or not exp.sub_tasks:
+                return None, None
+
+    docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
+
+    # Summarize every extracted factor task for the hypothesis prompt.
+    factor_result = {
+        task.factor_name: {
+            "description": task.factor_description,
+            "formulation": task.factor_formulation,
+            "variables": task.variables,
+            "resources": task.factor_resources,
+        }
+        for task in exp.sub_tasks
+    }
+
+    report_content = "\n".join(docs_dict.values())
+    hypothesis = generate_hypothesis(factor_result, report_content)
+
+    return exp, hypothesis
+
+trace = Trace(scen=scen)
+
+# Process every report classified as class 1: extract its factors, implement
+# and run them, then record the outcome in the trace.
+for origin_path, report_meta in judge_pdf_data.items():
+    if report_meta["class"] != 1:
+        continue
+
+    # Map the path recorded in the JSON file onto the local report folder.
+    local_path = origin_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path)
+    report_file_path = Path(local_path)
+    if not report_file_path.exists():
+        logger.error(f"File not found: {report_file_path}")
+        continue
+
+    logger.info(f"Processing {report_file_path}")
+    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+    if exp is None:
+        continue
+
+    # Base the new experiment on every earlier experiment whose feedback is
+    # truthy; fall back to a single empty experiment when there is none.
+    exp.based_experiments = [past[1] for past in trace.hist if past[2]]
+    if not exp.based_experiments:
+        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+
+    exp = qlib_factor_coder.develop(exp)
+    exp = qlib_factor_runner.develop(exp)
+    if exp is None:
+        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+        continue
+
+    feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+    trace.hist.append((hypothesis, exp, feedback))
+    logger.info(f"Processed {report_file_path}: Result: {exp}")
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -0,0 +1,147 @@
+import json
+from pathlib import Path
+import pickle
+from dotenv import load_dotenv
+from jinja2 import Environment, StrictUndefined
+import pandas as pd
+
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.core.prompts import Prompts
+from rdagent.core.scenario import Scenario
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.qlib.developer.factor_coder import QlibFactorCoSTEER
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario, QlibFactorExperiment
+from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import (
+    FactorExperimentLoaderFromPDFfiles,
+    classify_report_from_dict,
+)
+
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Hypothesis,
+    Trace,
+)
+
+from rdagent.core.exception import FactorEmptyException
+from rdagent.core.developer import Developer
+
+# Load environment variables from a .env file before any component is built.
+# The call is deliberately not wrapped in `assert`: with `python -O` assert
+# statements are stripped, which would silently skip load_dotenv() altogether.
+if not load_dotenv():
+    raise RuntimeError("load_dotenv() did not load any environment variables; check the .env file.")
+
+# Build the scenario and the pipeline components from the class paths
+# configured in PROP_SETTING.
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+
+hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.factor_hypothesis_gen)(scen)
+
+hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.factor_hypothesis2experiment)()
+
+qlib_factor_coder: Developer = import_class(PROP_SETTING.factor_coder)(scen)
+
+qlib_factor_runner: Developer = import_class(PROP_SETTING.factor_runner)(scen)
+
+qlib_factor_summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.factor_summarizer)(scen)
+
+# NOTE(review): absolute, machine-specific path -- consider reading it from
+# configuration instead.
+json_file_path = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/res_dict.json"
+# Report classification results: a mapping from report path to its attributes.
+# The encoding is explicit so that non-ASCII paths do not depend on the locale.
+with open(json_file_path, "r", encoding="utf-8") as f:
+    judge_pdf_data = json.load(f)
+
+# Prompt templates stored next to this script.
+prompts_path = Path(__file__).parent / "prompts.yaml"
+prompts = Prompts(file_path=prompts_path)
+
+# Pickle file holding the trace and resume index between runs.
+# NOTE(review): absolute, machine-specific path.
+progress_file = "/home/finco/v-yuanteli/RD-Agent/git_ignore_folder/progress.pkl"
+
+def save_progress(trace, current_index):
+    """Persist the trace together with the index from which to resume.
+
+    The pair is pickled to a temporary sibling file and then moved over
+    ``progress_file``, so a crash in the middle of a write cannot leave a
+    truncated pickle behind that would make every later restart fail.
+
+    Args:
+        trace: The trace object to store.
+        current_index: Index stored alongside the trace.
+    """
+    target = Path(progress_file)
+    tmp_path = target.with_name(target.name + ".tmp")
+    with open(tmp_path, "wb") as f:
+        pickle.dump((trace, current_index), f)
+    # Path.replace overwrites the destination in a single rename step.
+    tmp_path.replace(target)
+
+def load_progress():
+    """Return the saved ``(trace, index)`` pair, or a fresh trace and index 0.
+
+    NOTE: the file is read with ``pickle``, which can execute arbitrary code
+    while loading; ``progress_file`` must only point at a file written by this
+    script itself.
+    """
+    saved = Path(progress_file)
+    if not saved.exists():
+        # Nothing stored yet: start from the first report with an empty trace.
+        return Trace(scen=scen), 0
+    with saved.open("rb") as fh:
+        return pickle.load(fh)
+
+def generate_hypothesis(factor_result: dict, report_content: str) -> Hypothesis:
+    """Ask the LLM for a hypothesis based on extracted factors and the report text.
+
+    Args:
+        factor_result: Factor descriptions; serialized with ``json.dumps`` into
+            the user prompt.
+        report_content: Text of the report, inserted into the user prompt.
+
+    Returns:
+        A ``Hypothesis`` built from the ``"hypothesis"`` and ``"reason"`` keys of
+        the JSON response; a placeholder text is used for a missing key.
+
+    Raises:
+        json.JSONDecodeError: If the response is not valid JSON.
+    """
+    # One strict environment for both templates: an undefined template variable
+    # raises instead of rendering as an empty string.
+    env = Environment(undefined=StrictUndefined)
+    hypothesis_prompts = prompts["hypothesis_generation"]
+    system_prompt = env.from_string(hypothesis_prompts["system"]).render()
+    user_prompt = env.from_string(hypothesis_prompts["user"]).render(
+        factor_descriptions=json.dumps(factor_result),
+        report_content=report_content,
+    )
+
+    response = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=system_prompt,
+        json_mode=True,
+    )
+
+    response_json = json.loads(response)
+    return Hypothesis(
+        hypothesis=response_json.get("hypothesis", "No hypothesis generated."),
+        reason=response_json.get("reason", "No reason provided."),
+    )
+
+def extract_factors_and_implement(report_file_path: str) -> tuple:
+    """Load the factor experiment described by a report and derive a hypothesis for it.
+
+    Args:
+        report_file_path: Path of the PDF report to process.
+
+    Returns:
+        An ``(experiment, hypothesis)`` pair, or ``(None, None)`` when the loader
+        yields no experiment or an experiment without sub-tasks.
+    """
+    # NOTE(review): this instance is never used below; it is kept only in case
+    # the constructor has side effects the pipeline relies on -- confirm and remove.
+    scenario = QlibFactorScenario()
+
+    with logger.tag("extract_factors_and_implement"):
+        with logger.tag("load_factor_tasks"):
+            exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
+            # Truthiness rather than ``== []`` so that ``None`` or an empty
+            # non-list sequence is also treated as "no tasks" instead of
+            # reaching the iteration below.
+            if exp is None or not exp.sub_tasks:
+                return None, None
+
+    docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
+
+    # Summarize every extracted factor task for the hypothesis prompt.
+    factor_result = {
+        task.factor_name: {
+            "description": task.factor_description,
+            "formulation": task.factor_formulation,
+            "variables": task.variables,
+            "resources": task.factor_resources,
+        }
+        for task in exp.sub_tasks
+    }
+
+    report_content = "\n".join(docs_dict.values())
+    hypothesis = generate_hypothesis(factor_result, report_content)
+
+    return exp, hypothesis
+
+trace, start_index = load_progress()
+
+# Resume from the stored index and process class-1 reports one by one. Progress
+# is saved after every completed report, and once more (at the current index)
+# if an exception interrupts the loop, before the exception is re-raised.
+try:
+    judge_pdf_data_items = list(judge_pdf_data.items())
+    for index in range(start_index, len(judge_pdf_data_items)):
+        if index > 1000:
+            break
+
+        file_path, attributes = judge_pdf_data_items[index]
+        if attributes["class"] != 1:
+            continue
+
+        # Map the path recorded in the JSON file onto the local report folder.
+        local_path = file_path.replace("/data/home/xiaoyang/data/ftp/amc_origin_file/report", "/home/finco/data/report")
+        report_file_path = Path(local_path)
+        if not report_file_path.exists():
+            print(f"File not found: {report_file_path}")
+            continue
+
+        print(f"Processing {report_file_path}")
+        exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+        if exp is None:
+            continue
+
+        # Base the new experiment on every earlier experiment whose feedback is
+        # truthy; fall back to a single empty experiment when there is none.
+        exp.based_experiments = [past[1] for past in trace.hist if past[2]]
+        if not exp.based_experiments:
+            exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+
+        exp = qlib_factor_coder.develop(exp)
+        exp = qlib_factor_runner.develop(exp)
+        if exp is None:
+            logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+            continue
+
+        feedback = qlib_factor_summarizer.generateFeedback(exp, hypothesis, trace)
+        trace.hist.append((hypothesis, exp, feedback))
+        print(f"Processed {report_file_path}: Result: {exp}")
+
+        # Save progress after processing each report
+        save_progress(trace, index + 1)
+except Exception as e:
+    logger.error(f"An error occurred: {e}")
+    save_progress(trace, index)
+    raise
diff --git a/rdagent/app/qlib_rd_loop/prompts.yaml b/rdagent/app/qlib_rd_loop/prompts.yaml
@@ -0,0 +1,15 @@
+# Prompt templates for hypothesis generation. The `user` template contains the
+# placeholders `factor_descriptions` and `report_content`; the `system` template
+# has no placeholders and asks for a JSON object with "hypothesis" and "reason".
+hypothesis_generation:
+  system: |-
+    You are an expert in financial analysis. Your task is to generate a well-reasoned hypothesis based on the provided financial factors and report content.
+    Please ensure your response is in JSON format as shown below:
+    {
+      "hypothesis": "A clear and concise hypothesis based on the provided information.",
+      "reason": "A detailed explanation supporting the generated hypothesis."
+    }
+
+  user: |-
+    The following are the financial factors and their descriptions:
+    {{ factor_descriptions }}
+
+    The report content is as follows:
+    {{ report_content }}
diff --git a/rdagent/app/qlib_rd_loop/run_script.sh b/rdagent/app/qlib_rd_loop/run_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Run factor_from_report_sh.py repeatedly until it exits successfully or
+# max_retries attempts have failed. All output is appended to $log_file.
+# Exit status: 0 on success, 1 when every attempt failed.
+
+max_retries=1000
+count=0
+log_file="/home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.log"
+
+while [ "$count" -lt "$max_retries" ]; do
+  echo "$(date) - Attempt $count of $max_retries" >> "$log_file"
+  # Test the command directly instead of inspecting $? afterwards.
+  if /home/finco/anaconda3/envs/rdagent/bin/python /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/factor_from_report_sh.py >> "$log_file" 2>&1; then
+    echo "$(date) - Script completed successfully on attempt $count" >> "$log_file"
+    break
+  fi
+  count=$((count + 1))
+  echo "$(date) - Restarting script after crash... Attempt $count of $max_retries" >> "$log_file"
+done
+
+if [ "$count" -ge "$max_retries" ]; then
+  echo "$(date) - Script failed after $max_retries attempts." >> "$log_file"
+  # Report the failure to the caller instead of always exiting with 0.
+  exit 1
+else
+  echo "$(date) - Script completed successfully." >> "$log_file"
+fi
+
+# chmod +x /home/finco/v-yuanteli/RD-Agent/rdagent/app/qlib_rd_loop/run_script.sh
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -71,6 +71,14 @@ def evolve(
             implementation_factors_per_round = int(
                 FACTOR_IMPLEMENT_SETTINGS.select_ratio * len(to_be_finished_task_index)
             )
+
+            # Ensure at least one task is selected
+            if implementation_factors_per_round == 0:
+                implementation_factors_per_round = 1
+
+            if implementation_factors_per_round > len(to_be_finished_task_index):
+                implementation_factors_per_round = len(to_be_finished_task_index)
+
             if FACTOR_IMPLEMENT_SETTINGS.select_method == "random":
                 to_be_finished_task_index = RandomSelect(
                     to_be_finished_task_index,

diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
@@ -109,7 +109,8 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                 raise CodeFormatException(self.FB_CODE_NOT_SET)
             else:
                 # TODO: to make the interface compatible with previous code. I kept the original behavior.
-                raise ValueError(self.FB_CODE_NOT_SET)
+                # raise ValueError(self.FB_CODE_NOT_SET)
+                return self.FB_CODE_NOT_SET, None
         with FileLock(self.workspace_path / "execution.lock"):
             if FACTOR_IMPLEMENT_SETTINGS.enable_execution_cache:
                 # NOTE: cache the result for the same code and same data type

diff --git a/rdagent/scenarios/qlib/developer/factor_runner.py b/rdagent/scenarios/qlib/developer/factor_runner.py
@@ -60,7 +60,9 @@ def develop(self, exp: QlibFactorExperiment) -> QlibFactorExperiment:
             new_factors = self.process_factor_data(exp)
 
             if new_factors.empty:
-                raise FactorEmptyException("No valid factor data found to merge.")
+                # raise FactorEmptyException("No valid factor data found to merge.")
+                logger.error("No valid factor data found to merge.")
+                return None
 
             # Combine the SOTA factor and new factors if SOTA factor exists
             if SOTA_factor is not None and not SOTA_factor.empty: