Skip to content

Commit ca560a8

Browse files
committed
合入Eval与Optimize
当前问题 1. eval 存在部分参数差异(path,csv测试) 2. optimize 尝试新流程(优化后的optimize曲线);optimize 模版书写 3. optimize 在各个数据集上跑通 4. 创建baseline folder 5. 创建experiment data收集方法 6. 从ags中移出
1 parent c390341 commit ca560a8

File tree

172 files changed

+8742
-1255
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

172 files changed

+8742
-1255
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,4 @@ cov.xml
189189
*.dot
190190
.python-version
191191
*.jsonl
192+
*.json

config/config2.yaml

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,39 @@
11
# Full Example: https://github.com/geekan/MetaGPT/blob/main/config/config2.example.yaml
22
# Reflected Code: https://github.com/geekan/MetaGPT/blob/main/metagpt/config2.py
33
# Config Docs: https://docs.deepwisdom.ai/main/en/guide/get_started/configuration.html
4+
5+
# DeepSeek
46
llm:
5-
api_type: "openai" # or azure / ollama / groq etc.
6-
model: "gpt-4-turbo" # or gpt-3.5-turbo
7-
base_url: "https://api.openai.com/v1" # or forward url / other llm url
8-
api_key: "YOUR_API_KEY"
7+
api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options
8+
model: "deepseek-coder" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview
9+
base_url: "https://api.deepseek.com/v1" # or forward url / other llm url
10+
api_key: "sk-6e1b0fcbbfe94035a502801a080eeb6c"
11+
temperature: 1
12+
13+
# OpenRouter
14+
# llm:
15+
# api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options
16+
# model: "gpt-3.5-turbo-16k" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview
17+
# base_url: "https://oneapi.deepwisdom.ai/v1" # or forward url / other llm url
18+
# api_key: "sk-5TrFZV9UFVMaDJFm039f7128A50247068e9803166f0eEbF8"
19+
# temperature: 0
20+
21+
22+
# llm:
23+
# api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options
24+
# model: "anthropic/claude-3.5-sonnet" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview
25+
# base_url: "https://openrouter.ai/api/v1" # or forward url / other llm url
26+
# api_key: "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"
27+
# temperature: 0
28+
29+
models:
30+
"claude-3-5-sonnet-20240620": # model: "gpt-4-turbo" # or gpt-3.5-turbo
31+
api_type: "openai" # or azure / ollama / groq etc.
32+
base_url: "https://oneapi.deepwisdom.ai/v1"
33+
api_key: "sk-uI0WfR446hllU5BO446836325dF7440d8970EaC18f4830C1"
34+
temperature: 0
35+
"deepseek-coder": # api_type: "openai" # or azure / ollama / groq etc.
36+
api_type: "openai" # or azure / ollama / groq etc.
37+
base_url: "https://oneapi.deepwisdom.ai/v1"
38+
api_key: "sk-uI0WfR446hllU5BO446836325dF7440d8970EaC18f4830C1"
39+
temperature: 0

examples/ags/benchmark/hotpotqa.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
import aiofiles
99

10-
from examples.ags.w_action_node.graph import HotpotQAGraph
11-
from examples.ags.w_action_node.operator import Format, GenerateOnContext
12-
from examples.ags.w_action_node.utils import get_hotpotqa
10+
from examples.ags.scripts.graph import HotpotQAGraph
11+
from examples.ags.scripts.operator import Format, GenerateOnContext
12+
from examples.ags.scripts.utils import get_hotpotqa
1313
from metagpt.llm import LLM
1414
from metagpt.logs import logger
1515

examples/ags/benchmark/humaneval.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
import aiofiles
1414
from evalplus.data import get_human_eval_plus
1515

16-
from examples.ags.w_action_node.graph import HumanEvalGraph
17-
from examples.ags.w_action_node.operator import GenerateCodeBlock
18-
from examples.ags.w_action_node.utils import sort_json_by_key
16+
from examples.ags.scripts.graph import HumanEvalGraph
17+
from examples.ags.scripts.operator import GenerateCodeBlock
18+
from examples.ags.scripts.utils import sort_json_by_key
1919
from metagpt.llm import LLM
2020
from metagpt.logs import logger
2121
from metagpt.utils.common import add_jsonl_file, read_json_file
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import matplotlib.pyplot as plt
2+
import numpy as np
3+
4+
5+
def bootstrap_confidence_interval(data, num_bootstrap_samples=100000, confidence_level=0.95):
    """
    Calculate a percentile-bootstrap confidence interval for the mean of 1D data.

    The data is resampled with replacement ``num_bootstrap_samples`` times; the
    mean of every resample is recorded and the interval is read off the
    percentiles of those means.

    Parameters:
    - data (list or array of float): 1D data points (e.g. per-run accuracies).
    - num_bootstrap_samples (int): Number of bootstrap resamples to draw (>= 1).
    - confidence_level (float): Desired confidence level (e.g., 0.95 for 95%).

    Returns:
    - tuple: ``(ci_lower, ci_upper, median)`` where ``ci_lower``/``ci_upper`` are
      the bounds of the confidence interval and ``median`` is the median of the
      bootstrap means (not of the raw data).

    Raises:
    - ValueError: If ``data`` is empty or ``num_bootstrap_samples`` is less than 1.
    """
    data = np.array(data)
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one value")
    if num_bootstrap_samples < 1:
        raise ValueError("num_bootstrap_samples must be at least 1")

    # Draw many resamples per call instead of one per Python-loop iteration,
    # but cap the size of each batch so that long inputs do not allocate a
    # (num_bootstrap_samples x n) matrix all at once.
    max_elements_per_batch = 1_000_000
    batch_size = max(1, min(num_bootstrap_samples, max_elements_per_batch // n))

    mean_chunks = []
    for start in range(0, num_bootstrap_samples, batch_size):
        rows = min(batch_size, num_bootstrap_samples - start)
        resamples = np.random.choice(data, size=(rows, n), replace=True)
        mean_chunks.append(resamples.mean(axis=1))
    bootstrap_means = np.concatenate(mean_chunks)

    # Split the excluded probability mass evenly between the two tails.
    lower_percentile = (1.0 - confidence_level) / 2.0
    upper_percentile = 1.0 - lower_percentile
    ci_lower = np.percentile(bootstrap_means, lower_percentile * 100)
    ci_upper = np.percentile(bootstrap_means, upper_percentile * 100)
    median = np.median(bootstrap_means)

    return ci_lower, ci_upper, median
33+
34+
35+
# Simulated x-axis: 30 evenly spaced iteration counts running from 1 to 30.
iterations = np.linspace(1, 30, 30)

# Every iteration has five scores. The first five iterations are written out
# by hand; the remaining 25 are filled with uniformly random placeholder
# values so that the series reaches 30 iterations.
training_performance = np.array(
    [
        [0.68, 0.74, 0.69, 0.65, 0.76],
        [0.72, 0.79, 0.73, 0.80, 0.70],
        [0.77, 0.85, 0.76, 0.83, 0.74],
        [0.82, 0.90, 0.81, 0.88, 0.79],
        [0.87, 0.95, 0.86, 0.93, 0.84],
    ]
    + [np.random.uniform(0.85, 0.98, 5) for _ in range(25)]
)

# Same layout for the testing series: five hand-written iterations followed
# by 25 randomly simulated ones.
testing_performance = np.array(
    [
        [0.62, 0.69, 0.61, 0.70, 0.60],
        [0.67, 0.74, 0.66, 0.75, 0.65],
        [0.69, 0.77, 0.68, 0.78, 0.67],
        [0.72, 0.80, 0.71, 0.81, 0.70],
        [0.75, 0.83, 0.74, 0.84, 0.73],
    ]
    + [np.random.uniform(0.75, 0.90, 5) for _ in range(25)]
)

# One (lower, upper, median) triple per iteration, training series first.
training_ci = list(map(bootstrap_confidence_interval, training_performance))
testing_ci = list(map(bootstrap_confidence_interval, testing_performance))

# Transpose the per-iteration triples into three parallel sequences.
training_ci_lower, training_ci_upper, training_median = zip(*training_ci)
testing_ci_lower, testing_ci_upper, testing_median = zip(*testing_ci)

# Report the interval and median for every iteration.
for i in range(iterations.size):
    print(f"Iteration {i+1}:")
    print(
        f"  Training performance 95% CI: ({training_ci_lower[i]:.3f}, {training_ci_upper[i]:.3f}), Median: {training_median[i]:.3f}"
    )
    print(
        f"  Testing performance 95% CI: ({testing_ci_lower[i]:.3f}, {testing_ci_upper[i]:.3f}), Median: {testing_median[i]:.3f}"
    )

# Draw the figure: a median line plus a shaded confidence band per series,
# training first and testing second.
plt.figure(figsize=(10, 6))

for series_label, series_color, band_lower, band_upper, center_line in (
    ("Training Performance", "blue", training_ci_lower, training_ci_upper, training_median),
    ("Testing Performance", "red", testing_ci_lower, testing_ci_upper, testing_median),
):
    plt.plot(iterations, center_line, label=series_label, color=series_color)
    plt.fill_between(iterations, band_lower, band_upper, color=series_color, alpha=0.2)

# Axis labels, title, legend and grid.
plt.xlabel("Number of Iterations")
plt.ylabel("Performance on GSM8K")
plt.title("SOTimizer On GSM8K")
plt.legend()
plt.grid(True)

# Write the figure to disk, then display it.
plt.savefig("performance_vs_iterations.png")
plt.show()

0 commit comments

Comments
 (0)