forked from verl-project/verl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathentropy_trainer.yaml
More file actions
39 lines (33 loc) · 961 Bytes
/
entropy_trainer.yaml
File metadata and controls
39 lines (33 loc) · 961 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Hydra runtime settings.
hydra:
  # Extra entry for Hydra's config search path. Presumably this is where the
  # `ppo_trainer` base config named in `defaults` is found - confirm against
  # the repository layout.
  searchpath:
    - file://verl/trainer/config
# Hydra defaults list: `ppo_trainer` is composed first and this file
# (`_self_`) last, so the values set in this file take precedence.
defaults:
  - ppo_trainer
  - _self_
data:
  # Interpolation: resolves to whatever `data.train_batch_size` is set to,
  # so both sizes match unless this key is overridden.
  gen_batch_size: ${data.train_batch_size}
reward_model:
  reward_kwargs:
    # Interpolation: resolves to the `overlong_buffer` mapping defined below.
    overlong_buffer_cfg: ${reward_model.overlong_buffer}
  reward_manager: dapo
  # Target of `reward_kwargs.overlong_buffer_cfg`; disabled in this config.
  overlong_buffer:
    enable: false
    len: 0
    penalty_factor: 0.0
    log: false
algorithm:
  # Group-filtering options; switched off in this config.
  filter_groups:
    enable: false  # Set explicitly so the flag is never left undefined
    metric: null  # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0  # Non-positive values mean no upper limit
trainer:
  # Project name for runs launched with this config; presumably used to group
  # runs in the experiment logger - confirm against the trainer code.
  project_name: verl-entropy
actor_rollout_ref:
  actor:
    # Policy-loss selection and the hyperparameters of the covariance-based
    # variants. The clip-cov / kl-cov keys matter only when the matching
    # loss mode is selected (per the inline notes).
    policy_loss:
      # Modes: vanilla / clip-cov / kl-cov. The latter two presumably come
      # from https://arxiv.org/abs/2505.22617 - TODO confirm the arXiv id and
      # the exact mode strings accepted by the loss implementation.
      loss_mode: "vanilla"
      clip_cov_ratio: 0.0002  # for clip-cov loss
      clip_cov_lb: 1.0  # for clip-cov loss
      clip_cov_ub: 5.0  # for clip-cov loss
      kl_cov_ratio: 0.0002  # for kl-cov loss
      ppo_kl_coef: 0.1  # for kl-cov loss