#!/bin/bash
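# Supervised fine-tuning (SFT) launcher for the ARM-Thinker Qwen2.5-VL-7B run
# via LLaMA-Factory. The distributed settings are expected to come from the
# launcher environment (e.g. a scheduler or wrapper script):
#   NNODES, PROC_PER_NODE, NODE_RANK, MASTER_ADDR, MASTER_PORT
# A minimal single-node invocation might look like (values illustrative only):
#   NNODES=1 PROC_PER_NODE=8 NODE_RANK=0 MASTER_ADDR=127.0.0.1 MASTER_PORT=29500 bash sft.sh
set -euo pipefail  # added safeguard (not in the original): abort on errors and unset variables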
# variables
script_name=arm_thinker_sft
full_batch_size=256
per_device_batch_size=2
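# Keep the effective global batch size fixed at ${full_batch_size} by deriving
# the accumulation steps, e.g. with 1 node x 8 GPUs: 256 / (2 * 8 * 1) = 16.
# Integer division truncates, so the product should divide full_batch_size evenly.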
gradient_accumulation_steps=$((full_batch_size / (per_device_batch_size * PROC_PER_NODE * NNODES)))
echo "gradient_accumulation_steps: ${gradient_accumulation_steps}"
epoch=1.0
lr=2e-5
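# Upper bound on image resolution before patchification. Qwen2.5-VL groups
# pixels into 28x28-pixel visual tokens (14x14 patches with 2x2 merging), so
# 4096 * 28 * 28 admits roughly 4096 visual tokens per image (our reading of
# the constant, not a documented guarantee).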
image_max_pixels=$((4096 * 28 * 28))
cutoff_len=32768
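# Note: this value looks like a local Hugging Face cache snapshot directory;
# when loading from the Hub instead, the repo id would be
# Qwen/Qwen2.5-VL-7B-Instruct. Kept as-is to match the original setup.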
model_name_or_path=models--Qwen--Qwen2.5-VL-7B-Instruct
tokenized_path=/path/to/LLaMA-Factory/tokenized_cache/${script_name}
mkdir -p "${tokenized_path}"
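# concat appends the datasets back to back; LLaMA-Factory's alternatives
# (interleave_under / interleave_over) sample across datasets instead.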
mix_strategy=concat
save_only_model=true
output_dir=/path/to/LLaMA-Factory/saves/qwen2_5_vl_7b/full_sft/${script_name}
# log
log_dir=${output_dir}/logs
mkdir -p "${log_dir}"
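# Launch one process per GPU on every node. DeepSpeed ZeRO-3 (ds_z3_config)
# shards parameters, gradients, and optimizer states across ranks. Only the
# language model is trained; the vision tower and multimodal projector are
# frozen. stdout/stderr are mirrored to a timestamped log file via tee.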
torchrun \
--nnodes "${NNODES}" --nproc_per_node "${PROC_PER_NODE}" --node_rank "${NODE_RANK}" --master_addr "${MASTER_ADDR}" --master_port "${MASTER_PORT}" \
/path/to/LLaMA-Factory/src/train.py \
--deepspeed /path/to/LLaMA-Factory/examples/deepspeed/ds_z3_config.json \
--model_name_or_path ${model_name_or_path} \
--image_max_pixels ${image_max_pixels} \
--crop_to_patches true \
--trust_remote_code true \
--stage sft \
--do_train true \
--finetuning_type full \
--freeze_vision_tower true \
--freeze_multi_modal_projector true \
--freeze_language_model false \
--dataset ARM-Thinker-SFT-Data \
--template qwen2_vl \
--cutoff_len ${cutoff_len} \
--overwrite_cache true \
--tokenized_path ${tokenized_path} \
--preprocessing_num_workers 32 \
--dataloader_num_workers 32 \
--mix_strategy ${mix_strategy} \
--output_dir ${output_dir} \
--logging_steps 5 \
--plot_loss true \
--overwrite_output_dir false \
--save_only_model ${save_only_model} \
--per_device_train_batch_size ${per_device_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${epoch} \
--lr_scheduler_type cosine \
--save_steps 100 \
--warmup_ratio 0.1 \
--bf16 true \
--ddp_timeout 180000000 \
--flash_attn fa2 \
--report_to tensorboard \
2>&1 | tee -a "${log_dir}/training_log_$(date +%Y%m%d_%H%M%S).txt"