@@ -1,4 +1,4 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -69,10 +69,10 @@
 )

 __all__ = [
-    "DeepseekV2LMHeadNet",
-    "DeepseekV2ForCausalLMNet",
-    "DeepseekV2ModelNet",
-    "DeepseekV2PretrainedModelNet",
+    "DeepseekV2LMHeadAuto",
+    "DeepseekV2ForCausalLMAuto",
+    "DeepseekV2ModelAuto",
+    "DeepseekV2PretrainedModelAuto",
 ]

@@ -169,7 +169,7 @@ def scaled_dot_product_attention(
     return (attn_output, attn_weights) if output_attentions else attn_output


-class DeepseekV2MLPNet(nn.Layer):
+class DeepseekV2MLPAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config, hidden_size=None, intermediate_size=None, is_moe=False):
         super().__init__()
         self.config = config
@@ -187,7 +187,7 @@ def forward(self, x):
         return down_proj


-class DeepseekV2MoENet(MoELayer):
+class DeepseekV2MoEAuto(MoELayer):
     """
     A mixed expert module containing shared experts.
     """
@@ -209,15 +209,15 @@ def __init__(self, config: DeepseekV2Config):
         super().__init__(
             config=config,
             moe_num_experts=config.n_routed_experts,
-            expert_class=DeepseekV2MLPNet,
+            expert_class=DeepseekV2MLPAuto,
             expert_kwargs={"config": config, "intermediate_size": config.moe_intermediate_size},
             gate=gate,
             capacity=2.0,
         )
         self.alpha = config.aux_loss_alpha
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekV2MLPNet(config=config, intermediate_size=intermediate_size, is_moe=True)
+            self.shared_experts = DeepseekV2MLPAuto(config=config, intermediate_size=intermediate_size, is_moe=True)

     def forward(self, hidden_states):
         final_hidden_states, l_aux, l_zloss = super().forward(hidden_states)
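For reference, `DeepseekV2MoEAuto.forward` (only partially visible in the hunk above) lets the parent `MoELayer` handle routed-expert dispatch and then, when `config.n_shared_experts` is set, adds the dense shared-expert branch built in `__init__` on top of the routed output. A minimal sketch of that combination, assuming the parent `forward` returns the routed output together with the auxiliary losses exactly as shown in the diff; every other name here is illustrative:

```python
# Sketch only: how the shared-expert branch combines with the routed-expert
# output in DeepseekV2MoEAuto.forward. `super_forward` stands in for
# MoELayer.forward and `shared_experts` for the DeepseekV2MLPAuto built when
# config.n_shared_experts is not None.
def moe_forward_sketch(hidden_states, super_forward, shared_experts=None):
    final_hidden_states, l_aux, l_zloss = super_forward(hidden_states)
    if shared_experts is not None:
        # Shared experts process every token; their output is summed with the
        # token-routed result rather than being gated.
        final_hidden_states = final_hidden_states + shared_experts(hidden_states)
    return final_hidden_states, l_aux, l_zloss
```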
@@ -231,7 +231,7 @@ def forward(self, hidden_states):


 # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
-class DeepseekV2AttentionNet(nn.Layer):
+class DeepseekV2AttentionAuto(nn.Layer):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: DeepseekV2Config, layerwise_recompute: bool = False):
@@ -393,9 +393,7 @@ def forward(
         # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
         # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

-        print("k_nope.shape", k_nope.shape, "k_pe.shape", k_pe.shape)
         key_states = paddle.empty([bsz, q_len, self.num_heads, self.q_head_dim], dtype=self.config.dtype)
-        print("key_states.shape:", key_states.shape)
         # input[0]'s shape = [1, 2048, 16, 128], input[1]'s shape = [1, 2048, 1, 64].
         key_states = paddle.concat([k_nope, k_pe.expand([bsz, q_len, self.num_heads, k_pe.shape[-1]])], axis=3)

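The two deleted lines were leftover debug `print` calls around the key construction; the surviving logic is the interesting part: the rotary component `k_pe` carries a single head that is broadcast across all attention heads and concatenated with the per-head `k_nope` component along the last axis. A small standalone sketch of that shape arithmetic, using the example dimensions from the comment kept in the diff (batch 1, 2048 tokens, 16 heads, 128 nope dims, 64 rope dims):

```python
import paddle

# Example shapes taken from the comment in the diff:
#   k_nope: [1, 2048, 16, 128]  per-head, non-rotary part of the key
#   k_pe:   [1, 2048, 1, 64]    single-head rotary part, shared by all heads
bsz, q_len, num_heads = 1, 2048, 16
qk_nope_head_dim, qk_rope_head_dim = 128, 64

k_nope = paddle.zeros([bsz, q_len, num_heads, qk_nope_head_dim])
k_pe = paddle.zeros([bsz, q_len, 1, qk_rope_head_dim])

# Broadcast the rotary head across all heads, then join on the head-dim axis,
# mirroring the paddle.concat call that survives in this hunk.
key_states = paddle.concat(
    [k_nope, k_pe.expand([bsz, q_len, num_heads, qk_rope_head_dim])], axis=3
)
print(key_states.shape)  # [1, 2048, 16, 192], i.e. qk_nope_head_dim + qk_rope_head_dim
```

Note that `paddle.concat` produces a fresh tensor, so the preceding `paddle.empty` allocation in the hunk is effectively dead and only documents the expected shape.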
@@ -456,7 +454,7 @@ def forward(
         return attn_output, attn_weights, past_key_value


-class DeepseekV2DecoderLayerNet(nn.Layer):
+class DeepseekV2DecoderLayerAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute: bool = False):
         super().__init__()
         self.config = config
@@ -467,16 +465,16 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute

         self.hidden_size = config.hidden_size

-        self.self_attn = DeepseekV2AttentionNet(config=config, layerwise_recompute=layerwise_recompute)
+        self.self_attn = DeepseekV2AttentionAuto(config=config, layerwise_recompute=layerwise_recompute)

         self.mlp = (
-            DeepseekV2MoENet(config)
+            DeepseekV2MoEAuto(config)
             if (
                 config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0
             )
-            else DeepseekV2MLPNet(config)
+            else DeepseekV2MLPAuto(config)
         )
         self.input_layernorm = DeepseekV2RMSNorm(config)
         self.post_attention_layernorm = DeepseekV2RMSNorm(config)
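The renamed `self.mlp` expression keeps the original layer-placement rule: a layer gets the MoE block only when routed experts are configured, its index is at or beyond `config.first_k_dense_replace`, and the index is divisible by `config.moe_layer_freq`; otherwise it gets the dense MLP. A quick standalone sketch of that rule (the numeric values below are made-up examples, not defaults read from this file):

```python
# Illustrative values only -- not the real DeepseekV2Config defaults.
n_routed_experts = 64
first_k_dense_replace = 1   # keep the first k layers dense
moe_layer_freq = 2          # after that, every 2nd layer is MoE
num_hidden_layers = 8

def uses_moe(layer_idx: int) -> bool:
    # Same condition as the ternary in DeepseekV2DecoderLayerAuto.__init__.
    return (
        n_routed_experts is not None
        and layer_idx >= first_k_dense_replace
        and layer_idx % moe_layer_freq == 0
    )

print(["moe" if uses_moe(i) else "dense" for i in range(num_hidden_layers)])
# ['dense', 'dense', 'moe', 'dense', 'moe', 'dense', 'moe', 'dense']
```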
@@ -566,16 +564,16 @@ def forward(
         return outputs


-class DeepseekV2PretrainedModelNet(PretrainedModel):
+class DeepseekV2PretrainedModelAuto(PretrainedModel):
     config_class = DeepseekV2Config
     base_model_prefix = "deepseek_v2"
-    _no_split_modules = ["DeepseekV2DecoderLayerNet"]
+    _no_split_modules = ["DeepseekV2DecoderLayerAuto"]


 @register_base_model
-class DeepseekV2ModelNet(DeepseekV2PretrainedModelNet):
+class DeepseekV2ModelAuto(DeepseekV2PretrainedModelAuto):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayerNet`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayerAuto`]

     Args:
         config: DeepseekV2Config
@@ -597,7 +595,7 @@ def __init__(self, config: DeepseekV2Config):

         self.layers = nn.LayerList(
             [
-                DeepseekV2DecoderLayerNet(config, layer_idx, layer_idx not in self.no_recompute_layers)
+                DeepseekV2DecoderLayerAuto(config, layer_idx, layer_idx not in self.no_recompute_layers)
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
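In the `nn.LayerList` above, the third positional argument is the decoder layer's `layerwise_recompute` flag, so recomputation is enabled for every layer whose index is not listed in `self.no_recompute_layers`. A tiny sketch of that flag computation (the values are placeholders, not ones taken from this diff):

```python
# Placeholder values for illustration only.
num_hidden_layers = 4
no_recompute_layers = {0, 3}  # e.g. first and last layers skip recompute

recompute_flags = [
    layer_idx not in no_recompute_layers for layer_idx in range(num_hidden_layers)
]
print(recompute_flags)  # [False, True, True, False]
```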
@@ -777,9 +775,9 @@ def forward(
         )


-class DeepseekV2LMHeadNet(nn.Layer):
+class DeepseekV2LMHeadAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config):
-        super(DeepseekV2LMHeadNet, self).__init__()
+        super(DeepseekV2LMHeadAuto, self).__init__()

         self.config = config

@@ -796,15 +794,15 @@ def forward(self, hidden_states, tensor_parallel_output=None):
         return logits


-class DeepseekV2ForCausalLMNet(DeepseekV2PretrainedModelNet):
+class DeepseekV2ForCausalLMAuto(DeepseekV2PretrainedModelAuto):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: DeepseekV2Config):
         super().__init__(config)
         self.config = config
-        self.deepseek_v2 = DeepseekV2ModelNet(config)
+        self.deepseek_v2 = DeepseekV2ModelAuto(config)
         self.vocab_size = config.vocab_size
-        self.lm_head = DeepseekV2LMHeadNet(config)
+        self.lm_head = DeepseekV2LMHeadAuto(config)
         self.criterion = DeepseekV2PretrainingCriterion(config)

     def get_input_embeddings(self):
@@ -851,9 +849,9 @@ def forward(
         Example:

         ```python
-        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLMNet
+        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLMAuto

-        >>> model = DeepseekV2ForCausalLMNet.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = DeepseekV2ForCausalLMAuto.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

         >>> prompt = "Hey, are you conscious? Can you talk to me?"