@@ -1,4 +1,4 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -69,10 +69,10 @@
 )

 __all__ = [
-    "DeepseekV2LMHeadNet",
-    "DeepseekV2ForCausalLMNet",
-    "DeepseekV2ModelNet",
-    "DeepseekV2PretrainedModelNet",
+    "DeepseekV2LMHeadAuto",
+    "DeepseekV2ForCausalLMAuto",
+    "DeepseekV2ModelAuto",
+    "DeepseekV2PretrainedModelAuto",
 ]

@@ -169,7 +169,7 @@ def scaled_dot_product_attention(
     return (attn_output, attn_weights) if output_attentions else attn_output


-class DeepseekV2MLPNet(nn.Layer):
+class DeepseekV2MLPAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config, hidden_size=None, intermediate_size=None, is_moe=False):
         super().__init__()
         self.config = config
@@ -187,7 +187,7 @@ def forward(self, x):
         return down_proj


-class DeepseekV2MoENet(MoELayer):
+class DeepseekV2MoEAuto(MoELayer):
     """
     A mixed expert module containing shared experts.
     """
@@ -209,15 +209,15 @@ def __init__(self, config: DeepseekV2Config):
         super().__init__(
             config=config,
             moe_num_experts=config.n_routed_experts,
-            expert_class=DeepseekV2MLPNet,
+            expert_class=DeepseekV2MLPAuto,
             expert_kwargs={"config": config, "intermediate_size": config.moe_intermediate_size},
             gate=gate,
             capacity=2.0,
         )
         self.alpha = config.aux_loss_alpha
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekV2MLPNet(config=config, intermediate_size=intermediate_size, is_moe=True)
+            self.shared_experts = DeepseekV2MLPAuto(config=config, intermediate_size=intermediate_size, is_moe=True)

     def forward(self, hidden_states):
         final_hidden_states, l_aux, l_zloss = super().forward(hidden_states)
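For reference, `DeepseekV2MoEAuto.forward` (only partially visible in the hunk above) lets the parent `MoELayer` handle routed-expert dispatch and then, when `config.n_shared_experts` is set, adds the dense shared-expert branch built in `__init__` on top of the routed output. A minimal sketch of that combination, assuming the parent `forward` returns the routed output together with the auxiliary losses exactly as shown in the diff; every other name here is illustrative:

```python
# Sketch only: how the shared-expert branch combines with the routed-expert
# output in DeepseekV2MoEAuto.forward. `super_forward` stands in for
# MoELayer.forward and `shared_experts` for the DeepseekV2MLPAuto built when
# config.n_shared_experts is not None.
def moe_forward_sketch(hidden_states, super_forward, shared_experts=None):
    final_hidden_states, l_aux, l_zloss = super_forward(hidden_states)
    if shared_experts is not None:
        # Shared experts process every token; their output is summed with the
        # token-routed result rather than being gated.
        final_hidden_states = final_hidden_states + shared_experts(hidden_states)
    return final_hidden_states, l_aux, l_zloss
```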
@@ -231,7 +231,7 @@ def forward(self, hidden_states):


 # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
-class DeepseekV2AttentionNet(nn.Layer):
+class DeepseekV2AttentionAuto(nn.Layer):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: DeepseekV2Config, layerwise_recompute: bool = False):
@@ -393,9 +393,7 @@ def forward(
         # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
         # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

-        print("k_nope.shape", k_nope.shape, "k_pe.shape", k_pe.shape)
         key_states = paddle.empty([bsz, q_len, self.num_heads, self.q_head_dim], dtype=self.config.dtype)
-        print("key_states.shape:", key_states.shape)
         # input[0]'s shape = [1, 2048, 16, 128], input[1]'s shape = [1, 2048, 1, 64].
         key_states = paddle.concat([k_nope, k_pe.expand([bsz, q_len, self.num_heads, k_pe.shape[-1]])], axis=3)

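The two deleted lines were leftover debug `print` calls around the key construction; the surviving logic is the interesting part: the rotary component `k_pe` carries a single head that is broadcast across all attention heads and concatenated with the per-head `k_nope` component along the last axis. A small standalone sketch of that shape arithmetic, using the example dimensions from the comment kept in the diff (batch 1, 2048 tokens, 16 heads, 128 nope dims, 64 rope dims):

```python
import paddle

# Example shapes taken from the comment in the diff:
#   k_nope: [1, 2048, 16, 128]  per-head, non-rotary part of the key
#   k_pe:   [1, 2048, 1, 64]    single-head rotary part, shared by all heads
bsz, q_len, num_heads = 1, 2048, 16
qk_nope_head_dim, qk_rope_head_dim = 128, 64

k_nope = paddle.zeros([bsz, q_len, num_heads, qk_nope_head_dim])
k_pe = paddle.zeros([bsz, q_len, 1, qk_rope_head_dim])

# Broadcast the rotary head across all heads, then join on the head-dim axis,
# mirroring the paddle.concat call that survives in this hunk.
key_states = paddle.concat(
    [k_nope, k_pe.expand([bsz, q_len, num_heads, qk_rope_head_dim])], axis=3
)
print(key_states.shape)  # [1, 2048, 16, 192], i.e. qk_nope_head_dim + qk_rope_head_dim
```

Note that `paddle.concat` produces a fresh tensor, so the preceding `paddle.empty` allocation in the hunk is effectively dead and only documents the expected shape.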
@@ -456,7 +454,7 @@ def forward(
         return attn_output, attn_weights, past_key_value


-class DeepseekV2DecoderLayerNet(nn.Layer):
+class DeepseekV2DecoderLayerAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute: bool = False):
         super().__init__()
         self.config = config
@@ -467,16 +465,16 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute

         self.hidden_size = config.hidden_size

-        self.self_attn = DeepseekV2AttentionNet(config=config, layerwise_recompute=layerwise_recompute)
+        self.self_attn = DeepseekV2AttentionAuto(config=config, layerwise_recompute=layerwise_recompute)

         self.mlp = (
-            DeepseekV2MoENet(config)
+            DeepseekV2MoEAuto(config)
             if (
                 config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0
             )
-            else DeepseekV2MLPNet(config)
+            else DeepseekV2MLPAuto(config)
         )
         self.input_layernorm = DeepseekV2RMSNorm(config)
         self.post_attention_layernorm = DeepseekV2RMSNorm(config)
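The renamed `self.mlp` expression keeps the original layer-placement rule: a layer gets the MoE block only when routed experts are configured, its index is at or beyond `config.first_k_dense_replace`, and the index is divisible by `config.moe_layer_freq`; otherwise it gets the dense MLP. A quick standalone sketch of that rule (the numeric values below are made-up examples, not defaults read from this file):

```python
# Illustrative values only -- not the real DeepseekV2Config defaults.
n_routed_experts = 64
first_k_dense_replace = 1   # keep the first k layers dense
moe_layer_freq = 2          # after that, every 2nd layer is MoE
num_hidden_layers = 8

def uses_moe(layer_idx: int) -> bool:
    # Same condition as the ternary in DeepseekV2DecoderLayerAuto.__init__.
    return (
        n_routed_experts is not None
        and layer_idx >= first_k_dense_replace
        and layer_idx % moe_layer_freq == 0
    )

print(["moe" if uses_moe(i) else "dense" for i in range(num_hidden_layers)])
# ['dense', 'dense', 'moe', 'dense', 'moe', 'dense', 'moe', 'dense']
```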
@@ -566,16 +564,16 @@ def forward(
         return outputs


-class DeepseekV2PretrainedModelNet(PretrainedModel):
+class DeepseekV2PretrainedModelAuto(PretrainedModel):
     config_class = DeepseekV2Config
     base_model_prefix = "deepseek_v2"
-    _no_split_modules = ["DeepseekV2DecoderLayerNet"]
+    _no_split_modules = ["DeepseekV2DecoderLayerAuto"]


 @register_base_model
-class DeepseekV2ModelNet(DeepseekV2PretrainedModelNet):
+class DeepseekV2ModelAuto(DeepseekV2PretrainedModelAuto):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayerNet`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayerAuto`]

     Args:
         config: DeepseekV2Config
@@ -597,7 +595,7 @@ def __init__(self, config: DeepseekV2Config):

         self.layers = nn.LayerList(
             [
-                DeepseekV2DecoderLayerNet(config, layer_idx, layer_idx not in self.no_recompute_layers)
+                DeepseekV2DecoderLayerAuto(config, layer_idx, layer_idx not in self.no_recompute_layers)
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
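In the `nn.LayerList` above, the third positional argument is the decoder layer's `layerwise_recompute` flag, so recomputation is enabled for every layer whose index is not listed in `self.no_recompute_layers`. A tiny sketch of that flag computation (the values are placeholders, not ones taken from this diff):

```python
# Placeholder values for illustration only.
num_hidden_layers = 4
no_recompute_layers = {0, 3}  # e.g. first and last layers skip recompute

recompute_flags = [
    layer_idx not in no_recompute_layers for layer_idx in range(num_hidden_layers)
]
print(recompute_flags)  # [False, True, True, False]
```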
@@ -777,9 +775,9 @@ def forward(
         )


-class DeepseekV2LMHeadNet(nn.Layer):
+class DeepseekV2LMHeadAuto(nn.Layer):
     def __init__(self, config: DeepseekV2Config):
-        super(DeepseekV2LMHeadNet, self).__init__()
+        super(DeepseekV2LMHeadAuto, self).__init__()

         self.config = config

@@ -796,15 +794,15 @@ def forward(self, hidden_states, tensor_parallel_output=None):
         return logits


-class DeepseekV2ForCausalLMNet(DeepseekV2PretrainedModelNet):
+class DeepseekV2ForCausalLMAuto(DeepseekV2PretrainedModelAuto):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: DeepseekV2Config):
         super().__init__(config)
         self.config = config
-        self.deepseek_v2 = DeepseekV2ModelNet(config)
+        self.deepseek_v2 = DeepseekV2ModelAuto(config)
         self.vocab_size = config.vocab_size
-        self.lm_head = DeepseekV2LMHeadNet(config)
+        self.lm_head = DeepseekV2LMHeadAuto(config)
         self.criterion = DeepseekV2PretrainingCriterion(config)

     def get_input_embeddings(self):
@@ -851,9 +849,9 @@ def forward(
         Example:

         ```python
-        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLMNet
+        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLMAuto

-        >>> model = DeepseekV2ForCausalLMNet.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = DeepseekV2ForCausalLMAuto.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

         >>> prompt = "Hey, are you conscious? Can you talk to me?"