diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_576_bf16.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_576_bf16.cu index 30d04287fd..ddfe281a9a 100644 --- a/src/turbomind/kernels/attention/codegen/attention_sm80_576_bf16.cu +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_576_bf16.cu @@ -7,8 +7,7 @@ namespace turbomind { using namespace attention; -template void invokeAttention< - typename AttentionConfig::Kernel>( +template void invokeAttention::Kernel>( const AttentionParams& params); } // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_bf16.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_bf16.cu index da60f32a51..41fc8587f4 100644 --- a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_bf16.cu +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_bf16.cu @@ -7,16 +7,16 @@ namespace turbomind { using namespace attention; -template bool invokeDecoding>( - const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); -template bool invokeDecoding>( - const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); -template bool invokeDecoding>( - const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); -template bool invokeDecoding>( - const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); } // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u4.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u4.cu index 9839d612cb..9d7d518030 100644 --- a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u4.cu +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u4.cu @@ -7,10 +7,8 @@ namespace turbomind { using namespace attention; -template bool invokeDecoding>( - const AttentionParams&); +template bool invokeDecoding>(const AttentionParams&); -template bool invokeDecoding>( - const AttentionParams&); +template bool invokeDecoding>(const AttentionParams&); } // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u8.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u8.cu index a358b31627..60ef9cd3a9 100644 --- a/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u8.cu +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_576_bf16_u8.cu @@ -7,10 +7,8 @@ namespace turbomind { using namespace attention; -template bool invokeDecoding>( - const AttentionParams&); +template bool invokeDecoding>(const AttentionParams&); -template bool invokeDecoding>( - const AttentionParams&); +template bool invokeDecoding>(const AttentionParams&); } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 59ae583c95..46e7ba6045 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -691,20 +691,20 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // noaux_tc: scores = scoring_func(logits), scores_for_choice = scores + correction_bias, // top-k on scores_for_choice, weights from scores; renormalize; apply routed_scale. // Threading: one token per block, threads cooperate over expert dimension. -__global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] - int8_t* masks, // [experts, tokens_padded] - int* accum, // [experts, tiles] - const float* logits, // [tokens, experts] - const float* bias, // [experts] or nullptr - int tokens, - int tokens_padded, - int experts, - int top_k, - bool norm_topk, - float routed_scale, - int log_tile, - int tiles, - bool use_sigmoid) +__global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] + int8_t* masks, // [experts, tokens_padded] + int* accum, // [experts, tiles] + const float* logits, // [tokens, experts] + const float* bias, // [experts] or nullptr + int tokens, + int tokens_padded, + int experts, + int top_k, + bool norm_topk, + float routed_scale, + int log_tile, + int tiles, + bool use_sigmoid) { const int ti = blockIdx.x; // one token per block if (ti >= tokens) { @@ -712,16 +712,16 @@ __global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] } extern __shared__ char smem[]; - float* scores = (float*)smem; - float* scores_for_choice = scores + experts; + float* scores = (float*)smem; + float* scores_for_choice = scores + experts; const float* row = logits + ti * experts; if (use_sigmoid) { // Sigmoid scoring: scores[e] = 1 / (1 + exp(-logit[e])) for (int e = threadIdx.x; e < experts; e += blockDim.x) { - float s = 1.0f / (1.0f + expf(-row[e])); - scores[e] = s; + float s = 1.0f / (1.0f + expf(-row[e])); + scores[e] = s; scores_for_choice[e] = s + (bias ? bias[e] : 0.f); } } @@ -739,7 +739,7 @@ __global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] float sum_exp = 0.f; for (int e = threadIdx.x; e < experts; e += blockDim.x) { - float s = expf(row[e] - max_logit); + float s = expf(row[e] - max_logit); scores[e] = s; sum_exp += s; } @@ -747,8 +747,8 @@ __global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] __syncthreads(); for (int e = threadIdx.x; e < experts; e += blockDim.x) { - float s = scores[e] / (sum_exp + 1e-20f); - scores[e] = s; + float s = scores[e] / (sum_exp + 1e-20f); + scores[e] = s; scores_for_choice[e] = s + (bias ? bias[e] : 0.f); } } @@ -784,7 +784,7 @@ __global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] } } if (best_e < 0) { - best_e = 0; + best_e = 0; topk_val[k] = 0.f; } else { @@ -819,29 +819,29 @@ __global__ void MoeGateNoAuxTCKernel(float* scales, // [top_k, tokens] } void invokeMoeGate_NoAuxTC(int* f2n, - int* f2E, - int* en2f, - int* offsets, - float* scales, - void* masks, - int* accum, - const float* logits, - const float* correction_bias, - int tokens, - int tokens_padded, - int experts, - int exp_per_tok, - bool norm_topk_prob, - float routed_scale, - bool use_sigmoid, - cudaStream_t st) + int* f2E, + int* en2f, + int* offsets, + float* scales, + void* masks, + int* accum, + const float* logits, + const float* correction_bias, + int tokens, + int tokens_padded, + int experts, + int exp_per_tok, + bool norm_topk_prob, + float routed_scale, + bool use_sigmoid, + cudaStream_t st) { TM_CHECK(exp_per_tok > 0); TM_CHECK_LE(exp_per_tok, 32); TM_CHECK_LE(exp_per_tok, experts); constexpr int base_log_tile = 9; - int log_tile = base_log_tile; + int log_tile = base_log_tile; while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { ++log_tile; } @@ -855,8 +855,8 @@ void invokeMoeGate_NoAuxTC(int* f2n, while (block_dim < experts && block_dim < 256) { block_dim *= 2; // next power of 2 } - const int blocks = tokens; - const size_t smem = sizeof(float) * experts * 2; + const int blocks = tokens; + const size_t smem = sizeof(float) * experts * 2; MoeGateNoAuxTCKernel<<>>(scales, (int8_t*)masks, @@ -875,17 +875,8 @@ void invokeMoeGate_NoAuxTC(int* f2n, constexpr int scan_threads = (1 << base_log_tile) / kMoeGateVecSize; const dim3 scan_blocks(tiles, experts + 1); - MoeScanKernel_v2<<>>(f2n, - f2E, - en2f, - offsets, - (int8_t*)masks, - accum, - log_tile, - tiles, - tokens, - tokens_padded, - experts); + MoeScanKernel_v2<<>>( + f2n, f2E, en2f, offsets, (int8_t*)masks, accum, log_tile, tiles, tokens, tokens_padded, experts); } template diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 3f42bd1166..5c82a377f0 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -91,9 +91,8 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight( // ffn_weight_type for their shared experts (int4 for mixed AWQ, // bfloat16 for GptOss mxfp4, same as weight_type otherwise). if (inter_size_) { - const bool is_moe_layer = layer_id < (int)moe_param.expert_num.size() - && moe_param.expert_num[layer_id]; - const DataType ffn_wtype = is_moe_layer ? model.ffn_weight_type : weight_type_; + const bool is_moe_layer = layer_id < (int)moe_param.expert_num.size() && moe_param.expert_num[layer_id]; + const DataType ffn_wtype = is_moe_layer ? model.ffn_weight_type : weight_type_; const bool is_cublas_gemm = byte_size(ffn_wtype, 8) == 16; ffn_weights.reset(new LlamaFfnWeight{ hidden_units_, diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 9a161f5d6a..82504af47e 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -45,9 +45,9 @@ struct ModelParam { // Full AWQ int4 int4 int4 // Mixed AWQ float16 int4 int4 // GptOss mxfp4 bfloat16 bfloat16 e2m1 - DataType weight_type; // attention weights - DataType expert_weight_type; // MoE routed expert weights - DataType ffn_weight_type; // dense FFN / shared expert weights + DataType weight_type; // attention weights + DataType expert_weight_type; // MoE routed expert weights + DataType ffn_weight_type; // dense FFN / shared expert weights int group_size; MLAParam mla; diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 081bc7d709..f1a16f5a68 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -130,21 +130,21 @@ void MoeFfnLayer::Forward(ForwardParam& p) /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_.data(), - f2E_.data(), - en2f_.data(), - offsets_.data(), - scales_.data(), - masks_.data(), - accum_.data(), - logits.data(), - tokens, - padded, - expert_num, - param_.experts_per_token, - softmax, - param_.norm_topk_prob, - param_.routed_scale, - st); + f2E_.data(), + en2f_.data(), + offsets_.data(), + scales_.data(), + masks_.data(), + accum_.data(), + logits.data(), + tokens, + padded, + expert_num, + param_.experts_per_token, + softmax, + param_.norm_topk_prob, + param_.routed_scale, + st); } sync_check_cuda_error(); diff --git a/src/turbomind/turbomind.cc b/src/turbomind/turbomind.cc index 2406961991..1525a3afae 100644 --- a/src/turbomind/turbomind.cc +++ b/src/turbomind/turbomind.cc @@ -466,8 +466,8 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac model_param_.weight_type = data_type_from_string(model["weight_type"].as()); model_param_.expert_weight_type = data_type_from_string(model["expert_weight_type"].as()); - model_param_.ffn_weight_type = data_type_from_string( - model["ffn_weight_type"].as(model["weight_type"].as())); + model_param_.ffn_weight_type = + data_type_from_string(model["ffn_weight_type"].as(model["weight_type"].as())); if (auto method = get_moe_method()) { moe_param_.method = *method;