diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py
index d0c1b38134..d1097f3fcf 100644
--- a/deepmd/descriptor/se_atten.py
+++ b/deepmd/descriptor/se_atten.py
@@ -1122,6 +1122,7 @@ def _filter_lower(
                             tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]),
                             two_embd,
                             last_layer_size=outputs_size[-1],
+                            is_sorted=len(self.exclude_types) == 0,
                         )
 
                 if (not self.uniform_seed) and (self.seed is not None):
diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h
index 395621d97e..2e2c021d9c 100644
--- a/source/lib/include/tabulate.h
+++ b/source/lib/include/tabulate.h
@@ -12,7 +12,8 @@ void tabulate_fusion_se_a_cpu(FPTYPE* out,
                               const FPTYPE* two_embed,
                               const int nloc,
                               const int nnei,
-                              const int last_layer_size);
+                              const int last_layer_size,
+                              const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x,
@@ -25,7 +26,8 @@ void tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x,
                                    const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei,
-                                   const int last_layer_size);
+                                   const int last_layer_size,
+                                   const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy,
@@ -37,7 +39,8 @@ void tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy,
                                         const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei,
-                                        const int last_layer_size);
+                                        const int last_layer_size,
+                                        const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_t_cpu(FPTYPE* out,
@@ -115,7 +118,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
                                    const FPTYPE* two_embed,
                                    const int nloc,
                                    const int nnei,
-                                   const int last_layer_size);
+                                   const int last_layer_size,
+                                   const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
@@ -128,7 +132,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
                                         const FPTYPE* dy,
                                         const int nloc,
                                         const int nnei,
-                                        const int last_layer_size);
+                                        const int last_layer_size,
+                                        const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
@@ -140,7 +145,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
                                              const FPTYPE* dz_dy_dem,
                                              const int nloc,
                                              const int nnei,
-                                             const int last_layer_size);
+                                             const int last_layer_size,
+                                             const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
@@ -219,7 +225,8 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
                                    const FPTYPE* two_embed,
                                    const int nloc,
                                    const int nnei,
-                                   const int last_layer_size);
+                                   const int last_layer_size,
+                                   const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
@@ -232,7 +239,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
                                         const FPTYPE* dy,
                                         const int nloc,
                                         const int nnei,
-                                        const int last_layer_size);
+                                        const int last_layer_size,
+                                        const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
@@ -244,7 +252,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
                                              const FPTYPE* dz_dy_dem,
                                              const int nloc,
                                              const int nnei,
-                                             const int last_layer_size);
+                                             const int last_layer_size,
+                                             const bool is_sorted = true);
 
 template <typename FPTYPE>
 void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out,
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
index 06d1d49057..bb05fd59a4 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/cuda/tabulate.cu
@@ -128,7 +128,8 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
@@ -141,7 +142,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
     FPTYPE xx = em_x[block_idx * nnei + ii];
-    if (xx == ago) {
+    if (xx == ago && is_sorted) {
       unloop = true;
       breakpoint = ii;
     }
@@ -191,7 +192,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
@@ -211,7 +213,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
   FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   for (int ii = warp_idx; ii < nnei; ii += KTILE) {
     FPTYPE xx = em_x[block_idx * nnei + ii];
-    if (ago == xx) {
+    if (ago == xx && is_sorted) {
       unloop = true;
       breakpoint = ii;
     }
@@ -286,7 +288,8 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
@@ -304,7 +307,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
   for (int ii = 0; ii < nnei; ii++) {
     FPTYPE xx = em_x[block_idx * nnei + ii];
     FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii];
-    if (xx == ago) {
+    if (xx == ago && is_sorted) {
       unloop = true;
       breakpoint = ii;
     }
@@ -626,16 +629,18 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
                                    const FPTYPE* two_embed,
                                    const int nloc,
                                    const int nnei,
-                                   const int last_layer_size) {
+                                   const int last_layer_size,
+                                   const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
-      <<<nloc, last_layer_size>>>(
-          out, table, em_x, em, two_embed, table_info[0], table_info[1],
-          table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+      <<<nloc, last_layer_size>>>(out, table, em_x, em, two_embed,
+                                  table_info[0], table_info[1], table_info[2],
+                                  table_info[3], table_info[4], nnei,
+                                  last_layer_size, is_sorted);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 }
@@ -651,7 +656,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
                                         const FPTYPE* dy,
                                         const int nloc,
                                         const int nnei,
-                                        const int last_layer_size) {
+                                        const int last_layer_size,
+                                        const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -664,7 +670,7 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
-          last_layer_size);
+          last_layer_size, is_sorted);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 }
@@ -679,7 +685,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
                                              const FPTYPE* dz_dy_dem,
                                              const int nloc,
                                              const int nnei,
-                                             const int last_layer_size) {
+                                             const int last_layer_size,
+                                             const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -690,7 +697,7 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
       <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
-          last_layer_size);
+          last_layer_size, is_sorted);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 }
@@ -852,7 +859,8 @@ template void tabulate_fusion_se_a_gpu_cuda<float>(float* out,
                                                    const float* two_embed,
                                                    const int nloc,
                                                    const int nnei,
-                                                   const int last_layer_size);
+                                                   const int last_layer_size,
+                                                   const bool is_sorted);
 template void tabulate_fusion_se_a_gpu_cuda<double>(double* out,
                                                     const double* table,
                                                     const double* table_info,
@@ -861,7 +869,8 @@ template void tabulate_fusion_se_a_gpu_cuda<double>(double* out,
                                                     const double* two_embed,
                                                     const int nloc,
                                                     const int nnei,
-                                                    const int last_layer_size);
+                                                    const int last_layer_size,
+                                                    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_gpu_cuda<float>(
     float* dy_dem_x,
     float* dy_dem,
@@ -873,7 +882,8 @@ template void tabulate_fusion_se_a_grad_gpu_cuda<float>(
     const float* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_gpu_cuda<double>(
     double* dy_dem_x,
     double* dy_dem,
@@ -885,7 +895,8 @@ template void tabulate_fusion_se_a_grad_gpu_cuda<double>(
     const double* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_grad_gpu_cuda<float>(
     float* dz_dy,
     const float* table,
@@ -896,7 +907,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda<float>(
     const float* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_grad_gpu_cuda<double>(
     double* dz_dy,
     const double* table,
@@ -907,7 +919,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda<double>(
     const double* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 
 template void tabulate_fusion_se_t_gpu_cuda<float>(float* out,
                                                    const float* table,
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
index 5aaf023262..6ae21d4e0b 100644
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ b/source/lib/src/rocm/tabulate.hip.cu
@@ -88,7 +88,8 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   HIP_DYNAMIC_SHARED(int, _data)
   const int_64 block_idx = blockIdx.x;  // nloc
@@ -105,7 +106,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
   for (int ii = 0; ii < nnei; ii++) {
     FPTYPE var[6];
     FPTYPE xx = em_x[block_idx * nnei + ii];
-    if (xx == ago) {
+    if (xx == ago && is_sorted) {
       unloop = true;
       breakpoint = ii;
     }
@@ -157,7 +158,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   HIP_DYNAMIC_SHARED(int, _data)
   const int_64 block_idx = blockIdx.x;  // nloc
@@ -177,7 +179,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
   FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
   for (int ii = 0; ii < nnei - warp_idx; ii += KTILE) {
     FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx];
-    if (ago == xx) {
+    if (ago == xx && is_sorted) {
       unloop = true;
       breakpoint = ii + warp_idx;
     }
@@ -260,7 +262,8 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const FPTYPE stride0,
     const FPTYPE stride1,
     const int nnei,
-    const int last_layer_size) {
+    const int last_layer_size,
+    const bool is_sorted) {
   extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
@@ -277,7 +280,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     FPTYPE var[6];
     FPTYPE xx = em_x[block_idx * nnei + ii];
     FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii];
-    if (xx == ago) {
+    if (xx == ago && is_sorted) {
       unloop = true;
       breakpoint = ii;
     }
@@ -626,7 +629,8 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
                                    const FPTYPE* two_embed,
                                    const int nloc,
                                    const int nnei,
-                                   const int last_layer_size) {
+                                   const int last_layer_size,
+                                   const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -635,7 +639,7 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
           tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>),
       nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out,
       table, em_x, em, two_embed, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei, last_layer_size);
+      table_info[3], table_info[4], nnei, last_layer_size, is_sorted);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
@@ -651,7 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
                                         const FPTYPE* dy,
                                         const int nloc,
                                         const int nnei,
-                                        const int last_layer_size) {
+                                        const int last_layer_size,
+                                        const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -663,7 +668,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
           tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
       nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x,
       dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
+      is_sorted);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
@@ -678,7 +684,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
                                              const FPTYPE* dz_dy_dem,
                                              const int nloc,
                                              const int nnei,
-                                             const int last_layer_size) {
+                                             const int last_layer_size,
+                                             const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -689,7 +696,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
                                                                 KK>),
       nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy,
       table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
+      is_sorted);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
@@ -850,7 +858,8 @@ template void tabulate_fusion_se_a_gpu_rocm<float>(float* out,
                                                    const float* two_embed,
                                                    const int nloc,
                                                    const int nnei,
-                                                   const int last_layer_size);
+                                                   const int last_layer_size,
+                                                   const bool is_sorted);
 template void tabulate_fusion_se_a_gpu_rocm<double>(double* out,
                                                     const double* table,
                                                     const double* table_info,
@@ -859,7 +868,8 @@ template void tabulate_fusion_se_a_gpu_rocm<double>(double* out,
                                                     const double* two_embed,
                                                     const int nloc,
                                                     const int nnei,
-                                                    const int last_layer_size);
+                                                    const int last_layer_size,
+                                                    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_gpu_rocm<float>(
     float* dy_dem_x,
     float* dy_dem,
@@ -871,7 +881,8 @@ template void tabulate_fusion_se_a_grad_gpu_rocm<float>(
     const float* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_gpu_rocm<double>(
     double* dy_dem_x,
     double* dy_dem,
@@ -883,7 +894,8 @@ template void tabulate_fusion_se_a_grad_gpu_rocm<double>(
     const double* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_grad_gpu_rocm<float>(
     float* dz_dy,
     const float* table,
@@ -894,7 +906,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm<float>(
     const float* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void tabulate_fusion_se_a_grad_grad_gpu_rocm<double>(
     double* dz_dy,
     const double* table,
@@ -905,7 +918,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm<double>(
     const double* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 
 template void tabulate_fusion_se_t_gpu_rocm<float>(float* out,
                                                    const float* table,
diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc
index cd23991cf8..377e6d06db 100644
--- a/source/lib/src/tabulate.cc
+++ b/source/lib/src/tabulate.cc
@@ -86,7 +86,8 @@ void deepmd::tabulate_fusion_se_a_cpu(FPTYPE* out,
                                       const FPTYPE* two_embed,
                                       const int nloc,
                                       const int nnei,
-                                      const int last_layer_size) {
+                                      const int last_layer_size,
+                                      const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   memset(out, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size);
   const FPTYPE lower = table_info[0];
@@ -107,7 +108,7 @@ void deepmd::tabulate_fusion_se_a_cpu(FPTYPE* out,
       ll[2] = em[ii * nnei * 4 + jj * 4 + 2];
       ll[3] = em[ii * nnei * 4 + jj * 4 + 3];
       FPTYPE xx = em_x[ii * nnei + jj];
-      if (ago == xx) {
+      if (ago == xx && is_sorted) {
         unloop = true;
       }
       int table_idx = 0;
@@ -165,7 +166,8 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x,
                                            const FPTYPE* dy,
                                            const int nloc,
                                            const int nnei,
-                                           const int last_layer_size) {
+                                           const int last_layer_size,
+                                           const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
   memset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei);
   memset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4);
@@ -189,7 +191,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x,
       ll[2] = em[ii * nnei * 4 + jj * 4 + 2];
       ll[3] = em[ii * nnei * 4 + jj * 4 + 3];
       FPTYPE xx = em_x[ii * nnei + jj];
-      if (ago == xx) {
+      if (ago == xx && is_sorted) {
         unloop = true;
       }
       int table_idx = 0;
@@ -249,7 +251,8 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy,
                                                 const FPTYPE* dz_dy_dem,
                                                 const int nloc,
                                                 const int nnei,
-                                                const int last_layer_size) {
+                                                const int last_layer_size,
+                                                const bool is_sorted) {
   memset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size);
   const FPTYPE lower = table_info[0];
   const FPTYPE upper = table_info[1];
@@ -275,7 +278,7 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy,
       hh[3] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 3];
       FPTYPE xx = em_x[ii * nnei + jj];
       FPTYPE dz_xx = dz_dy_dem_x[ii * nnei + jj];
-      if (ago == xx) {
+      if (ago == xx && is_sorted) {
         unloop = true;
       }
       int table_idx = 0;
@@ -604,16 +607,16 @@ void deepmd::tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy,
   }
 }
 
-template void deepmd::tabulate_fusion_se_a_cpu<float>(
-    float* out,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* two_embed,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
+template void deepmd::tabulate_fusion_se_a_cpu<float>(float* out,
+                                                      const float* table,
+                                                      const float* table_info,
+                                                      const float* em_x,
+                                                      const float* em,
+                                                      const float* two_embed,
+                                                      const int nloc,
+                                                      const int nnei,
+                                                      const int last_layer_size,
+                                                      const bool is_sorted);
 template void deepmd::tabulate_fusion_se_a_cpu<double>(
     double* out,
     const double* table,
@@ -623,7 +626,8 @@ template void deepmd::tabulate_fusion_se_a_cpu<double>(
     const double* two_embed,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void deepmd::tabulate_fusion_se_a_grad_cpu<float>(
     float* dy_dem_x,
     float* dy_dem,
@@ -635,7 +639,8 @@ template void deepmd::tabulate_fusion_se_a_grad_cpu<float>(
     const float* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void deepmd::tabulate_fusion_se_a_grad_cpu<double>(
     double* dy_dem_x,
     double* dy_dem,
@@ -647,7 +652,8 @@ template void deepmd::tabulate_fusion_se_a_grad_cpu<double>(
     const double* dy,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void deepmd::tabulate_fusion_se_a_grad_grad_cpu<float>(
     float* dz_dy,
     const float* table,
@@ -658,7 +664,8 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu<float>(
     const float* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 template void deepmd::tabulate_fusion_se_a_grad_grad_cpu<double>(
     double* dz_dy,
     const double* table,
@@ -669,7 +676,8 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu<double>(
     const double* dz_dy_dem,
     const int nloc,
     const int nnei,
-    const int last_layer_size);
+    const int last_layer_size,
+    const bool is_sorted);
 
 template void deepmd::tabulate_fusion_se_t_cpu<float>(
     float* out,
diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py
index ac4d9df731..e91aa5fd2f 100644
--- a/source/op/_tabulate_grad.py
+++ b/source/op/_tabulate_grad.py
@@ -31,7 +31,14 @@ def _tabulate_fusion_se_a_grad_cc(op, dy):
 @ops.RegisterGradient("TabulateFusionSeAGrad")
 def _tabulate_fusion_se_a_grad_grad_cc(op, dy, dy_):
     dz_dy = op_module.tabulate_fusion_se_a_grad_grad(
-        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5]
+        op.inputs[0],
+        op.inputs[1],
+        op.inputs[2],
+        op.inputs[3],
+        dy,
+        dy_,
+        op.inputs[5],
+        is_sorted=True,
     )
     return [None, None, None, None, dz_dy, None]
 
@@ -46,6 +53,7 @@ def _tabulate_fusion_se_atten_grad_cc(op, dy):
         op.inputs[4],
         dy,
         op.outputs[0],
+        is_sorted=op.get_attr("is_sorted"),
     )
     return [None, None, dy_dx, dy_df, dy_dtwo]
 
@@ -53,7 +61,14 @@ def _tabulate_fusion_se_atten_grad_cc(op, dy):
 @ops.RegisterGradient("TabulateFusionSeAttenGrad")
 def _tabulate_fusion_se_atten_grad_grad_cc(op, dy, dy_, dy_dtwo):
     dz_dy = op_module.tabulate_fusion_se_a_grad_grad(
-        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[6]
+        op.inputs[0],
+        op.inputs[1],
+        op.inputs[2],
+        op.inputs[3],
+        dy,
+        dy_,
+        op.inputs[6],
+        is_sorted=op.get_attr("is_sorted"),
     )
     return [None, None, None, None, None, dz_dy, None]
 
diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
index a3ec2dbd82..0ac8745f64 100644
--- a/source/op/tabulate_multi_device.cc
+++ b/source/op/tabulate_multi_device.cc
@@ -63,7 +63,8 @@ REGISTER_OP("TabulateFusionSeAGradGrad")
     .Input("dz_dy_dem_x: T")
     .Input("dz_dy_dem: T")
     .Input("descriptor: T")
-    .Output("dz_dy: T");
+    .Output("dz_dy: T")
+    .Attr("is_sorted: bool = true");
 
 REGISTER_OP("TabulateFusionSeAtten")
     .Attr("T: {float, double} = DT_DOUBLE")
@@ -73,6 +74,7 @@ REGISTER_OP("TabulateFusionSeAtten")
     .Input("em: T")
     .Input("two_embed: T")
     .Attr("last_layer_size: int")
+    .Attr("is_sorted: bool = true")
     .Output("descriptor: T");
 
 REGISTER_OP("TabulateFusionSeAttenGrad")
@@ -86,7 +88,8 @@ REGISTER_OP("TabulateFusionSeAttenGrad")
     .Input("descriptor: T")
     .Output("dy_dem_x: T")
     .Output("dy_dem: T")
-    .Output("dy_dtwo: T");
+    .Output("dy_dtwo: T")
+    .Attr("is_sorted: bool = true");
 
 REGISTER_OP("TabulateFusionSeT")
     .Attr("T: {float, double} = DT_DOUBLE")
@@ -291,7 +294,9 @@ template <typename Device, typename FPTYPE>
 class TabulateFusionSeAGradGradOp : public OpKernel {
  public:
   explicit TabulateFusionSeAGradGradOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted));
+  }
   void Compute(OpKernelContext* context) override {
     // Grab the input tensor
     int context_input_index = 0;
@@ -330,25 +335,26 @@ class TabulateFusionSeAGradGradOp : public OpKernel {
 #if GOOGLE_CUDA
       deepmd::tabulate_fusion_se_a_grad_grad_gpu_cuda(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
-          nnei, last_layer_size);
+          nnei, last_layer_size, is_sorted);
 #endif  // GOOGLE_CUDA
 #if TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_grad_grad_gpu_rocm(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
-          nnei, last_layer_size);
+          nnei, last_layer_size, is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
       OP_REQUIRES(context, (last_layer_size <= 1024),
                   errors::InvalidArgument(
                       "In the process of model compression, the size of the "
                       "last layer of embedding net must be less than 1024!"));
     } else if (device == "CPU") {
-      deepmd::tabulate_fusion_se_a_grad_grad_cpu(dz_dy, table, table_info, em_x,
-                                                 em, dz_dy_dem_x, dz_dy_dem,
-                                                 nloc, nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_a_grad_grad_cpu(
+          dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
+          nnei, last_layer_size, is_sorted);
     }
   }
 
  private:
+  bool is_sorted;
   std::string device;
 };
 
@@ -359,6 +365,7 @@ class TabulateFusionSeAttenOp : public OpKernel {
       : OpKernel(context) {
     OP_REQUIRES_OK(context,
                    context->GetAttr("last_layer_size", &last_layer_size));
+    OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted));
   }
   void Compute(OpKernelContext* context) override {
     deepmd::safe_compute(
@@ -406,22 +413,24 @@ class TabulateFusionSeAttenOp : public OpKernel {
 #if GOOGLE_CUDA
       deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x,
                                             em, two_embed, nloc, nnei,
-                                            last_layer_size);
+                                            last_layer_size, is_sorted);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x,
                                             em, two_embed, nloc, nnei,
-                                            last_layer_size);
+                                            last_layer_size, is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
-                                       two_embed, nloc, nnei, last_layer_size);
+                                       two_embed, nloc, nnei, last_layer_size,
+                                       is_sorted);
     }
   }
 
  private:
   int last_layer_size;
+  bool is_sorted;
   std::string device;
 };
 
@@ -429,7 +438,9 @@ template <typename Device, typename FPTYPE>
 class TabulateFusionSeAttenGradOp : public OpKernel {
  public:
   explicit TabulateFusionSeAttenGradOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted));
+  }
   void Compute(OpKernelContext* context) override {
     deepmd::safe_compute(
         context, [this](OpKernelContext* context) { this->_Compute(context); });
@@ -483,22 +494,23 @@ class TabulateFusionSeAttenGradOp : public OpKernel {
 #if GOOGLE_CUDA
       deepmd::tabulate_fusion_se_a_grad_gpu_cuda(
           dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size);
+          nnei, last_layer_size, is_sorted);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_grad_gpu_rocm(
           dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size);
+          nnei, last_layer_size, is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, two_embed, dy, nloc, nnei,
-                                            last_layer_size);
+                                            last_layer_size, is_sorted);
     }
   }
 
  private:
+  bool is_sorted;
   std::string device;
 };
 
diff --git a/source/tests/test_model_compression_se_atten.py b/source/tests/test_model_compression_se_atten.py
index 8ec5a6c75f..f047447975 100644
--- a/source/tests/test_model_compression_se_atten.py
+++ b/source/tests/test_model_compression_se_atten.py
@@ -94,7 +94,65 @@ def _init_models():
     return inputs, frozen_models, compressed_models
 
 
+@unittest.skipIf(
+    parse_version(tf.__version__) < parse_version("2"),
+    f"The current tf version {tf.__version__} is too low to run the new testing model.",
+)
+def _init_models_exclude_types():
+    data_file = str(tests_path / os.path.join("model_compression", "data"))
+    inputs, frozen_models, compressed_models = [], [], []
+    # 4 tests, listed in the same order as the `tests` entries below:
+    # - type embedding FP64, se_atten FP64
+    # - type embedding FP32, se_atten FP64
+    # - type embedding FP64, se_atten FP32
+    # - type embedding FP32, se_atten FP32
+    tests = [
+        {"se_atten precision": "float64", "type embedding precision": "float64"},
+        {"se_atten precision": "float64", "type embedding precision": "float32"},
+        {"se_atten precision": "float32", "type embedding precision": "float64"},
+        {"se_atten precision": "float32", "type embedding precision": "float32"},
+    ]
+    for i in range(4):
+        INPUT = str(tests_path / f"input{i}.json")
+        frozen_model = str(tests_path / f"dp-original-se-atten{i}-exclude-types.pb")
+        compressed_model = str(
+            tests_path / f"dp-compressed-se-atten{i}-exclude-types.pb"
+        )
+        jdata = j_loader(
+            str(tests_path / os.path.join("model_compression", "input.json"))
+        )
+        jdata["model"]["descriptor"] = {}
+        jdata["model"]["descriptor"]["type"] = "se_atten"
+        jdata["model"]["descriptor"]["exclude_types"] = [[0, 1]]
+        jdata["model"]["descriptor"]["precision"] = tests[i]["se_atten precision"]
+        jdata["model"]["descriptor"]["stripped_type_embedding"] = True
+        jdata["model"]["descriptor"]["sel"] = 120
+        jdata["model"]["descriptor"]["attn_layer"] = 0
+        jdata["model"]["type_embedding"] = {}
+        jdata["model"]["type_embedding"]["precision"] = tests[i][
+            "type embedding precision"
+        ]
+        jdata["training"]["training_data"]["systems"] = data_file
+        jdata["training"]["validation_data"]["systems"] = data_file
+        with open(INPUT, "w") as fp:
+            json.dump(jdata, fp, indent=4)
+
+        ret = run_dp("dp train " + INPUT)
+        np.testing.assert_equal(ret, 0, "DP train failed!")
+        ret = run_dp("dp freeze -o " + frozen_model)
+        np.testing.assert_equal(ret, 0, "DP freeze failed!")
+        ret = run_dp("dp compress " + " -i " + frozen_model + " -o " + compressed_model)
+        np.testing.assert_equal(ret, 0, "DP model compression failed!")
+
+        inputs.append(INPUT)
+        frozen_models.append(frozen_model)
+        compressed_models.append(compressed_model)
+
+    return inputs, frozen_models, compressed_models
+
+
 INPUTS, FROZEN_MODELS, COMPRESSED_MODELS = _init_models()
+INPUTS_ET, FROZEN_MODELS_ET, COMPRESSED_MODELS_ET = _init_models_exclude_types()
 
 
 def _get_default_places(nth_test):
@@ -505,8 +563,8 @@ def test_ase(self):
 class TestDeepPotAPBCExcludeTypes(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        self.dp_originals = [DeepPot(FROZEN_MODELS[i]) for i in range(4)]
-        self.dp_compresseds = [DeepPot(COMPRESSED_MODELS[i]) for i in range(4)]
+        self.dp_originals = [DeepPot(FROZEN_MODELS_ET[i]) for i in range(4)]
+        self.dp_compresseds = [DeepPot(COMPRESSED_MODELS_ET[i]) for i in range(4)]
         self.coords = np.array(
             [
                 12.83,
@@ -535,9 +593,9 @@ def setUpClass(self):
     @classmethod
     def tearDownClass(self):
         for i in range(4):
-            _file_delete(INPUTS[i])
-            _file_delete(FROZEN_MODELS[i])
-            _file_delete(COMPRESSED_MODELS[i])
+            _file_delete(INPUTS_ET[i])
+            _file_delete(FROZEN_MODELS_ET[i])
+            _file_delete(COMPRESSED_MODELS_ET[i])
         _file_delete("out.json")
         _file_delete("compress.json")
         _file_delete("checkpoint")