apache · tqchen · Jan 17, 2024 · Jan 17, 2024
diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py
@@ -26,7 +26,7 @@
 from tvm import relay
 from tvm.meta_schedule.testing import relay_workload
 from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
-from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
+from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN
 from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
 from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
@@ -126,7 +126,7 @@ def _get_sch_rules_for_dp4a(intrin):
     ]
 
 
-SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_INTRIN)
+SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_S8S8S32_INTRIN)
 SCH_RULES_FOR_SDOT4 = _get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN)
 
 POSTPROCS_FOR_VNNI = [

diff --git a/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py
@@ -25,7 +25,7 @@
 )
 from tvm.script import tir as T
 from tvm.target import Target
-from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
+from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN
 from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
 from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
 
@@ -281,7 +281,7 @@ def _dense(m, n, k, in_dtype, out_dtype):
         types=None,
         sch_rules=[
             ms.schedule_rule.MultiLevelTilingWithIntrin(
-                DP4A_INTRIN,
+                DP4A_S8S8S32_INTRIN,
                 structure="SSSRRSRS",
                 tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
                 max_innermost_factor=64,
@@ -343,7 +343,7 @@ def dp4a_dense_0(
                                     W_shared[v_j, v_k_o * 4 : v_k_o * 4 + 4],
                                 )
                                 T.writes(compute_local[v_i, v_j])
-                                T.block_attr({"meta_schedule.auto_tensorize": "dp4a"})
+                                T.block_attr({"meta_schedule.auto_tensorize": "dp4a_s8s8s32"})
                                 with T.init():
                                     with T.block("compute_init"):
                                         T.reads()