diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 7831e5c8d72d..8b5dd63fc859 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -26,7 +26,7 @@ from tvm import relay from tvm.meta_schedule.testing import relay_workload from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base -from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN +from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN @@ -126,7 +126,7 @@ def _get_sch_rules_for_dp4a(intrin): ] -SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_INTRIN) +SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_S8S8S32_INTRIN) SCH_RULES_FOR_SDOT4 = _get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) POSTPROCS_FOR_VNNI = [ diff --git a/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py index 1f682d8018bc..39675d0531e1 100644 --- a/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py +++ b/tests/python/meta_schedule/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -25,7 +25,7 @@ ) from tvm.script import tir as T from tvm.target import Target -from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN +from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN @@ -281,7 +281,7 @@ def _dense(m, n, k, in_dtype, out_dtype): types=None, sch_rules=[ ms.schedule_rule.MultiLevelTilingWithIntrin( - DP4A_INTRIN, + DP4A_S8S8S32_INTRIN, structure="SSSRRSRS", tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"], max_innermost_factor=64, @@ -343,7 +343,7 @@ def dp4a_dense_0( W_shared[v_j, v_k_o * 4 : v_k_o * 4 + 4], ) T.writes(compute_local[v_i, v_j]) - T.block_attr({"meta_schedule.auto_tensorize": "dp4a"}) + T.block_attr({"meta_schedule.auto_tensorize": "dp4a_s8s8s32"}) with T.init(): with T.block("compute_init"): T.reads()