Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tests/python/integration/test_auto_tensorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from tvm import relay
from tvm.meta_schedule.testing import relay_workload
from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base
from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN
from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN
from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN
from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
Expand Down Expand Up @@ -126,7 +126,7 @@ def _get_sch_rules_for_dp4a(intrin):
]


SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_INTRIN)
SCH_RULES_FOR_DP4A = _get_sch_rules_for_dp4a(DP4A_S8S8S32_INTRIN)
SCH_RULES_FOR_SDOT4 = _get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN)

POSTPROCS_FOR_VNNI = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
)
from tvm.script import tir as T
from tvm.target import Target
from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN
from tvm.tir.tensor_intrin.arm_cpu import DP4A_S8S8S32_INTRIN
from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN
from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN

Expand Down Expand Up @@ -281,7 +281,7 @@ def _dense(m, n, k, in_dtype, out_dtype):
types=None,
sch_rules=[
ms.schedule_rule.MultiLevelTilingWithIntrin(
DP4A_INTRIN,
DP4A_S8S8S32_INTRIN,
structure="SSSRRSRS",
tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
max_innermost_factor=64,
Expand Down Expand Up @@ -343,7 +343,7 @@ def dp4a_dense_0(
W_shared[v_j, v_k_o * 4 : v_k_o * 4 + 4],
)
T.writes(compute_local[v_i, v_j])
T.block_attr({"meta_schedule.auto_tensorize": "dp4a"})
T.block_attr({"meta_schedule.auto_tensorize": "dp4a_s8s8s32"})
with T.init():
with T.block("compute_init"):
T.reads()
Expand Down