Skip to content

Commit 30c0643

Browse files
[microNPU][ETHOSU] Add Vela's logic to select configuration block
When the cascader is enabled, the logic for choosing the optimal configuration block from TVM will be used; in all other cases, Vela's logic will be used, except when the dev_force_block_config option is specified.
1 parent 9710d81 commit 30c0643

4 files changed

Lines changed: 148 additions & 9 deletions

File tree

python/tvm/relay/backend/contrib/ethosu/vela_api.py

Lines changed: 87 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ def get_optimal_block_config(
5656
Therefore, we need to pick an optimal block configuration considering bandwidth
5757
to bring IFM blocks and the number of OFM block computes need to happen
5858
to cover the OFM as indicated by the npu op.
59+
For the case when cascader is enabled, the logic of choosing the optimal configuration block
60+
from TVM will be used in other cases, the Vela's logic will be used except
61+
the cases when dev_force_block_config option is specified.
5962
6063
Parameters
6164
----------
@@ -73,8 +76,90 @@ def get_optimal_block_config(
7376
if options and options.dev_force_block_config:
7477
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
7578
return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
76-
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
77-
return _get_optimal_block_config(all_valid_block_configs)
79+
elif options and options.enable_cascader:
80+
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
81+
return _get_optimal_block_config(all_valid_block_configs)
82+
else:
83+
return _find_block_config_with_vela(npu_op, accel_config)
84+
85+
86+
def _find_block_config_with_vela(
87+
npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
88+
) -> vapi.NpuShape3D:
89+
"""An internal function to get block config using Vela's logic.
90+
91+
Parameters
92+
----------
93+
npu_op : ethosu.vela.api.NpuOperation
94+
The NPU operation
95+
accelerator : ethosu.vela.api.NpuAccelerator
96+
The NPU accelerator
97+
98+
Returns
99+
-------
100+
ethosu.vela.api.NpuShape3D :
101+
The optimal block config for the operator
102+
"""
103+
from ethosu.vela.architecture_features import Accelerator
104+
from ethosu.vela.architecture_features import create_default_arch
105+
from ethosu.vela.register_command_stream_generator import resampling_mode_map
106+
from ethosu.vela.register_command_stream_util import to_kernel
107+
from ethosu.vela.operation import NpuBlockType
108+
from ethosu.vela.architecture_allocator import find_block_config
109+
from ethosu.vela.shape4d import Shape4D
110+
111+
if isinstance(npu_op, vapi.NpuConv2DOperation):
112+
block_type = NpuBlockType.ConvolutionMxN
113+
elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
114+
block_type = NpuBlockType.ConvolutionDepthWise
115+
elif isinstance(npu_op, vapi.NpuPoolingOperation):
116+
block_type = (
117+
NpuBlockType.ReduceSum
118+
if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
119+
else NpuBlockType.Pooling
120+
)
121+
elif isinstance(npu_op, vapi.NpuElementWiseOperation):
122+
block_type = NpuBlockType.ElementWise
123+
else:
124+
assert 0, "Unsupported operation"
125+
126+
ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width, npu_op.ifm.shape.depth)
127+
ifm2_shape = None
128+
if npu_op.ifm2:
129+
ifm2_shape = Shape4D(
130+
1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width, npu_op.ifm2.shape.depth
131+
)
132+
ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth)
133+
134+
ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
135+
ifm_bits = npu_op.ifm.data_type.size_in_bits()
136+
lut_banks = 0
137+
if npu_op.activation:
138+
lut_banks = 2 if npu_op.activation.op_type == vapi.NpuActivationOp.TABLE_LOOKUP else 0
139+
140+
has_scaling = True
141+
for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
142+
if tensor and tensor.quantization is None:
143+
has_scaling = False
144+
break
145+
146+
arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))
147+
148+
cfg = find_block_config(
149+
arch,
150+
block_type,
151+
ofm_shape,
152+
ifm_shape,
153+
ifm2_shape,
154+
npu_op.ifm2_scalar is not None,
155+
ifm_bits,
156+
to_kernel(npu_op.kernel),
157+
lut_banks,
158+
has_scaling,
159+
ifm_resampling_mode,
160+
)
161+
assert cfg is not None, f"There is no configuration suitable for {accelerator}"
162+
return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width, cfg.ofm_block.depth)
78163

79164

80165
def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> vapi.NpuShape3D:

tests/python/contrib/test_ethosu/test_networks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
@pytest.mark.parametrize(
4545
"accel_type, model_url, workspace_size",
4646
[
47-
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
47+
("ethos-u65-256", MOBILENET_V1_URL, 2338864),
4848
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
49-
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
49+
("ethos-u55-256", MOBILENET_V1_URL, 1793392),
5050
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
5151
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
5252
("ethos-u55-64", MOBILENET_V2_URL, 2217152),

tests/python/contrib/test_ethosu/test_replace_conv2d.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,11 +633,15 @@ def _get_func(
633633

634634
reference_mod = trial[0]
635635
params = trial[1:]
636-
func = _get_func(*params[:-1])
637-
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
638-
script = mod.script()
639-
mod = tvm.script.from_source(script)
640-
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
636+
config = {
637+
"enable_cascader": True,
638+
}
639+
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-u.options": config}):
640+
func = _get_func(*params[:-1])
641+
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
642+
script = mod.script()
643+
mod = tvm.script.from_source(script)
644+
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
641645

642646

643647
# fmt: off

tests/python/contrib/test_ethosu/test_vela_api.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,28 @@ def main(
222222
__tvm_meta__ = None
223223

224224

225+
# fmt: off
226+
@tvm.script.ir_module
227+
class Module3:
228+
@T.prim_func
229+
def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write: T.Buffer((1, 299, 299, 3), "int8")):
230+
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
231+
p2_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
232+
ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
233+
p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
234+
with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1056):
235+
p1_encoded = T.Buffer((128,), "uint8")
236+
T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128, p2_global_1[0])
237+
nn = T.int32()
238+
T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", T.int64(179570))
239+
ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
240+
ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
241+
T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0, 299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128, "NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0, 0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2, p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1), T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)
242+
243+
__tvm_meta__ = None
244+
# fmt: on
245+
246+
225247
def test_get_optimal_block_config():
226248
block_configs_cases = [
227249
{
@@ -559,5 +581,33 @@ def verify(test_vec, mock_enc_w):
559581
verify(_test_vec, _mock_enc_w)
560582

561583

584+
def test_find_block_config_with_vela():
585+
block_configs_cases = [
586+
{
587+
"accel_type": vapi.NpuAccelerator.Ethos_U55_256,
588+
"ref": vapi.NpuShape3D(30, 12, 8),
589+
},
590+
{
591+
"accel_type": vapi.NpuAccelerator.Ethos_U55_128,
592+
"ref": vapi.NpuShape3D(17, 10, 8),
593+
},
594+
{
595+
"accel_type": vapi.NpuAccelerator.Ethos_U55_64,
596+
"ref": vapi.NpuShape3D(25, 5, 8),
597+
},
598+
{
599+
"accel_type": vapi.NpuAccelerator.Ethos_U55_32,
600+
"ref": vapi.NpuShape3D(25, 5, 4),
601+
},
602+
]
603+
604+
mod = Module3
605+
ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
606+
npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
607+
608+
for case in block_configs_cases:
609+
assert vela_api._find_block_config_with_vela(npu_op, case["accel_type"]) == case["ref"]
610+
611+
562612
if __name__ == "__main__":
563613
tvm.testing.main()

0 commit comments

Comments (0)