Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
180 commits
Select commit Hold shift + click to select a range
51c4a5a
Vulkan tests use executorch_core
kirklandsign May 8, 2025
4863988
Handle avg_pool2d with padding == 0 as no padding
3l1 May 8, 2025
bf5b99a
Update buck2 to 2025-05-06 (#10742)
swolchok May 8, 2025
bb7e50f
Tests use executorch_core
kirklandsign May 8, 2025
b1d00e2
[ET-VK] Introduce generic export pass for fusing Q/DQ nodes (#10771)
pytorchbot May 8, 2025
5e8295e
[ET-VK] Implement linear_qcs4w (#10772)
pytorchbot May 8, 2025
d9c6f80
Arm backend: Add model name to -llama_inputs (#10775)
mansnils May 8, 2025
3c21e3a
Arm Backend: Update unit tests for TOSA 1.0 (#10776)
SaoirseARM May 8, 2025
6346bfd
Automatically announce declared options (#10766)
jathu May 8, 2025
c352813
Arm backend: Remove redundant validation check for op_where (#10773)
Sebastian-Larsson May 8, 2025
d24eda4
Arm backend: Replace asserts with exceptions in permutation code (#10…
Sebastian-Larsson May 8, 2025
a37b369
Minor vector sizing change. (#10753)
trviv May 8, 2025
380c4f1
Allow options to be set by presets (#10767)
jathu May 8, 2025
d25ce54
Convert the unit test from java to kotlin (#10702)
phaiting May 8, 2025
ac26555
Create a macos-arm64 preset (#10768)
jathu May 8, 2025
5ad676d
Extract trace from prepare_and_convert and remove export_program
mcremon-meta May 8, 2025
277c39d
Make constant_folding's _DEFAULT_SKIP_TARGETS public
ThomasJannaud May 8, 2025
b1b46ee
: constant fold None
ThomasJannaud May 9, 2025
6e3cb79
to make TIE quantized conv operator to fall back to hifi quantized co…
wl1026sun May 9, 2025
01a5d81
Arm Backend: Use tosa_ref_model only if it is available
digantdesai May 9, 2025
7e1f3e3
Use std::align_alloc in file_data_loader
lucylq May 9, 2025
6759d35
fix transpose / permutations fusion pass
ThomasJannaud May 9, 2025
54a14d9
Arm backend: Suppress colors in pre-push if non-interactive (#10783)
perheld May 9, 2025
f7c906f
Cortex-M: Use q/dq ops in Arm Ethos Runner (#10782)
digantdesai May 9, 2025
b98c3ab
Save some size in pattern/{bitwise,comparison}_op.h (#10489)
swolchok May 9, 2025
80752f4
Reapply #9842: Save some size in dtype_util when dtype selective buil…
swolchok May 9, 2025
f688329
Reapply #9841: Migrate elementwise_util callers to the variants with …
swolchok May 9, 2025
b866837
Use torchtune 0.6.1 (#10792)
kirklandsign May 9, 2025
6e959be
bugfix
digantdesai May 9, 2025
bf50527
fix bug with sequential backends
mcr229 May 9, 2025
1c2b7ba
Remove FLATC_EXECUTABLE and the ability to bring your own flatc (#10781)
jathu May 9, 2025
b173722
Introduce assertj test lib to make the throw exception test more accu…
phaiting May 10, 2025
fbb3ad1
Arm backend: Fix ensures check in UnsqueezeScalarPlaceholdersPass (#1…
YufengShi-dudu May 12, 2025
4909db1
Arm backend: Update rescale to handle more dtypes
oscarandersson8218 May 2, 2025
fa2e1f2
Arm backend: Rename const-tensors for TOSA 1.0
oscarandersson8218 May 6, 2025
0a30c42
NXP Backend: Add eIQ Neutron Backend (#10196)
robert-kalmar May 12, 2025
b11807c
[llava] Remove torch.jit.save in llava example
larryliu0820 May 12, 2025
e113c00
Move EXECUTORCH_PAL_DEFAULT to default preset (#10798)
jathu May 12, 2025
adde519
Make a separate target for kernel utils (#10788)
kirklandsign May 12, 2025
4e5ffa3
[ET-VK] Return fence after waiting is done. (#10808)
pytorchbot May 12, 2025
500842a
Update backends-coreml.md (#10816)
metascroy May 12, 2025
a868166
Xnnpack test for program-data separation (#10817)
pytorchbot May 12, 2025
42b55f4
Move EXECUTORCH_LOG_LEVEL to default preset (#10799)
jathu May 12, 2025
d966a47
Forward-fixing G3 lt kernel
mcremon-meta May 12, 2025
e3a6825
Move EXECUTORCH_ENABLE_PROGRAM_VERIFICATION to default preset (#10800)
jathu May 12, 2025
d4c9a30
[jit] Remove TorchScript from doc (#10825)
larryliu0820 May 12, 2025
2e890df
Android Qwen thinking mode prompt support (#10668)
kirklandsign May 12, 2025
27e159e
mediatek llama runner use executorch_core (#10754)
kirklandsign May 12, 2025
1da5168
Rename "topic: not user facing" (#10828)
pytorchbot May 12, 2025
df8fc61
[jit] Remove @torch.jit.export (#10824)
larryliu0820 May 12, 2025
4a738bd
Move EXECUTORCH_ENABLE_EVENT_TRACER to default preset (#10801)
jathu May 12, 2025
4006cd2
Refactor _to_edge_and_lower_llama to remove args
jackzhxng May 13, 2025
2ee2e03
Move OPTIMIZE_SIZE to default preset (#10802)
jathu May 13, 2025
756f86a
Delete EXECUTORCH_BUILD_ANDROID_JNI (#10803)
jathu May 13, 2025
d7201ab
[Executorch][llm] Add support for ring kv cache and ring attention (#…
pytorchbot May 13, 2025
6f4df1a
Refactor _get_source_transforms to remove args
jackzhxng May 13, 2025
473c77b
Move simple options to default preset (#10804)
jathu May 13, 2025
aa73a55
Default to file load mode in module (#10827)
GregoryComer May 13, 2025
19d3bce
[ET-VK] Removing un used push constants for conv2d pw. (#10841)
pytorchbot May 13, 2025
65d931e
Correct model name in examples/arm/run.sh (#10815)
mansnils May 13, 2025
b30f912
Use certifi certs for buck download (#10095)
GregoryComer May 13, 2025
7993bb2
Arm backend: Update partitioner de-tagging iteration order (#10813)
oscarandersson8218 May 13, 2025
dcd25eb
Arm backend: Refactor pass tests for TOSA V1.0 (#10843)
oscarandersson8218 May 13, 2025
f8e7264
Move dependent options to default preset (#10805)
jathu May 13, 2025
0bb059f
[Executorch][llm] Add ring buffer based kv cache and mask calculation…
pytorchbot May 13, 2025
e71b3aa
Arm backend: Fix mypy linting in pre-push (#10850)
AdrianLundell May 13, 2025
e7ec913
Delete executorch_print_configuration_summary (#10806)
jathu May 13, 2025
0b231c4
[Executorch][llm] Make custom update cache op operate on indices (#10…
pytorchbot May 13, 2025
d338eea
Update ownership for the build system (#10837)
jathu May 13, 2025
3ffe697
Add floatValue to ExecuTorch value
f-meloni May 13, 2025
ef30b25
[Executorch][llm] Enable leveraging ring kv cache via module swap (#1…
pytorchbot May 13, 2025
f1ef702
Qualcomm AI Engine Direct - Flags for CI (#9536)
winskuo-quic May 13, 2025
d0360b7
BUCK forward fix on NXP backend
iseeyuan May 13, 2025
e13b086
Arm backend: Merge decompose/convert meandim pass (#10844)
AdrianLundell May 13, 2025
518324f
Build flatcc for the host (#10855)
jathu May 13, 2025
d0b4ed6
Arm backend: Decompose sum in pass (#10852)
AdrianLundell May 13, 2025
9ded0a2
Allow graceful handling of cpuinfo init failure
GregoryComer May 13, 2025
b20419d
Use the install method for flatc (#10859)
jathu May 13, 2025
4a89327
Remove EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT (#10860)
jathu May 13, 2025
abaee69
Update documents for Express SDK update (#10462)
neuropilot-captain May 14, 2025
62cf849
Update CI for HF Optimum models (#10820)
guangy10 May 14, 2025
ed80e3b
[LlamaDemo] Replace some tokens
kirklandsign May 14, 2025
001e5ef
Change lowbit example to use 4-bit as default in example (#10865)
metascroy May 14, 2025
a21022c
[Executorch][llm] Enable local global attention in export_llama scrip…
pytorchbot May 14, 2025
f785386
Arm backend: Update operator support for TOSA-1.0+INT+u55 (#10849)
per May 14, 2025
e1738cc
Arm backend: Update NEGATE with TOSA 1.0 support (#10845)
per May 14, 2025
fa5048b
Support prequant qwen3 (#10839)
metascroy May 14, 2025
101746e
Arm backend: Example external model to be used by the ahead of time a…
Juanfi8 May 14, 2025
12b5eb6
Add a pass to fuse mul.Scalar into dequant
mcremon-meta May 14, 2025
f39e694
fix Swift compiler assert (#10874)
rmaz May 14, 2025
587f2f8
[jit] Remove more reference to TorchScript (#10856)
larryliu0820 May 14, 2025
e7d39c2
Fix broken tests
jackzhxng May 14, 2025
d67fb52
Use the built-in notify crate for OSS Buck (#10884)
shoumikhin May 14, 2025
1b593ad
Recipe and Input class definitions with e2e export (#10034)
tarun292 May 15, 2025
c913634
Refactor quantize.py functions to remove args
jackzhxng May 15, 2025
5222489
Arm backend: Add validation for same dtype to operators (#10872)
Sebastian-Larsson May 15, 2025
ccabb2e
Arm backend: Refactor misc tests for TOSA V1.0 (#10851)
oscarandersson8218 May 15, 2025
b09e793
Arm backend: Increase atol, rtol to tosa_BI test in test_unary.py (#1…
fumchin May 15, 2025
02d315d
Arm backend: Check in tosa.fbs for TOSA 0.80 and 1.0 (#10870)
oscarandersson8218 May 15, 2025
b427188
Arm backend: Increase atol and rtol to tosa_BI test in test_conv_cons…
fumchin May 15, 2025
b058afb
Arm backend: Add test for DeiT Tiny for TOSA BI (#10846)
martinlsm May 15, 2025
9dece67
Arm backend: Add DecomposeLinalgVectorNorm pass + tests (#10848)
wwwind May 15, 2025
0121dae
Arm backend: Update parse_test_name script (#10902)
AdrianLundell May 15, 2025
43ab323
Arm backend: Add arange.default dummy tests (#10901)
AdrianLundell May 15, 2025
0a6f622
Arm Backend: Updated TosaPipelineBI default qtol (#10907)
SaoirseARM May 15, 2025
879235b
Introduce `platform-config` in CompileSpec for MediaTek backend (#10464)
neuropilot-captain May 15, 2025
56eb18b
Pipe in local_global attention (#10883)
jackzhxng May 15, 2025
47164cc
Update lint_urls.sh (#10919)
shoumikhin May 15, 2025
41063f7
Update llama runner README.md (#10869)
jackzhxng May 15, 2025
9f6c0f2
Update Qwen3 README.md (#10882)
jackzhxng May 15, 2025
a63a648
Default CMAKE_SYSTEM_PROCESSOR to the host (#10912)
jathu May 15, 2025
24789c8
Added debug logs for loading/executing model methods (#10915)
leafs1 May 15, 2025
78fe7ee
Define PYTHON_EXECUTABLE only once in cmake (#10911)
jathu May 15, 2025
7175ca4
Fix CatFromSliceCopyPass indexing issue.
abeakkas May 16, 2025
71767c3
Update javadoc in Module.java
kirklandsign May 16, 2025
bc0fdf3
Add getitem support in graph builder.
hsharma35 May 16, 2025
1244672
Don't build executor runner with Apple frameworks (#10933)
shoumikhin May 16, 2025
e09f33c
Android check pte exists
kirklandsign May 16, 2025
d069d65
Add pass to convert kwargs to args + populate optional args.
hsharma35 May 16, 2025
4b67dc9
Arm backend: Do not delegate casting to FP dtypes with BI profile (#1…
YufengShi-dudu May 16, 2025
12af535
Arm backend: Fix TOSA 1.0 node visitor for sum (#10908)
per May 16, 2025
6f015f6
Arm backend: Improve broadcasting (#10940)
oscarandersson8218 May 16, 2025
2ec8678
Arm backend: Refactor Quantizer test to allow for TOSA 1.0 (#10905)
SaoirseARM May 16, 2025
54e7c75
Arm backend: Add validation steps to op_neg (#10942)
Sebastian-Larsson May 16, 2025
94d1381
Arm backend: Reenable test_fuse_const_ops_tosa_BI (#10847)
martinlsm May 16, 2025
f39a1bb
Arm backend: Allocate the scratch buffer runtime rather than in the p…
gggekov May 16, 2025
5c6d4e5
Arm backend: Refactor test_scalars to new naming standard (#10944)
AdrianLundell May 16, 2025
fd87e98
Arm backend: Convert remaining asserts in operators to raise errors (…
Sebastian-Larsson May 16, 2025
8953279
Broadcast implementation in quantized_add
suvadeep89 May 16, 2025
a0d9c7e
Arm backend: Fix sigmoid int16 and int32 flakyness (#10548)
oscarandersson8218 May 16, 2025
8d53a28
Remove op_registration_util.bzl and rely on shim (#10935)
lucylq May 16, 2025
d0464f8
Sync shim_et/xplat/executorch/kernels/optimized/op_registration_util.…
lucylq May 16, 2025
d0848ca
Forward fix #10851 for arm backend
iseeyuan May 16, 2025
502db64
Create a pybind preset (#10932)
jathu May 16, 2025
d18a52d
Fix libflatccrt race (#10918)
jathu May 16, 2025
309faf8
Update ModuleTest.swift (#10948)
shoumikhin May 16, 2025
851b373
Make test_fuse_mul_into_dequant use GraphBuilder.
eigen-k May 16, 2025
9cce48d
Fix export llava (#10947)
jackzhxng May 16, 2025
7719d31
Mostly sync BlasKernel.cpp with ATen ReducedPrecisionGemvFastPathKern…
pytorchbot May 16, 2025
3cdc4b8
Update Android demo app README.md (#10922)
jackzhxng May 16, 2025
d9fcea1
Build pybind preset in CI (#10936)
jathu May 16, 2025
f8218d1
Copy executorch codegen from pytorch torchgen to executorch repo
larryliu0820 May 16, 2025
6f59e89
Android backend used by method
kirklandsign May 17, 2025
9aaea31
Add copy API to ExecuTorchValue (#10954)
bsoyluoglu May 17, 2025
9663bfb
Hook up PreprocessAll flow to EdgeManager
mcr229 May 18, 2025
7d9dd46
Arm backend: Remove fast scratch part for now
kirklandsign May 19, 2025
6ad47df
Arm backend: Do not run model unit tests in parallel (#10953)
zingo May 19, 2025
bb50792
init
anzr299 May 19, 2025
6925c5e
small fix
anzr299 May 19, 2025
5e23cb9
minor fix
anzr299 May 19, 2025
e04a901
add data aware wc
anzr299 May 19, 2025
fb5750e
minor fix
anzr299 May 19, 2025
de72d65
Delete redundant pybind workflows (#10957)
jathu May 19, 2025
78227f0
Support unary log in xnnpack delegate (#10952)
leafs1 May 19, 2025
3032398
Make test_no_replace_quant_permute_dequant_with_requantize use GraphB…
eigen-k May 19, 2025
95e27ed
Make test_replace_quant_view_dequant_with_requantize use GraphBuilder.
eigen-k May 19, 2025
ea9eeb8
add quantization support for disable_dynamic_shapes
anzr299 May 19, 2025
a905728
ToOutVarPass skips inplace ops
JacobSzwejbka May 19, 2025
770569d
minor fix
anzr299 May 19, 2025
e365e15
Add input size validation to Module.execute (#10701)
keyprocedure May 19, 2025
4d7b64e
support function + method variants
JacobSzwejbka May 19, 2025
b2f9ef9
Remove ReplaceTCopyWithTransform
mcremon-meta May 19, 2025
9aedbeb
Arm backend: Make the CI green by not testing Dedicated_Sram for the …
gggekov May 19, 2025
cb3eba0
Fix Windows build (#10946)
SS-JIA May 19, 2025
d1c2683
Qualcomm AI Engine Direct - fix for pytorch uplevel (#10769)
haowhsu-quic May 20, 2025
b73f9d5
Lint links for modified lines only on PR (#10994)
shoumikhin May 20, 2025
7d9b15f
Add etdump to android
kirklandsign May 20, 2025
40736e2
Move optimized target definitions to op_registration.bzl (#10986)
pytorchbot May 20, 2025
7d194cf
Add a android log implementation
kirklandsign May 20, 2025
56018e1
Dtype selective build for optimized ops (#10992)
pytorchbot May 20, 2025
9916cee
partitioner update
anzr299 May 20, 2025
379129d
Arm backend: Add support for BN fusing during QAT (#10967)
oscarandersson8218 May 20, 2025
d509ee3
Arm backend: Refactor models to allow for TOSA 1.0 (#10904)
SaoirseARM May 20, 2025
08dfe52
Arm backend: Adjust AvgPool2d padding when window is not divisible by…
tom-arm May 20, 2025
da62d5f
Arm backend: Clean up matmul tests (#10971)
oscarandersson8218 May 20, 2025
78779b0
Merge branch 'export_llama_executorch' into main
anzr299 May 20, 2025
3871a5f
Merge main
anzr299 May 20, 2025
0c20955
update for latest
anzr299 May 20, 2025
d3730ea
quant and fp16 temp fix
cavusmustafa Jun 5, 2025
3fef8fd
enable import override for export_to_edge with openvino
cavusmustafa Jun 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Cortex-M: Use q/dq ops in Arm Ethos Runner (pytorch#10782)
  • Loading branch information
digantdesai authored May 9, 2025
commit f7c906f6158d546c84495ca308806e6944cb9ea5
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ option(EXECUTORCH_USE_DL "Use libdl library" ON)

option(EXECUTORCH_BUILD_CADENCE "Build the Cadence DSP backend" OFF)

option(EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" OFF)

#
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
#
Expand Down Expand Up @@ -715,6 +717,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
endif()

if(EXECUTORCH_BUILD_CORTEX_M)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
endif()

if(EXECUTORCH_BUILD_DEVTOOLS)
if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
Expand Down
1 change: 1 addition & 0 deletions backends/arm/scripts/build_executorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ cmake \
-DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-DEXECUTORCH_BUILD_CORTEX_M=ON \
-DEXECUTORCH_ENABLE_LOGGING=ON \
${build_devtools_flags} \
${build_with_etdump_flags} \
Expand Down
7 changes: 7 additions & 0 deletions backends/arm/test/test_arm_baremetal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,13 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh
echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul

# Cortex-M op tests
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qadd --bundleio
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio --no_delegate --portable_kernels="aten::sub.out,aten::add.out,aten::mul.out"
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=qops --bundleio

echo "${TEST_SUITE_NAME}: PASS"
}

Expand Down
61 changes: 61 additions & 0 deletions backends/cortex_m/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel library for Cortex-M operators. Please keep this file formatted by running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
cmake_minimum_required(VERSION 3.19)

# Emit compile_commands.json for tooling (clangd, clang-tidy, etc.).
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Default to C++17 unless the enclosing project already picked a standard.
if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

# resolve_python_executable() is provided by Utils.cmake above.
if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
)

# Generate C++ bindings to register kernels into Executorch (for runtime).
# Here select all ops in operators.yaml
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")

# Generate bindings for the kernels
generate_bindings_for_kernels(
LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)
message("Generated files ${gen_command_sources}")

# Build a library for _cortex_m_kernels_srcs
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
target_link_libraries(cortex_m_kernels PRIVATE executorch)
target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
)

# Install both the kernel library and the generated registration library so
# downstream consumers can link against them.
install(
TARGETS cortex_m_kernels cortex_m_ops_lib
DESTINATION lib
PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
)
93 changes: 70 additions & 23 deletions backends/cortex_m/ops/op_dequantize_per_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ namespace {
*/
void check_dequantize_args(
const Tensor& input,
int64_t zero_point,
int64_t quant_min,
int64_t quant_max,
ScalarType dtype,
Expand All @@ -39,6 +40,18 @@ void check_dequantize_args(
"input.scalar_type() %" PRId8 " is not char type",
static_cast<int8_t>(input.scalar_type()));

// Check zp range
ET_CHECK_MSG(
zero_point >= quant_min,
"zero_point must be %" PRId64 " <= quant_min %" PRId64,
zero_point,
quant_min);
ET_CHECK_MSG(
zero_point <= quant_max,
"zero_point must be %" PRId64 " >= quant_max %" PRId64,
zero_point,
quant_max);

// Check output dtype is float
ET_CHECK_MSG(
out.scalar_type() == ScalarType::Float,
Expand Down Expand Up @@ -73,18 +86,10 @@ void check_dequantize_args(
/**
 * Scalar implementation of dequantization for a single value.
*/
// Dequantize a single value: widen the quantized input to int32 before
// subtracting the zero point (avoids int8 overflow), then scale to float.
//
// Q: quantized input type (e.g. int8_t). F: floating-point output type.
template <typename Q, typename F>
F dequantize_val(float scale, int32_t zero_point, Q qvalue) {
  return static_cast<F>((static_cast<int32_t>(qvalue) - zero_point) * scale);
}

} // namespace

Tensor& dequantize_per_tensor_out(
Expand All @@ -106,29 +111,71 @@ Tensor& dequantize_per_tensor_out(
"Failed to resize out Tensor in dequantize_per_tensor_out");

// Validate input parameters
check_dequantize_args(input, quant_min, quant_max, dtype, out);
check_dequantize_args(input, zero_point, quant_min, quant_max, dtype, out);

// Pre-compute inverse scale for better performance
int32_t zp = static_cast<int32_t>(zero_point);
int32_t qmin = static_cast<int32_t>(quant_min);
int32_t qmax = static_cast<int32_t>(quant_max);

// Get pointers to input and output data
const int8_t* input_data = input.const_data_ptr<int8_t>();
float* out_data = out.mutable_data_ptr<float>();
const size_t numel = input.numel();

size_t i = 0;
#if defined(HAS_HELIUM_SIMD)
// Helium MVE implementation for float32 to int8 quantization
#Error "Implement MVE version!"
#else
// Scalar implementation for float32 to int8 quantization
for (size_t i = 0; i < numel; i++) {
out_data[i] =
dequantize_val<int8_t, float>(scale, zp, input_data[i], qmin, qmax);
// Helium MVE implementation for int8 to float dequantization
static uint8x16_t voffset{
0x0,
0x8,
0x4,
0xC,
0x1,
0x9,
0x5,
0xD,
0x2,
0xA,
0x6,
0xE,
0x3,
0xB,
0x7,
0xF};

int16x8_t vzp = vdupq_n_s16(static_cast<int16_t>(zp));
float32x4_t vscale = vdupq_n_f32(static_cast<float>(scale));

for (; i + 15 < numel; i += 16) {
int8x16_t in_084C195D2A6E3B7F =
vldrbq_gather_offset_s8(input_data, voffset);

int16x8_t in_04152637 = vsubq_s16(vmovlbq_s8(in_084C195D2A6E3B7F), vzp);
int16x8_t in_8C9DAEBF = vsubq_s16(vmovltq_s8(in_084C195D2A6E3B7F), vzp);

float32x4_t inf_0123 = vcvtq_f32_s32(vmovlbq_s16(in_04152637));
float32x4_t inf_4567 = vcvtq_f32_s32(vmovltq_s16(in_04152637));
float32x4_t inf_89AB = vcvtq_f32_s32(vmovlbq_s16(in_8C9DAEBF));
float32x4_t inf_CDEF = vcvtq_f32_s32(vmovltq_s16(in_8C9DAEBF));

float32x4_t out_0123 = vmulq_f32(inf_0123, vscale);
float32x4_t out_4567 = vmulq_f32(inf_4567, vscale);
float32x4_t out_89AB = vmulq_f32(inf_89AB, vscale);
float32x4_t out_CDEF = vmulq_f32(inf_CDEF, vscale);

vstrwq_f32(out_data + 0, out_0123);
vstrwq_f32(out_data + 4, out_4567);
vstrwq_f32(out_data + 8, out_89AB);
vstrwq_f32(out_data + 12, out_CDEF);

input_data += 16;
out_data += 16;
}
#endif
#endif // defined(HAS_HELIUM_SIMD)

for (; i < numel; i++) {
*out_data = dequantize_val<int8_t, float>(scale, zp, *input_data);
*input_data++;
*out_data++;
}
return out;
}

Expand Down
111 changes: 96 additions & 15 deletions backends/cortex_m/ops/op_quantize_per_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ void check_quantize_args(
"input.scalar_type() %" PRId8 " is not float type",
static_cast<int8_t>(input.scalar_type()));

// Check output dtype is int8 (Char)
// Check output dtype is int8
ET_CHECK_MSG(
out.scalar_type() == ScalarType::Char,
"out.scalar_type() %" PRId8 " is not int8 (Char)",
static_cast<int8_t>(out.scalar_type()));

// Check dtype is int8 (Char)
// Check dtype is int8
ET_CHECK_MSG(
dtype == ScalarType::Char,
"dtype %" PRId8 " is not int8 (Char)",
Expand Down Expand Up @@ -75,18 +75,18 @@ void check_quantize_args(
/**
* Scalar implementation of quantization for a single value.
*/
// Quantize a single floating-point value to a quantized integer type.
//
// Q: quantized output type (e.g. int8_t). F: floating-point input type.
// inv_scale is the precomputed reciprocal of the quantization scale.
// The result is rounded to nearest (ties to even via std::nearbyint under
// the default rounding mode), shifted by zero_point, and clamped to
// [quant_min, quant_max].
template <typename Q, typename F>
Q quantize_val(
    F inv_scale,
    int32_t zero_point,
    F value,
    int64_t quant_min,
    int64_t quant_max) {
  // Round first, then apply the zero-point offset.
  int32_t qvalue =
      zero_point + static_cast<int32_t>(std::nearbyint(inv_scale * value));
  // Clamp into the representable range of the quantized dtype.
  qvalue = std::max<int32_t>(qvalue, static_cast<int32_t>(quant_min));
  qvalue = std::min<int32_t>(qvalue, static_cast<int32_t>(quant_max));
  return static_cast<Q>(qvalue);
}

} // namespace
Expand Down Expand Up @@ -123,16 +123,97 @@ Tensor& quantize_per_tensor_out(
int8_t* out_data = out.mutable_data_ptr<int8_t>();
const size_t numel = input.numel();

size_t i = 0;

#if defined(HAS_HELIUM_SIMD)
// Helium MVE implementation for float32 to int8 quantization
#Error "Implement MVE version!"
#else
// Scalar implementation for float32 to int8 quantization
for (size_t i = 0; i < numel; i++) {
out_data[i] =
quantize_val<int8_t, float>(inv_scale, zp, input_data[i], qmin, qmax);
// Helium MVE implementation for float32 to int8 quantization
static uint8x16_t voffset{
0x0,
0x8,
0x4,
0xC,
0x1,
0x9,
0x5,
0xD,
0x2,
0xA,
0x6,
0xE,
0x3,
0xB,
0x7,
0xF};

float32x4_t inv_scale_vec = vdupq_n_f32(inv_scale);

// Magic number for float to int conversion, round to nearest even integer
// int magic_round(float f): interpret_as_int32(f + magic_float) - magic_int
// where,
// magic_float = 12582912.0f = (2 ** 23 + 2 ** 22) = (1.5 * 2 ** 23)
// magic_int = 1262485504 = 0x4B400000 = bit_pattern_as_int32(magic_float)

float magic_float = 12582912.0f;
int32_t magic_int = 1262485504;

float32x4_t vmagic_float = vdupq_n_f32(magic_float);
int32x4_t vmagic_int_less_zp =
vdupq_n_s32(magic_int - static_cast<int32_t>(zp));

int16x8_t vqmin = vdupq_n_s16(qmin);
int16x8_t vqmax = vdupq_n_s16(qmax);

// TODO: Measure performance, we are spilling
for (; i + 15 < numel; i += 16) {
float32x4_t in_0123 = vldrwq_f32(input_data + 0);
float32x4_t in_4567 = vldrwq_f32(input_data + 4);
float32x4_t in_89AB = vldrwq_f32(input_data + 8);
float32x4_t in_CDEF = vldrwq_f32(input_data + 12);

float32x4_t outf_0123 = vfmaq_f32(vmagic_float, in_0123, inv_scale_vec);
float32x4_t outf_4567 = vfmaq_f32(vmagic_float, in_4567, inv_scale_vec);
float32x4_t outf_89AB = vfmaq_f32(vmagic_float, in_89AB, inv_scale_vec);
float32x4_t outf_CDEF = vfmaq_f32(vmagic_float, in_CDEF, inv_scale_vec);

int32x4_t out_0123 =
vsubq_s32(vreinterpretq_s32_f32(outf_0123), vmagic_int_less_zp);
int32x4_t out_4567 =
vsubq_s32(vreinterpretq_s32_f32(outf_4567), vmagic_int_less_zp);
int32x4_t out_89AB =
vsubq_s32(vreinterpretq_s32_f32(outf_89AB), vmagic_int_less_zp);
int32x4_t out_CDEF =
vsubq_s32(vreinterpretq_s32_f32(outf_CDEF), vmagic_int_less_zp);

int16x8_t out_04152637;
int16x8_t out_8C9DAEBF;
out_04152637 = vmovnbq_s32(out_04152637, out_0123);
out_04152637 = vmovntq_s32(out_04152637, out_4567);
out_8C9DAEBF = vmovnbq_s32(out_8C9DAEBF, out_89AB);
out_8C9DAEBF = vmovntq_s32(out_8C9DAEBF, out_CDEF);

int16x8_t out_04152637_clamped =
vminq_s16(vmaxq_s16(out_04152637, vqmin), vqmax);
int16x8_t out_8C9DAEBF_clamped =
vminq_s16(vmaxq_s16(out_8C9DAEBF, vqmin), vqmax);

int8x16_t out_084C195D2A6E3B7F;
out_084C195D2A6E3B7F =
vmovnbq_s16(out_084C195D2A6E3B7F, out_04152637_clamped);
out_084C195D2A6E3B7F =
vmovntq_s16(out_084C195D2A6E3B7F, out_8C9DAEBF_clamped);

vstrbq_scatter_offset_s8(out_data, voffset, out_084C195D2A6E3B7F);
input_data += 16;
out_data += 16;
}
#endif // defined(HAS_HELIUM_SIMD)

for (; i < numel; i++) {
*out_data =
quantize_val<int8_t, float>(inv_scale, zp, *input_data, qmin, qmax);
input_data++;
out_data++;
}
#endif

return out;
}
Expand Down
Loading