// Source: executorch/backends/cadence/hifi/kernels/kernels.h (dijopaul/executorch @ f160dedc3f4cee50a25ef61e3a31ff8963a25f97)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <inttypes.h>
#include <stddef.h>
#include <xa_type_def.h>

/* For NNLIB APIs */
#include "xa_nnlib_kernels_api.h"

/* Potential NNLIB functions/APIs */
/*
 * NNLIB kernel declaration (C linkage); the definition is not in this header.
 *
 * Judging by its name, this performs an element-wise float32 addition of
 * p_inp1 and p_inp2 with broadcasting over 4-D shapes -- confirm against the
 * NNLIB documentation.
 *
 *   p_out        - output buffer (declared __restrict__)
 *   p_out_shape  - shape array for the output
 *   p_inp1       - first input buffer (declared __restrict__)
 *   p_inp1_shape - shape array for the first input
 *   p_inp2       - second input buffer (declared __restrict__)
 *   p_inp2_shape - shape array for the second input
 *
 * Returns a WORD32; presumably a status code (0 on success) -- TODO confirm.
 * NOTE(review): the "4D" in the name suggests each shape array holds four
 * entries -- verify before relying on it.
 */
extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const WORD32* const p_out_shape,
    const FLOAT32* __restrict__ p_inp1,
    const WORD32* const p_inp1_shape,
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape);

/*
 * NNLIB kernel declaration (C linkage); the definition is not in this header.
 *
 * Judging by its name, this performs an element-wise float32 division of
 * p_inp1 by p_inp2 with broadcasting over 4-D shapes -- confirm operand
 * order and broadcasting rules against the NNLIB documentation.
 *
 *   p_out        - output buffer (declared __restrict__)
 *   p_out_shape  - shape array for the output
 *   p_inp1       - first input buffer (declared __restrict__)
 *   p_inp1_shape - shape array for the first input
 *   p_inp2       - second input buffer (declared __restrict__)
 *   p_inp2_shape - shape array for the second input
 *
 * Returns a WORD32; presumably a status code (0 on success) -- TODO confirm.
 */
extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const WORD32* const p_out_shape,
    const FLOAT32* __restrict__ p_inp1,
    const WORD32* const p_inp1_shape,
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape);

/*
 * NNLIB kernel declaration (C linkage); the definition is not in this header.
 *
 * Judging by its name, this is an element-wise float32 division over flat
 * buffers (no shape arguments) with a selectable `mode` -- confirm against
 * the NNLIB documentation.
 *
 *   p_out   - output buffer (declared __restrict__)
 *   p_inp1  - first input buffer (declared __restrict__)
 *   p_inp2  - second input buffer (declared __restrict__)
 *   num_elm - element count; presumably the length of all three buffers
 *   mode    - selects the division variant.
 *             NOTE(review): likely a rounding mode; the valid values are not
 *             visible from this header -- verify against the implementation.
 *
 * Returns a WORD32; presumably a status code (0 on success) -- TODO confirm.
 */
extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const FLOAT32* __restrict__ p_inp1,
    const FLOAT32* __restrict__ p_inp2,
    WORD32 num_elm,
    WORD32 mode);

/*
 * NNLIB kernel declaration (C linkage); the definition is not in this header.
 *
 * Judging by its name, this is an element-wise float32 division with
 * broadcasting over 4-D shapes and a selectable `mode` -- confirm against
 * the NNLIB documentation.
 *
 *   p_out        - output buffer (declared __restrict__)
 *   p_out_shape  - shape array for the output
 *   p_inp1       - first input buffer (declared __restrict__)
 *   p_inp1_shape - shape array for the first input
 *   p_inp2       - second input buffer (declared __restrict__)
 *   p_inp2_shape - shape array for the second input
 *   mode         - selects the division variant.
 *                  NOTE(review): likely a rounding mode; the valid values
 *                  are not visible from this header -- verify.
 *
 * Returns a WORD32; presumably a status code (0 on success) -- TODO confirm.
 */
extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const WORD32* const p_out_shape,
    const FLOAT32* __restrict__ p_inp1,
    const WORD32* const p_inp1_shape,
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape,
    WORD32 mode);

/*
 * NNLIB kernel declaration (C linkage); the definition is not in this header.
 *
 * Judging by its name, this performs an element-wise float32 multiplication
 * of p_inp1 and p_inp2 with broadcasting over 4-D shapes -- confirm against
 * the NNLIB documentation.
 *
 *   p_out        - output buffer (declared __restrict__)
 *   p_out_shape  - shape array for the output
 *   p_inp1       - first input buffer (declared __restrict__)
 *   p_inp1_shape - shape array for the first input
 *   p_inp2       - second input buffer (declared __restrict__)
 *   p_inp2_shape - shape array for the second input
 *
 * Returns a WORD32; presumably a status code (0 on success) -- TODO confirm.
 */
extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const WORD32* const p_out_shape,
    const FLOAT32* __restrict__ p_inp1,
    const WORD32* const p_inp1_shape,
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape);

namespace impl {
namespace HiFi {
namespace kernels {

// Kernel-local memcpy, declared here and defined elsewhere. Unlike ::memcpy
// it returns void. Presumably copies `num_bytes` bytes from `src` to `dst` --
// confirm overlap/alignment requirements against the implementation.
void memcpy(void* dst, const void* src, size_t num_bytes);

// Matrix multiply on asymmetric-quantized uint8 data: the weight matrix, the
// input and the output are all UWORD8, and the bias is WORD32.
//
// Returns a WORD32; presumably a status code (0 on success) -- TODO confirm
// against the implementation.
//
// NOTE(review): `vec_count` and `vec_offset` were previously documented in
// terms of `p_mat2`, which is not a parameter of this function. They
// presumably describe `p_vec1` (the only other matrix operand) -- confirm.
WORD32 matmul_asym8uxasym8u_asym8u(
    UWORD8* __restrict__ p_out, // output uint8 matrix
    const UWORD8* __restrict__ p_mat1, // weight uint8 matrix
    const UWORD8* __restrict__ p_vec1, // input uint8 matrix
    const WORD32* __restrict__ p_bias, // bias int32 vec
    WORD32 rows, // rows of p_mat1
    WORD32 cols1, // columns of p_mat1
    WORD32 row_stride1, // row stride of p_mat1
    WORD32 vec_count, // rows of p_vec1 (see NOTE above)
    WORD32 vec_offset, // vec_offset of p_vec1 (see NOTE above)
    WORD32 out_offset, // out_offset, i.e., offset of next output element
    WORD32 out_stride, // out_stride, i.e., stride to go to next output row
    WORD32 mat1_zero_bias, // zero_point of p_mat1
    WORD32 vec1_zero_bias, // zero_point of p_vec1
    const WORD32* __restrict__ out_multiplier,
    const WORD32* __restrict__ out_shift,
    WORD32 out_zero_bias,
    bool per_channel_quantized = false); // per-channel quantized weight

// Quantize a single fp32 value `x` to type T using `scale` and `zero_point`.
// The exact rounding and clamping rules live in the definition, which is not
// part of this header.
template <typename T>
T quantize(const float x, float scale, int32_t zero_point);

// Dequantize a single value `x` of type T to fp32 using `scale` and
// `zero_point`.
template <typename T>
float dequantize(const T x, float scale, int32_t zero_point);

// Quantize an fp32 array `x` of `size` elements into the array `y` of type T.
// `x` and `y` are declared __restrict__, so they must not alias.
// NOTE(review): supported T are presumably the same as for the array
// dequantize below (int8_t/uint8_t/int16_t) -- confirm against the
// explicit instantiations.
template <typename T>
void quantize(
    T* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size);

// Dequantize an int8_t/uint8_t/int16_t array `x` of `size` elements to the
// fp32 array `y`. `x` and `y` are declared __restrict__, so they must not
// alias.
template <typename T>
void dequantize(
    float* __restrict__ y,
    const T* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size);

} // namespace kernels
} // namespace HiFi
} // namespace impl