forked from cad-audio/executorch
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathkernels.h
More file actions
105 lines (89 loc) · 4.07 KB
/
kernels.h
File metadata and controls
105 lines (89 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <inttypes.h>
#include <stddef.h>
#include <xa_type_def.h>
/* For NNLIB APIs */
#include "xa_nnlib_kernels_api.h"
/* Potential NNLIB function/APIs */
/*
 * C-linkage declaration of an NNLIB kernel; the definition is not in this
 * header.
 *
 * Parameters, as declared:
 *   p_out        - writable FLOAT32 output buffer (restrict-qualified)
 *   p_out_shape  - read-only WORD32 array describing the output shape
 *   p_inp1       - read-only FLOAT32 first input (restrict-qualified)
 *   p_inp1_shape - read-only WORD32 array describing the first input shape
 *   p_inp2       - read-only FLOAT32 second input (restrict-qualified)
 *   p_inp2_shape - read-only WORD32 array describing the second input shape
 *
 * Returns a WORD32 status/result code.
 *
 * NOTE(review): judging by the name, this is presumably an element-wise
 * float32 addition with broadcasting, and each shape array presumably holds
 * 4 entries - confirm against the NNLIB documentation. The meaning of the
 * return value is not visible here.
 */
extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape);
/*
 * C-linkage declaration of an NNLIB kernel; the definition is not in this
 * header.
 *
 * Parameters, as declared:
 *   p_out        - writable FLOAT32 output buffer (restrict-qualified)
 *   p_out_shape  - read-only WORD32 array describing the output shape
 *   p_inp1       - read-only FLOAT32 first input (restrict-qualified)
 *   p_inp1_shape - read-only WORD32 array describing the first input shape
 *   p_inp2       - read-only FLOAT32 second input (restrict-qualified)
 *   p_inp2_shape - read-only WORD32 array describing the second input shape
 *
 * Returns a WORD32 status/result code.
 *
 * NOTE(review): judging by the name, this is presumably an element-wise
 * float32 division (p_inp1 / p_inp2) with broadcasting over 4-entry shapes -
 * confirm operand order, shape length and return-value meaning against the
 * NNLIB documentation.
 */
extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape);
/*
 * C-linkage declaration of an NNLIB kernel; the definition is not in this
 * header. Unlike the *_broadcast_4D_* declarations in this file it takes a
 * flat element count instead of shape arrays.
 *
 * Parameters, as declared:
 *   p_out   - writable FLOAT32 output buffer (restrict-qualified)
 *   p_inp1  - read-only FLOAT32 first input (restrict-qualified)
 *   p_inp2  - read-only FLOAT32 second input (restrict-qualified)
 *   num_elm - WORD32 element count
 *   mode    - WORD32 mode selector
 *
 * Returns a WORD32 status/result code.
 *
 * NOTE(review): presumably an element-wise float32 division in which `mode`
 * selects how the quotient is rounded - the accepted `mode` values and the
 * return-value meaning are not visible here; confirm with the definition.
 */
extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out,
const FLOAT32 * __restrict__ p_inp1,
const FLOAT32 * __restrict__ p_inp2,
WORD32 num_elm,
WORD32 mode);
/*
 * C-linkage declaration of an NNLIB kernel; the definition is not in this
 * header. Same parameter list as xa_nn_elm_div_broadcast_4D_f32xf32_f32 plus
 * a trailing `mode` argument.
 *
 * Parameters, as declared:
 *   p_out        - writable FLOAT32 output buffer (restrict-qualified)
 *   p_out_shape  - read-only WORD32 array describing the output shape
 *   p_inp1       - read-only FLOAT32 first input (restrict-qualified)
 *   p_inp1_shape - read-only WORD32 array describing the first input shape
 *   p_inp2       - read-only FLOAT32 second input (restrict-qualified)
 *   p_inp2_shape - read-only WORD32 array describing the second input shape
 *   mode         - WORD32 mode selector
 *
 * Returns a WORD32 status/result code.
 *
 * NOTE(review): presumably a broadcasting element-wise float32 division in
 * which `mode` selects how the quotient is rounded - the accepted `mode`
 * values, the shape-array length and the return-value meaning are not
 * visible here; confirm with the definition.
 */
extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape,
WORD32 mode);
/*
 * C-linkage declaration of an NNLIB kernel; the definition is not in this
 * header.
 *
 * Parameters, as declared:
 *   p_out        - writable FLOAT32 output buffer (restrict-qualified)
 *   p_out_shape  - read-only WORD32 array describing the output shape
 *   p_inp1       - read-only FLOAT32 first input (restrict-qualified)
 *   p_inp1_shape - read-only WORD32 array describing the first input shape
 *   p_inp2       - read-only FLOAT32 second input (restrict-qualified)
 *   p_inp2_shape - read-only WORD32 array describing the second input shape
 *
 * Returns a WORD32 status/result code.
 *
 * NOTE(review): judging by the name, this is presumably an element-wise
 * float32 multiplication with broadcasting over 4-entry shapes - confirm
 * shape length and return-value meaning against the NNLIB documentation.
 */
extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
const WORD32 *const p_out_shape,
const FLOAT32 * __restrict__ p_inp1,
const WORD32 *const p_inp1_shape,
const FLOAT32 * __restrict__ p_inp2,
const WORD32 *const p_inp2_shape);
namespace impl {
namespace HiFi {
namespace kernels {

// Declarations only: every function below is defined in another translation
// unit, so the notes here describe the signatures, not the implementations.

// Byte copy taking a destination, a source and a length in bytes.
// Being declared inside impl::HiFi::kernels, it is a distinct function from
// the C library's ::memcpy.
// NOTE(review): presumably follows the standard memcpy contract (no
// overlapping regions) - confirm with the definition.
void memcpy(void* dst, const void* src, size_t num_bytes);

// Quantized uint8 x uint8 -> uint8 matrix multiplication.
//
// The per-parameter comments are the original author's, except for
// vec_count / vec_offset: those originally referred to a "p_mat2", which is
// not a parameter of this function. They presumably describe p_vec1 (the
// second operand) - TODO confirm against the definition.
//
// Returns a WORD32 status/result code; its meaning is not visible here.
WORD32 matmul_asym8uxasym8u_asym8u(
    UWORD8* __restrict__ p_out, // output uint8 matrix
    const UWORD8* __restrict__ p_mat1, // weight uint8 matrix
    const UWORD8* __restrict__ p_vec1, // input uint8 matrix
    const WORD32* __restrict__ p_bias, // bias int32 vec
    WORD32 rows, // rows of p_mat1
    WORD32 cols1, // columns of p_mat1
    WORD32 row_stride1, // row stride of p_mat1
    WORD32 vec_count, // rows of the second operand (presumably p_vec1)
    WORD32 vec_offset, // vec_offset of the second operand (presumably p_vec1)
    WORD32 out_offset, // out_offset, i.e., offset of next output element
    WORD32 out_stride, // out_stride, i.e., stride to go to next output row
    WORD32 mat1_zero_bias, // zero_point of p_mat1
    WORD32 vec1_zero_bias, // zero_point of p_vec1
    const WORD32* __restrict__ out_multiplier, // passed by pointer
    const WORD32* __restrict__ out_shift, // passed by pointer
    WORD32 out_zero_bias, // presumably zero_point of p_out - TODO confirm
    bool per_channel_quantized = false); // per-channel quantized weight

// Scalar quantize: maps one float to a value of type T using the given
// scale and zero_point. The exact rounding/clamping rule lives in the
// definition, not in this header.
template <typename T>
T quantize(const float x, float scale, int32_t zero_point);

// Scalar dequantize: maps one value of type T back to float using the given
// scale and zero_point.
template <typename T>
float dequantize(const T x, float scale, int32_t zero_point);

// Array quantize: reads `size` floats from x and writes `size` values of
// type T to y. x and y are restrict-qualified, so they must not alias.
template <typename T>
void quantize(
    T* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size);

// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array.
// Reads `size` values from x and writes `size` floats to y; x and y are
// restrict-qualified, so they must not alias.
template <typename T>
void dequantize(
    float* __restrict__ y,
    const T* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size);

} // namespace kernels
} // namespace HiFi
} // namespace impl