Skip to content

Commit 6b4eacd

Browse files
authored
[NPU] apply npu storage format into npu_op_runner, test=develop (PaddlePaddle#290)
1 parent 53a6528 commit 6b4eacd

8 files changed

Lines changed: 171 additions & 97 deletions

File tree

.pre-commit-config.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,6 @@ repos:
3535
hooks:
3636
- id: black
3737
files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
38-
- repo: https://github.com/pycqa/isort
39-
rev: 5.11.2
40-
hooks:
41-
- id: isort
4238
- repo: https://github.com/PyCQA/flake8
4339
rev: 4.0.1
4440
hooks:

backends/npu/kernels/batch_norm_kernel.cc

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ void BatchNormKernel(const Context& dev_ctx,
5050
data_layout_str,
5151
FLAGS_npu_storage_format));
5252

53-
if (FLAGS_npu_storage_format) {
53+
if (FLAGS_npu_storage_format &&
54+
x_dims.size() == 4) { // TODO(qili93): add 3D support
5455
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, y);
5556
} else {
5657
dev_ctx.template Alloc<T>(y);
@@ -111,7 +112,8 @@ void BatchNormKernel(const Context& dev_ctx,
111112
{{"epsilon", epsilon}});
112113
runner_infer.Run(stream);
113114
} else {
114-
if (FLAGS_npu_storage_format) {
115+
if (FLAGS_npu_storage_format &&
116+
x_dims.size() == 4) { // TODO(qili93): add 3D support
115117
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, mean_out);
116118
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, variance_out);
117119
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, saved_mean);
@@ -123,12 +125,16 @@ void BatchNormKernel(const Context& dev_ctx,
123125
dev_ctx.template Alloc<float>(saved_variance);
124126
}
125127

128+
// BN3DTrainingReduce throws an "output size mismatch" error if the output
129+
// tensor is in NCHW format; the output tensor format should match the input
130+
// tensor format (NCDHW or NDHWC)
126131
phi::DenseTensorMeta meta = {
127-
phi::DataType::FLOAT32, mean_out->dims(), x.layout()};
132+
phi::DataType::FLOAT32, mean_out->dims(), x_tensor.layout()};
128133
phi::DenseTensor sum, square_sum;
129134
sum.set_meta(meta);
130135
square_sum.set_meta(meta);
131-
if (FLAGS_npu_storage_format) {
136+
if (FLAGS_npu_storage_format &&
137+
x_dims.size() == 4) { // TODO(qili93): add 3D support
132138
AllocNPUTensor<float>(dev_ctx, ACL_FORMAT_NC1HWC0, &sum);
133139
AllocNPUTensor<float>(dev_ctx, ACL_FORMAT_NC1HWC0, &square_sum);
134140
} else {
@@ -138,19 +144,43 @@ void BatchNormKernel(const Context& dev_ctx,
138144

139145
std::string reduce_name =
140146
(x.dims().size() == 5) ? "BN3DTrainingReduce" : "BNTrainingReduce";
141-
const auto& runner_reduce = NpuOpRunner(
142-
reduce_name, {x_tensor}, {sum, square_sum}, {{"epsilon", epsilon}});
143-
runner_reduce.Run(stream);
147+
NpuOpRunner runner_reduce;
148+
runner_reduce.SetType(reduce_name)
149+
.AddInput(x_tensor)
150+
.AddOutput(sum)
151+
.AddOutput(square_sum)
152+
.AddAttrs({{"epsilon", epsilon}})
153+
.Run(stream);
154+
155+
// BN3DTrainingUpdate throws an "output size mismatch" error if the output
156+
// tensor is in NCHW format; the output tensor format should match the input
157+
// tensor format (NCDHW or NDHWC)
158+
if (x_dims.size() == 5) {
159+
mean_out->set_meta(meta);
160+
variance_out->set_meta(meta);
161+
saved_mean->set_meta(meta);
162+
saved_variance->set_meta(meta);
163+
}
144164

145165
std::string update_name =
146166
(x.dims().size() == 5) ? "BN3DTrainingUpdate" : "BNTrainingUpdate";
147-
const auto& runner_update = NpuOpRunner(
148-
update_name,
149-
{x_tensor, sum, square_sum, scale, bias, running_mean, running_var},
150-
{y_tensor, *mean_out, *variance_out, *saved_mean, *saved_variance},
151-
{{"factor", static_cast<float>(momentum)},
152-
{"epsilon", static_cast<float>(epsilon)}});
153-
runner_update.Run(stream);
167+
NpuOpRunner runner_update;
168+
runner_update.SetType(update_name)
169+
.AddInput(x_tensor)
170+
.AddInput(sum)
171+
.AddInput(square_sum)
172+
.AddInput(scale)
173+
.AddInput(bias)
174+
.AddInput(running_mean)
175+
.AddInput(running_var)
176+
.AddOutput(y_tensor)
177+
.AddOutput(*mean_out)
178+
.AddOutput(*variance_out)
179+
.AddOutput(*saved_mean)
180+
.AddOutput(*saved_variance)
181+
.AddAttrs({{"epsilon", static_cast<float>(epsilon)}})
182+
.AddAttrs({{"factor", static_cast<float>(momentum)}})
183+
.Run(stream);
154184
}
155185
}
156186

@@ -246,7 +276,8 @@ void BatchNormGradKernel(
246276

247277
auto stream = dev_ctx.stream();
248278
if (d_scale && d_bias) {
249-
if (FLAGS_npu_storage_format) {
279+
if (FLAGS_npu_storage_format &&
280+
x_dims.size() == 4) { // TODO(qili93): add 3D support
250281
AllocNPUTensor<float>(dev_ctx, ACL_FORMAT_NC1HWC0, d_scale);
251282
AllocNPUTensor<float>(dev_ctx, ACL_FORMAT_NC1HWC0, d_bias);
252283
} else {
@@ -271,7 +302,8 @@ void BatchNormGradKernel(
271302
}
272303

273304
if (d_x) {
274-
if (FLAGS_npu_storage_format) {
305+
if (FLAGS_npu_storage_format &&
306+
x_dims.size() == 4) { // TODO(qili93): add 3D support
275307
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, d_x);
276308
} else {
277309
dev_ctx.template Alloc<T>(d_x);
@@ -332,6 +364,9 @@ void BatchNormInferKernel(const Context& dev_ctx,
332364
const auto& x_dims = x.dims();
333365
const bool channel_last = data_layout_str == "NHWC" && x_dims.size() > 2;
334366

367+
VLOG(1) << "0 -- BatchNormInferKernel: Attr <channel_last> = "
368+
<< channel_last;
369+
335370
PADDLE_ENFORCE_EQ(
336371
channel_last && FLAGS_npu_storage_format,
337372
false,
@@ -343,7 +378,8 @@ void BatchNormInferKernel(const Context& dev_ctx,
343378
data_layout_str,
344379
FLAGS_npu_storage_format));
345380

346-
if (FLAGS_npu_storage_format) {
381+
if (FLAGS_npu_storage_format &&
382+
x_dims.size() == 4) { // TODO(qili93): add 3D support
347383
AllocNPUTensor<T>(dev_ctx, ACL_FORMAT_NC1HWC0, y);
348384
} else {
349385
dev_ctx.template Alloc<T>(y);

backends/npu/kernels/funcs/npu_op_prepare.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,12 @@ namespace custom_kernel {
3131
inline std::string DebugNPUTensor(const phi::DenseTensor& tensor) {
3232
std::stringstream ss;
3333
if (tensor.initialized()) {
34-
ss << ": format: " << tensor.layout() << ", dims: [" << tensor.dims() << "]"
34+
ss << ": dtype: " << tensor.dtype() << ", format: " << tensor.layout()
35+
<< ", dims: [" << tensor.dims() << "]"
3536
<< ", capacity: <" << tensor.capacity() << ">, ";
3637
} else {
37-
ss << ": format: " << tensor.layout() << ", dims: [" << tensor.dims()
38-
<< "]";
38+
ss << ": dtype: " << tensor.dtype() << ", format: " << tensor.layout()
39+
<< ", dims: [" << tensor.dims() << "]";
3940
}
4041

4142
if (!tensor.storage_properties_initialized()) {

backends/npu/kernels/funcs/npu_op_runner.cc

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414

1515
#include "kernels/funcs/npu_op_runner.h"
1616

17-
#include <map>
18-
1917
#include "acl/acl_op_compiler.h"
2018
#include "kernels/funcs/npu_enforce.h"
2119
#include "kernels/funcs/npu_funcs.h"
@@ -311,24 +309,49 @@ std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
311309

312310
aclTensorDesc *NpuOpRunner::CreateTensorDesc(phi::DenseTensor tensor,
313311
aclMemType mem_type) {
314-
auto dtype = ConvertToNpuDtype(tensor.dtype());
315-
auto format = ConvertToNpuFormat(tensor.layout());
316-
auto dims = phi::vectorize(tensor.dims());
317-
int size = dims.size();
318-
319-
if (op_type_ == "DropOutGenMask" && size == 1 && *(dims.data()) == 1) {
320-
size = 0;
312+
auto data_type = ConvertToNpuDtype(tensor.dtype());
313+
auto origin_format = ConvertToNpuFormat(tensor.layout());
314+
auto origin_dims = phi::vectorize(tensor.dims());
315+
316+
auto origin_size = origin_dims.size();
317+
if (op_type_ == "DropOutGenMask" && origin_size == 1 &&
318+
*(origin_dims.data()) == 1) {
319+
origin_size = 0;
321320
}
322321

323-
VLOG(4) << "NPU dtype:" << dtype << " "
324-
<< "rank:" << dims.size() << " dims: " << tensor.dims()
325-
<< " format:" << format;
326-
327-
auto *desc = aclCreateTensorDesc(dtype, size, dims.data(), format);
322+
auto *desc = aclCreateTensorDesc(
323+
data_type, origin_size, origin_dims.data(), origin_format);
328324
PADDLE_ENFORCE_NOT_NULL(
329325
desc, phi::errors::External("Call aclCreateTensorDesc failed."));
330-
PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorFormat(desc, format));
331-
PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorShape(desc, size, dims.data()));
326+
327+
if (tensor.storage_properties_initialized()) {
328+
auto npu_properties =
329+
tensor.storage_properties<phi::NPUStorageProperties>();
330+
int64_t storage_format = npu_properties.storage_format;
331+
auto storage_dims = phi::vectorize(npu_properties.storage_dims);
332+
PADDLE_ENFORCE_NPU_SUCCESS(
333+
aclSetTensorFormat(desc, (aclFormat)storage_format));
334+
PADDLE_ENFORCE_NPU_SUCCESS(
335+
aclSetTensorShape(desc, storage_dims.size(), storage_dims.data()));
336+
VLOG(1) << "CreateTensorDesc for OP: " << op_type_
337+
<< ", data_type: " << data_type
338+
<< ", origin_format: " << origin_format
339+
<< ", storage_format: " << storage_format
340+
<< ", origin_dims: " << tensor.dims()
341+
<< ", storage_dims: " << npu_properties.storage_dims;
342+
} else {
343+
PADDLE_ENFORCE_NPU_SUCCESS(
344+
aclSetTensorFormat(desc, (aclFormat)origin_format));
345+
PADDLE_ENFORCE_NPU_SUCCESS(
346+
aclSetTensorShape(desc, origin_size, origin_dims.data()));
347+
VLOG(1) << "CreateTensorDesc for OP: " << op_type_
348+
<< ", data_type: " << data_type
349+
<< ", origin_format: " << origin_format
350+
<< ", storage_format: " << origin_format
351+
<< ", origin_dims: " << tensor.dims()
352+
<< ", storage_dims: " << tensor.dims();
353+
}
354+
332355
if (mem_type == ACL_MEMTYPE_HOST) {
333356
PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
334357
}

backends/npu/kernels/mean_all_kernel.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,31 @@ template <typename T, typename Context>
2121
void MeanAllKernel(const Context& dev_ctx,
2222
const phi::DenseTensor& x,
2323
phi::DenseTensor* out) {
24-
std::vector<int> axes;
25-
NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}};
24+
auto rank = x.dims().size();
25+
auto out_dims = out->dims();
2626
dev_ctx.template Alloc<T>(out);
27-
const auto& runner = NpuOpRunner("ReduceMeanD", {x}, {*out}, attr_input);
27+
if (rank == 0) { // scalar
28+
TensorCopy(dev_ctx, x, false, out);
29+
out->Resize(out_dims); // copy will reset the dims.
30+
return;
31+
}
32+
2833
auto stream = dev_ctx.stream();
29-
runner.Run(stream);
34+
35+
std::vector<int64_t> reduce_dims;
36+
reduce_dims.reserve(rank);
37+
for (decltype(rank) i = 0; i < rank; ++i) {
38+
reduce_dims.push_back(i);
39+
}
40+
41+
NpuOpRunner runner;
42+
runner.SetType("ReduceMean")
43+
.AddInput(x)
44+
.AddInput(dev_ctx, std::move(reduce_dims))
45+
.AddOutput(*out)
46+
.AddAttr("keep_dims", false)
47+
.AddAttr("noop_with_empty_axes", true)
48+
.Run(stream);
3049
}
3150

3251
template <typename T, typename Context>

0 commit comments

Comments
 (0)