diff --git a/kernels/portable/cpu/op__clone_dim_order.cpp b/kernels/portable/cpu/op__clone_dim_order.cpp
index 83045768cf2..9a4f68bdc46 100644
--- a/kernels/portable/cpu/op__clone_dim_order.cpp
+++ b/kernels/portable/cpu/op__clone_dim_order.cpp
@@ -10,6 +10,9 @@
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <algorithm>
+#include <cstring>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,6 +22,30 @@ using Tensor = executorch::aten::Tensor;
 template <typename T>
 using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
 
+namespace {
+
+/**
+ * Checks the conditions for the fast-path direct memcpy, which can be used
+ * when the output dim order is unchanged.
+ */
+bool check_fast_path_conditions(
+    const Tensor& in,
+    OptionalArrayRef<int64_t> dim_order) {
+  if (!dim_order.has_value()) {
+    // No dim order means preserve the input dim order.
+    return true;
+  }
+
+  auto input_dim_order = in.dim_order();
+  return std::equal(
+      dim_order.value().begin(),
+      dim_order.value().end(),
+      input_dim_order.begin(),
+      input_dim_order.end());
+}
+
+} // namespace
+
 /**
  * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
  * dim_order=None, Tensor(a!) out) -> Tensor(a!)
@@ -55,13 +82,18 @@ Tensor& _clone_dim_order_out(
     return out;
   }
 
-  // Select the correct input dtype and copy the tensors.
-  ET_SWITCH_REALHBBF16_TYPES(
-      self.scalar_type(),
-      ctx,
-      "dim_order_ops::_clone_dim_order.out",
-      CTYPE,
-      [&] { _to_dim_order_copy_impl<CTYPE>(self, out); });
+  // Dispatch to the fast path if we can use a direct memcpy.
+  if (check_fast_path_conditions(self, dim_order)) {
+    std::memcpy(out.mutable_data_ptr(), self.const_data_ptr(), self.nbytes());
+  } else {
+    // Select the correct input dtype and copy the tensors.
+    ET_SWITCH_REALHBBF16_TYPES(
+        self.scalar_type(),
+        ctx,
+        "dim_order_ops::_clone_dim_order.out",
+        CTYPE,
+        [&] { _to_dim_order_copy_impl<CTYPE>(self, out); });
+  }
 
   return out;
 }
@@ -77,4 +109,4 @@
 
 } // namespace native
 } // namespace executor
-} // namespace torch
\ No newline at end of file
+} // namespace torch
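
Not part of the patch above: a minimal standalone sketch of the fast-path condition, showing that the direct memcpy is taken only when no dim_order is requested or when the requested order matches the input's. The function name can_use_memcpy and the plain std containers are hypothetical stand-ins for ExecuTorch's Tensor::dim_order() and OptionalArrayRef<int64_t>, chosen so the snippet compiles on its own.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical stand-in for check_fast_path_conditions(); names and
// container types here are illustrative, not ExecuTorch APIs.
bool can_use_memcpy(
    const std::vector<uint8_t>& input_dim_order,
    const std::optional<std::vector<int64_t>>& requested_dim_order) {
  if (!requested_dim_order.has_value()) {
    // No dim order requested: the input order is preserved, so a raw copy is safe.
    return true;
  }
  // Element-wise comparison; the four-iterator std::equal also rejects
  // mismatched lengths.
  return std::equal(
      requested_dim_order->begin(),
      requested_dim_order->end(),
      input_dim_order.begin(),
      input_dim_order.end());
}

int main() {
  const std::vector<uint8_t> contiguous = {0, 1, 2, 3};

  std::cout << can_use_memcpy(contiguous, std::nullopt) << "\n";  // 1: fast path
  std::cout << can_use_memcpy(contiguous, std::vector<int64_t>{0, 1, 2, 3})
            << "\n";  // 1: fast path
  std::cout << can_use_memcpy(contiguous, std::vector<int64_t>{0, 2, 3, 1})
            << "\n";  // 0: falls back to the element-wise copy
  return 0;
}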