From e9a4489a1639dfd26766c05ac46d49c978396138 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 21 Mar 2025 17:29:56 -0700
Subject: [PATCH 1/4] Update

[ghstack-poisoned]
---
 kernels/aten/functions.yaml                   |  2 +
 kernels/portable/cpu/op_elu.cpp               | 62 ++++++++++++
 kernels/portable/functions.yaml               |  5 +
 kernels/test/CMakeLists.txt                   |  1 +
 kernels/test/op_elu_test.cpp                  | 95 +++++++++++++++++++
 kernels/test/targets.bzl                      |  1 +
 .../kernels/portable/op_registration_util.bzl |  7 ++
 7 files changed, 173 insertions(+)
 create mode 100644 kernels/portable/cpu/op_elu.cpp
 create mode 100644 kernels/test/op_elu_test.cpp
diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml
index 7069f9140ab..a8fa6611478 100644
--- a/kernels/aten/functions.yaml
+++ b/kernels/aten/functions.yaml
@@ -141,6 +141,8 @@
 
 - op: div.out_mode
 
+- op: elu.out
+
 - op: embedding.out
 
 - op: empty.out
diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp
new file mode 100644
index 00000000000..d4846fb1bfb
--- /dev/null
+++ b/kernels/portable/cpu/op_elu.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cmath>
+#include <type_traits>
+
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch::executor::native {
+// Portable elu.out kernel: checks in/out, then applies ELU elementwise.
+Tensor& elu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);
+  // NOTE(review): the check below repeats the dtype check made at the top.
+  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
+
+  static constexpr const char op_name[] = "elu.out";
+  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&]() {
+    using MathT = std::
+        conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
+    MathT math_alpha = 0; // The three Scalar arguments are read into MathT.
+    MathT math_scale = 0;
+    MathT math_input_scale = 0;
+    ET_EXTRACT_SCALAR(alpha, math_alpha);
+    ET_EXTRACT_SCALAR(scale, math_scale);
+    ET_EXTRACT_SCALAR(input_scale, math_input_scale);
+    const auto negcoef = math_alpha * math_scale; // Factor for the x <= 0 arm.
+    utils::apply_unitensor_elementwise_fn<CTYPE, op_name>(
+        [negcoef, math_scale, math_input_scale](auto x) {
+          return MathT(x) <= MathT(0)
+              ? std::expm1(MathT(x) * math_input_scale) * negcoef
+              : MathT(x) * math_scale;
+        },
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::FLOATHBF16,
+        out,
+        utils::SupportedTensorDtypes::SAME_AS_COMMON);
+  });
+  return out;
+}
+
+} // namespace torch::executor::native
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index 29dfe8b1a0c..5e45a210a70 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -329,6 +329,11 @@
     - arg_meta: null
       kernel_name: torch::executor::eq_tensor_out
 
+- op: elu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::elu_out
+
 - op: erf.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index b9f48f0c9a1..42578acbedd 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -135,6 +135,7 @@ set(all_test_sources
     "op_detach_copy_test.cpp"
     "op_diagonal_copy_test.cpp"
     "op_div_test.cpp"
+    "op_elu_test.cpp"
     "op_embedding_test.cpp"
     "op_empty_test.cpp"
     "op_eq_test.cpp"
diff --git a/kernels/test/op_elu_test.cpp b/kernels/test/op_elu_test.cpp
new file mode 100644
index 00000000000..73ee8ac31a7
--- /dev/null
+++ b/kernels/test/op_elu_test.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/kernels/test/supported_features.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+#include <gtest/gtest.h>
+
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::string_view;
+using executorch::aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpEluTest : public OperatorTest { // Fixture for the elu.out tests.
+ protected:
+  Tensor& op_elu_out(
+      const Tensor& self,
+      const Scalar& alpha,
+      const Scalar& scale,
+      const Scalar& input_scale,
+      Tensor& out) {
+    return torch::executor::aten::elu_outf(
+        context_, self, alpha, scale, input_scale, out);
+  }
+
+  template <ScalarType DTYPE>
+  void test_elu_execution() {
+    TensorFactory<DTYPE> tf;
+
+    const std::vector<int32_t> sizes = {3, 2};
+    // The input covers negative, zero and positive values.
+    Tensor in = tf.make(sizes, /*data=*/{-0.125, -0.25, -1, 0, 1.25, 100});
+
+    Tensor out = tf.zeros(sizes);
+
+    // Run elu with alpha=1.25, scale=1 and input_scale=1.
+    op_elu_out(in, 1.25, 1, 1, out);
+
+    // Check that it matches the expected output.
+    EXPECT_TENSOR_CLOSE(
+        out,
+        tf.make(
+            sizes,
+            /*data=*/
+            {-0.146879, -0.276499, -0.790151, 0, 1.25, 100}));
+  }
+
+  template <ScalarType DTYPE>
+  void test_integer_elu_dies() {
+    TensorFactory<DTYPE> tf;
+    // in and out share DTYPE here; the call is expected to fail.
+    Tensor in = tf.ones({1});
+    Tensor out = tf.ones({1});
+    ET_EXPECT_KERNEL_FAILURE(context_, op_elu_out(in, 1, 1, 1, out));
+  }
+};
+
+TEST_F(OpEluTest, Basic) {
+#define TEST_ENTRY(ctype, dtype) test_elu_execution<ScalarType::dtype>();
+  ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); // Presumably one call per dtype.
+#undef TEST_ENTRY
+}
+
+TEST_F(OpEluTest, UnhandledDtypeDies) {
+#define TEST_ENTRY(ctype, dtype) test_integer_elu_dies<ScalarType::dtype>();
+  ET_FORALL_INT_TYPES(TEST_ENTRY); // Presumably one call per integer dtype.
+#undef TEST_ENTRY
+}
+
+TEST_F(OpEluTest, MismatchedOutputDtypeDies) {
+  // Two different floating-point dtypes: the input is Float and the output
+  // is Double, so the call is expected to fail because the ScalarType of
+  // the output differs from that of the input.
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Double> tf_double;
+
+  const std::vector<int32_t> sizes = {2, 2};
+
+  Tensor a = tf_float.ones(sizes);
+
+  // Destination with a dtype different from the input.
+  Tensor out = tf_double.zeros(sizes);
+
+  ET_EXPECT_KERNEL_FAILURE(context_, op_elu_out(a, 1, 1, 1, out));
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 18ab0ac2e28..3824551a46b 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -215,6 +215,7 @@ def define_common_targets():
     _common_op_test("op_detach_copy_test", ["aten", "portable"])
     _common_op_test("op_diagonal_copy_test", ["aten", "portable"])
     _common_op_test("op_div_test", ["aten", "portable", "optimized"])
+    _common_op_test("op_elu_test", ["aten", "portable"])
     _common_op_test("op_embedding_test", ["aten", "portable"])
     _common_op_test("op_empty_test", ["aten", "portable"])
     _common_op_test("op_eq_test", ["aten", "portable"])
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index b56413b92f4..a1ffdc1eed3 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -482,6 +482,13 @@ ATEN_OPS = (
             ":scalar_utils",
         ],
     ),
+    op_target(
+        name = "op_elu",
+        deps = [
+            ":scalar_utils",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
+        ],
+    ),
     op_target(
         name = "op_embedding",
         deps = [

From e055ac0e5f392c64b748f6ba854855c60225d2fd Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 21 Mar 2025 17:30:00 -0700
Subject: [PATCH 2/4] Update

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_elu.cpp  | 102 ++++++++++++++++++++++++++++++
 kernels/optimized/cpu/targets.bzl |   8 +++
 kernels/optimized/optimized.yaml  |   5 ++
 kernels/test/CMakeLists.txt       |   1 +
 kernels/test/targets.bzl          |   2 +-
 5 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 kernels/optimized/cpu/op_elu.cpp

diff --git a/kernels/optimized/cpu/op_elu.cpp b/kernels/optimized/cpu/op_elu.cpp
new file mode 100644
index 00000000000..c7cd7aec653
--- /dev/null
+++ b/kernels/optimized/cpu/op_elu.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/native/cpu/Elu.h>
+
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace torch::executor::native {
+
+namespace {
+// Writes ELU(input) into out elementwise; chunks are run via parallel_for.
+template <typename CTYPE>
+void elu(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  const CTYPE* in_data = input.const_data_ptr<CTYPE>();
+  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
+  // Reduced-precision floating-point CTYPEs use float for the scalar params.
+  using MathT =
+      std::conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
+  MathT math_alpha = 0;
+  MathT math_scale = 0;
+  MathT math_input_scale = 0;
+  ET_EXTRACT_SCALAR(alpha, math_alpha);
+  ET_EXTRACT_SCALAR(scale, math_scale);
+  ET_EXTRACT_SCALAR(input_scale, math_input_scale);
+  const auto scalar_func =
+      at::native::get_scalar_elu_elementwise_func<CTYPE, MathT>(
+          math_alpha, math_scale, math_input_scale);
+  const auto vec_func = at::native::get_vectorized_elu_elementwise_func<CTYPE>(
+      math_alpha, math_scale, math_input_scale);
+  ::executorch::extension::parallel_for(
+      0,
+      out.numel(),
+      ::executorch::extension::internal::GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
+        using Vec = at::vec::Vectorized<CTYPE>;
+        // Clamp aligned bounds so small chunks stay within [begin, end).
+        const auto aligned_begin =
+            begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+        const auto vec_begin = aligned_begin < end ? aligned_begin : end;
+        const auto aligned_end = end - (end % Vec::size());
+        const auto vec_end = aligned_end > vec_begin ? aligned_end : vec_begin;
+        // Scalar prologue: [begin, vec_begin).
+        for (const auto idx : c10::irange(begin, vec_begin)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+        // Main vectorized loop: [vec_begin, vec_end), Vec::size() per step.
+        for (auto idx = vec_begin; idx < vec_end; idx += Vec::size()) {
+          vec_func(Vec::loadu(&in_data[idx])).store(&out_data[idx]);
+        }
+        // Scalar epilogue: [vec_end, end).
+        for (const auto idx : c10::irange(vec_end, end)) {
+          out_data[idx] = scalar_func(in_data[idx]);
+        }
+      });
+}
+} // namespace
+
+// Optimized elu.out: validates input/out, resizes out, dispatches on dtype.
+Tensor& opt_elu_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Scalar& alpha,
+    const Scalar& scale,
+    const Scalar& input_scale,
+    Tensor& out) {
+  // The dtype match is checked once; it gates the typed dispatch below.
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_dtype(input, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      context,
+      resize_tensor(out, input.sizes()) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_dim_order(input, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      context, tensor_is_floating_type(input), InvalidArgument, out);
+
+  ET_SWITCH_FLOATHBF16_TYPES(
+      input.scalar_type(), context, "elu.out", CTYPE, [&]() {
+        elu<CTYPE>(context, input, alpha, scale, input_scale, out);
+      });
+  return out;
+}
+
+} // namespace torch::executor::native
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index b868a5901fd..5fd7b74d33e 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -25,6 +25,14 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
     ),
+    op_target(
+        name = "op_elu",
+        deps = [
+            "//executorch/extension/threadpool:threadpool",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
+        ],
+    ),
     op_target(name = "op_exp"),
     op_target(
         name = "op_fft_r2c",
diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml
index 4f90059aa93..864c3ed5780 100644
--- a/kernels/optimized/optimized.yaml
+++ b/kernels/optimized/optimized.yaml
@@ -37,6 +37,11 @@
     - arg_meta: null
       kernel_name: torch::executor::opt_div_scalar_out
 
+- op: elu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_elu_out
+
 - op: exp.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 42578acbedd..2d497dfc124 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -274,6 +274,7 @@ set(_optimized_kernels_test_sources
     "op_add_test.cpp"
     "op_bmm_test.cpp"
     "op_div_test.cpp"
+    "op_elu_test.cpp"
     "op_exp_test.cpp"
     "op_fft_r2c_test.cpp"
     "op_gelu_test.cpp"
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 3824551a46b..05e678c6229 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -215,7 +215,7 @@ def define_common_targets():
     _common_op_test("op_detach_copy_test", ["aten", "portable"])
     _common_op_test("op_diagonal_copy_test", ["aten", "portable"])
     _common_op_test("op_div_test", ["aten", "portable", "optimized"])
-    _common_op_test("op_elu_test", ["aten", "portable"])
+    _common_op_test("op_elu_test", ["aten", "portable", "optimized"])
     _common_op_test("op_embedding_test", ["aten", "portable"])
     _common_op_test("op_empty_test", ["aten", "portable"])
     _common_op_test("op_eq_test", ["aten", "portable"])

From 84778c7083dd0940341c0c9894be5936abe73540 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 25 Mar 2025 08:56:58 -0700
Subject: [PATCH 3/4] Update

[ghstack-poisoned]
---
 exir/dialects/edge/op/sample_input.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/exir/dialects/edge/op/sample_input.py b/exir/dialects/edge/op/sample_input.py
index 23d87053c9e..449f7476be5 100644
--- a/exir/dialects/edge/op/sample_input.py
+++ b/exir/dialects/edge/op/sample_input.py
@@ -424,6 +424,15 @@
         ],
         "returns": [Return(ArgType.Tensor)],
     },
+    "elu.default": {  # (Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+        "args": [
+            InArg(ArgType.Tensor),
+            InArg(ArgType.Scalar),
+            InArg(ArgType.Scalar),
+            InArg(ArgType.Scalar),
+        ],
+        "returns": [Return(ArgType.Tensor)],
+    },
     "embedding.default": {  # (Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
         "args": [
             InArg(ArgType.Tensor),

From 58b6a5b9280cf5996c44067b79e4bdb7c885eb67 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 25 Mar 2025 08:57:03 -0700
Subject: [PATCH 4/4] Update

[ghstack-poisoned]
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 install_requirements.py               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index b17dd3f8f95..ee800549518 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-295f2ed4d103017f7e19a7b8263ece606cd629db
+59d5cf083b4f860dea76fe8936076177f9367f10
diff --git a/install_requirements.py b/install_requirements.py
index 0331f76522a..ba76106502f 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250311"
+NIGHTLY_VERSION = "dev20250325"
 
 
 def install_requirements(use_pytorch_nightly):
@@ -80,7 +80,7 @@ def install_requirements(use_pytorch_nightly):
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        f"torch==2.7.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
+        f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
         (
             f"torchvision==0.22.0.{NIGHTLY_VERSION}"
             if use_pytorch_nightly