From aef99e39fda769b919c94c7d2e309b8fa36aa831 Mon Sep 17 00:00:00 2001 From: Max Ren Date: Thu, 13 Mar 2025 23:59:56 -0700 Subject: [PATCH 1/5] [ExecuTorch][Weight Sharing] Track Named Data Store in EdgeProgramManager Pull Request resolved: https://github.com/pytorch/executorch/pull/9151 We enable Backends to return Named Data by adding NamedDataStoreOutput to the preprocess result. This is a completely BC change, as no backends with an implemented preprocess will see any change if nothing is explicitly implemented. For backend developers to leverage the new NamedDataStore, they can initialize a new NamedDataStore() within preprocess, add_named_data to the data store, and return the NamedDataStore.get_named_data_store_output() in the preprocess result like such: ``` def preprocess(ExportedProgram, List[CompileSpecs]) -> PreprocessResult: named_data_store = NamedDataStore() for node in exported_program.graph.nodes: named_data_store.add_named_data("name", bytes) return PreprocessResult( processed_bytes=bytes, debug_handle_map={}, data_store_output= named_data_store.get_named_data_store_output() ) ``` Under the hood, the data store output is embedded in the loweredbackendmodule, (serializing loweredbackendmodule by itself with the a named_data_store_output is still a todo). But via the EdgeProgramManager path, we add the named_data_store_outputs to the edge_program_manger's named data store to keep track of all the named data returned by backends. 
ghstack-source-id: 271732049 @exported-using-ghexport Differential Revision: [D70451660](https://our.internmc.facebook.com/intern/diff/D70451660/) --- exir/backend/backend_api.py | 1 + exir/backend/backend_details.py | 7 ++ exir/backend/test/TARGETS | 56 +++++++++ .../test/backend_with_named_data_map.py | 115 ++++++++++++++++++ .../test/test_backend_with_named_data_map.py | 83 +++++++++++++ exir/lowered_backend_module.py | 14 +++ exir/program/_program.py | 27 +++- 7 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 exir/backend/test/backend_with_named_data_map.py create mode 100644 exir/backend/test/test_backend_with_named_data_map.py diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 519f184871a..d5bd574ec5a 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -120,6 +120,7 @@ def to_backend( backend_id=backend_id, processed_bytes=preprocess_result.processed_bytes, compile_specs=compile_specs, + named_data_store_output=preprocess_result.data_store_output, ) lowered_module.meta = { "debug_handle_map": preprocess_result.debug_handle_map diff --git a/exir/backend/backend_details.py b/exir/backend/backend_details.py index bdbc1a1fafd..248d03f2b05 100644 --- a/exir/backend/backend_details.py +++ b/exir/backend/backend_details.py @@ -9,6 +9,8 @@ from typing import Dict, List, Optional, Tuple, Union +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput + from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -24,6 +26,11 @@ class PreprocessResult: debug_handle_map: Optional[Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]]] = ( None ) + # Data Store output created from NamedDataStore. + + # Named Data store contains all the named data that is stored in the PTE file, + # but retrieveable by delegates via the NamedDataMap at runtime. 
+ data_store_output: Optional[NamedDataStoreOutput] = None """ diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index b453f4c722a..f0ba618936d 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -38,6 +38,62 @@ python_library( ], ) +python_library( + name = "backend_with_named_data_map", + srcs = [ + "backend_with_named_data_map.py", + ], + visibility = [ + "//executorch/...", + "//executorch/test/...", + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + ], +) + +python_unittest( + name = "test_backend_with_named_data_map", + srcs = [ + "test_backend_with_named_data_map.py", + ], + visibility = [ + "//executorch/...", + "//executorch/test/...", + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ":backend_with_named_data_map", + ], +) + python_library( name = "qnn_backend_demo", srcs = [ diff --git a/exir/backend/test/backend_with_named_data_map.py 
b/exir/backend/test/backend_with_named_data_map.py new file mode 100644 index 00000000000..47dbc294133 --- /dev/null +++ b/exir/backend/test/backend_with_named_data_map.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, final, List, Tuple + +import torch +from executorch.exir._serialize._named_data_store import NamedDataStore + +from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, +) + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_control_flow_submodules +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.operator_support import OperatorSupportBase + + +# Backend details are final (cannot be subclassed). 
+@final +class BackendWithNamedDataMap(BackendDetails): + """ + Test Backend for Named Data Map Functionality + + This backend returns no processed_bytes, instead it uses + the named data store and serializes the name of the op + as the key and the data as its code value + """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + op_codes = { + exir_ops.edge.aten.sin.default: 0, + exir_ops.edge.aten.add.Tensor: 1, + exir_ops.edge.aten.sub.Tensor: 2, + exir_ops.edge.aten.mul.Tensor: 3, + exir_ops.edge.aten.div.Tensor: 4, + } + ndm = NamedDataStore() + for node in edge_program.graph.nodes: + if node.op == "call_function": + if node.target in op_codes.keys(): + ndm.add_named_data( + node.target.__name__, bytes(op_codes[node.target]) + ) + + return PreprocessResult( + processed_bytes=bytes(b""), + debug_handle_map={}, + data_store_output=ndm.get_named_data_store_output(), + ) + + +class SimpleOperatorSupport(OperatorSupportBase): + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + exir_ops.edge.aten.sin.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.div.Tensor, + ] + + +@final +class BackendWithNDMPartitioner(Partitioner): + def __init__(self) -> None: + self._op_support = SimpleOperatorSupport() + self.backend_id = BackendWithNamedDataMap.__name__ + + def _partition_gm( + self, graph_module: torch.fx.GraphModule, id_start: int = 0 + ) -> Tuple[int, Dict[str, DelegationSpec]]: + partition_tags: Dict[str, DelegationSpec] = {} + partition_list = generate_pattern_op_partitions( + graph_module, op_support=self._op_support + ) + + num_partitions_in_gm = len(partition_list) + for partition in partition_list: + curr_par_id = partition.id or 0 + delegation_tag = f"tag_{curr_par_id + id_start}" + for node in partition.nodes: + 
node.meta["delegation_tag"] = delegation_tag + delegation_spec = DelegationSpec(self.backend_id, []) + partition_tags[delegation_tag] = delegation_spec + + start_idx_for_submodules = num_partitions_in_gm + for _, submodule, _ in get_control_flow_submodules(graph_module): + start_idx_for_submodules, ret_partition_tags = self._partition_gm( + submodule, start_idx_for_submodules + ) + partition_tags.update(ret_partition_tags) + + return start_idx_for_submodules, partition_tags + + def partition(self, edge_program: ExportedProgram) -> PartitionResult: + _, partition_tags = self._partition_gm(edge_program.graph_module) + return PartitionResult( + tagged_exported_program=edge_program, + partition_tags=partition_tags, + ) diff --git a/exir/backend/test/test_backend_with_named_data_map.py b/exir/backend/test/test_backend_with_named_data_map.py new file mode 100644 index 00000000000..cc7aad641f0 --- /dev/null +++ b/exir/backend/test/test_backend_with_named_data_map.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch + +from executorch.exir import to_edge +from executorch.exir.backend.backend_api import to_backend + +from executorch.exir.backend.test.backend_with_named_data_map import ( + BackendWithNamedDataMap, + BackendWithNDMPartitioner, +) + + +class TestBackendWithNamedDataMap(unittest.TestCase): + def test_lowered_backend_module_has_output(self): + class M(torch.nn.Module): + def forward(self, x): + return x + x + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),))) + lowered = to_backend( + BackendWithNamedDataMap.__name__, ep.exported_program(), [] + ) + + buffer_entries = lowered.named_data_store_output.buffers + self.assertTrue(len(buffer_entries) == 1) + stored_data = lowered.named_data_store_output.pte_data + + self.assertTrue("aten.add.Tensor" in stored_data) + self.assertTrue(buffer_entries[0].buffer == bytes(1)) + + def test_named_data_with_partitioner(self): + class M(torch.nn.Module): + def forward(self, x): + y = x + x + y = torch.cos(y) + y = y + y + y = torch.sin(y) + return y - y + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),))) + ep.to_backend(BackendWithNDMPartitioner()) + + ndm_output = ep._named_data_store.get_named_data_store_output() + buffer_entries = ndm_output.buffers + stored_data = ndm_output.pte_data + self.assertEqual(len(buffer_entries), 3) + self.assertTrue("aten.add.Tensor" in stored_data) + self.assertTrue("aten.sub.Tensor" in stored_data) + self.assertTrue("aten.sin.default" in stored_data) + + def test_named_data_with_control_flow(self): + class M(torch.nn.Module): + def true_branch(self, x): + y = x * x + y = torch.cos(y) + return torch.sin(y) + + def false_branch(self, x): + return torch.sin(x) + + def forward(self, x, y): + z = x / y + z = torch.cond(z.sum() > 0, self.true_branch, self.false_branch, [x]) + return z - z + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2), torch.randn(1, 2)))) + ep.to_backend(BackendWithNDMPartitioner()) + + ndm_output = 
ep._named_data_store.get_named_data_store_output() + buffer_entries = ndm_output.buffers + stored_data = ndm_output.pte_data + self.assertEqual(len(buffer_entries), 4) + self.assertTrue("aten.sub.Tensor" in stored_data) + self.assertTrue("aten.div.Tensor" in stored_data) + self.assertTrue("aten.sin.default" in stored_data) + self.assertTrue("aten.mul.Tensor" in stored_data) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index dde6a397d9a..ed155555ef5 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -14,6 +14,7 @@ import torch import torch.utils._pytree as pytree from executorch.exir._serialize import _serialize_pte_binary +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.delegate import executorch_call_delegate, get_lowered_module_name from executorch.exir.emit import emit_program @@ -62,6 +63,9 @@ class LoweredBackendModule(torch.nn.Module): CompileSpec ] # A list of backend-specific objects with static metadata to configure the "compilation" process. 
_original_exported_program: ExportedProgram # The original EXIR module + _named_data_store_output: Optional[ + NamedDataStoreOutput + ] # Named Data serialized by the backend def __init__( self, @@ -69,12 +73,14 @@ def __init__( backend_id: str, processed_bytes: bytes, compile_specs: List[CompileSpec], + named_data_store_output: Optional[NamedDataStoreOutput] = None, ) -> None: super().__init__() self._original_exported_program = edge_program self._backend_id = backend_id self._processed_bytes = processed_bytes self._compile_specs = compile_specs + self._named_data_store_output = named_data_store_output # pyre-ignore def __deepcopy__(self, memo: Optional[Dict[int, Any]]) -> "LoweredBackendModule": @@ -134,6 +140,13 @@ def original_module(self) -> ExportedProgram: """ return self._original_exported_program + @property + def named_data_store_output(self) -> Optional[NamedDataStoreOutput]: + """ + Returns the Named Data Store Output + """ + return self._named_data_store_output + # TODO(chenlai): consolidate the seriailization config with serialize_to_flatbuffer api def buffer( self, @@ -154,6 +167,7 @@ def buffer( segment_alignment=segment_alignment, constant_tensor_alignment=constant_tensor_alignment, delegate_alignment=delegate_alignment, + named_data=self.named_data_store_output, ) ) return out diff --git a/exir/program/_program.py b/exir/program/_program.py index ed9dace34d1..c00c003263f 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -26,6 +26,7 @@ from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.partitioner import Partitioner from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig +from executorch.exir.delegate import executorch_call_delegate, is_lowered_module from executorch.exir.emit import emit_program, EmitterOutput from executorch.exir.emit._emitter import _DelegateDebugIdentifierMap from executorch.exir.error import ExportError @@ -1304,6 +1305,7 @@ def __init__( 
constant_methods: Optional[Dict[str, Any]] = None, compile_config: Optional[EdgeCompileConfig] = None, ops_set_to_not_decompose: Optional[List[torch._ops.OpOverload]] = None, + named_data_store: Optional[NamedDataStore] = None, ): """ Should not be called directly by users. User should use :func:'to_edge' instead. @@ -1327,7 +1329,7 @@ def __init__( self._edge_programs: Dict[str, ExportedProgram] = edge_programs self._config_methods = constant_methods - self._named_data_store = NamedDataStore() + self._named_data_store = named_data_store or NamedDataStore() @property def methods(self) -> Set[str]: @@ -1437,9 +1439,30 @@ def to_backend( for name, program in self._edge_programs.items(): new_edge_programs[name] = to_backend(program, partitioner) + # collected all the named data into the named data store for deduplication + def collect_named_data_store_outputs( + graph_module: torch.fx.GraphModule, + ) -> None: + for node in graph_module.graph.nodes: + if node.target == executorch_call_delegate: + lbm = getattr(graph_module, node.args[0].name) + assert is_lowered_module(lbm) + data_store_output = lbm.named_data_store_output + if data_store_output is not None: + self._named_data_store.merge_named_data_store(data_store_output) + + for _, submod, _ in get_control_flow_submodules(graph_module): + collect_named_data_store_outputs(submod) + + for _, program in new_edge_programs.items(): + collect_named_data_store_outputs(program.graph_module) + config = EdgeCompileConfig(_check_ir_validity=False) return EdgeProgramManager( - new_edge_programs, copy.deepcopy(self._config_methods), config + new_edge_programs, + copy.deepcopy(self._config_methods), + config, + named_data_store=self._named_data_store, ) @et_logger("to_executorch") From 6ab001983591f4fcd25809b174bf9f6a917bac4f Mon Sep 17 00:00:00 2001 From: Max Ren Date: Thu, 13 Mar 2025 23:59:57 -0700 Subject: [PATCH 2/5] [ExecuTorch][Weight Sharing][XNNPACK] load named data map data for xnnpack Pull Request resolved: 
https://github.com/pytorch/executorch/pull/9152 If data is serialized into the NamedDataMap, then we overload getConstantDataPtr to retrieve the data from the named data map. This should be done in a Backwards Compatible way. Meaning if no data is serialized into the named data map, then we are still loading the data from the flatbuffer payload. Since the runtime change here is being made before the AoT changes, All CI on this diff by itself should test that the changes made here are backwards compatitble. Note: We do not resolve Runtime Memory usage at this point. WeightCache will be implemented in the next diff. Meaning If we load via the same key across different methods, we still pack twice and allocate two instances for the packed weights. ghstack-source-id: 271732048 @exported-using-ghexport Differential Revision: [D70315209](https://our.internmc.facebook.com/intern/diff/D70315209/) --- backends/xnnpack/runtime/XNNCompiler.cpp | 56 +++++++++++++++---- backends/xnnpack/runtime/XNNCompiler.h | 1 + backends/xnnpack/runtime/XNNPACKBackend.cpp | 7 ++- .../xnnpack/serialization/runtime_schema.fbs | 9 +++ backends/xnnpack/targets.bzl | 1 + 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 8d8e9a13152..6a93ab73a2e 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #pragma clang diagnostic ignored "-Wmissing-prototypes" @@ -22,7 +22,9 @@ namespace xnnpack { namespace delegate { using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; /* @@ -48,6 +50,7 @@ class CompileAllocator { using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; using GraphPtr = const 
fb_xnnpack::XNNGraph*; +using ConstantDataOffsetPtr = const fb_xnnpack::ConstantDataOffset*; using DataType = fb_xnnpack::XNNDatatype; // Type for define node function. This is the function signature @@ -162,7 +165,9 @@ data associated with the tensor value, then returns nullptr. const uint8_t* getConstantDataPtr( const fb_xnnpack::XNNTensorValue* tensor_value, GraphPtr flatbuffer_graph, - const uint8_t* constant_data_ptr) { + const uint8_t* constant_data_ptr, + const NamedDataMap* named_data_map, + std::vector& loaded_buffers_from_map) { auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { @@ -171,10 +176,31 @@ const uint8_t* getConstantDataPtr( const auto& constant_buffer = *flatbuffer_graph->constant_buffer(); return constant_buffer[buffer_idx]->storage()->data(); } else { - const auto& constant_data_offsets = *flatbuffer_graph->constant_data(); - uint64_t constant_data_offset = - constant_data_offsets[buffer_idx]->offset(); - return constant_data_ptr + constant_data_offset; + ConstantDataOffsetPtr constant_data_offset = + flatbuffer_graph->constant_data()->Get(buffer_idx); + uint64_t offset = constant_data_offset->offset(); + + bool has_named_key = flatbuffers::IsFieldPresent( + constant_data_offset, fb_xnnpack::ConstantDataOffset::VT_NAMED_KEY); + // If there is no tensor name + if (!has_named_key) { + return constant_data_ptr + offset; + } else { + const std::string& data_name = constant_data_offset->named_key()->str(); + Result buffer = + named_data_map->get_data(data_name.c_str()); + if (!buffer.ok()) { + ET_LOG( + Error, + "Failed to get constant data for key %s", + data_name.c_str()); + return nullptr; + } + const uint8_t* data_ptr = + static_cast(buffer.get().data()); + loaded_buffers_from_map.push_back(std::move(buffer.get())); + return data_ptr; + } } } @@ -194,7 +220,9 @@ Error defineTensor( const uint8_t* constant_data_ptr, std::vector& input_ids, std::vector& output_ids, - CompileAllocator& allocator) { 
+ CompileAllocator& allocator, + const NamedDataMap* named_data_map, + std::vector& loaded_buffers_from_map) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -231,8 +259,12 @@ Error defineTensor( // Get Pointer to constant data from flatbuffer, if its non-constant // it is a nullptr - const uint8_t* buffer_ptr = - getConstantDataPtr(tensor_value, flatbuffer_graph, constant_data_ptr); + const uint8_t* buffer_ptr = getConstantDataPtr( + tensor_value, + flatbuffer_graph, + constant_data_ptr, + named_data_map, + loaded_buffers_from_map); xnn_status status; // The type we might have to convert to @@ -1968,6 +2000,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( size_t num_bytes, XNNExecutor* executor, MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map, xnn_workspace_t workspace) { Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; @@ -2036,6 +2069,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( std::vector input_ids; std::vector output_ids; Error err = Error::Ok; + std::vector loaded_buffers_from_map; for (auto value : *flatbuffer_graph->xvalues()) { err = defineTensor( subgraph.get(), @@ -2045,7 +2079,9 @@ ET_NODISCARD Error XNNCompiler::compileModel( constant_data, input_ids, output_ids, - compile_allocator); + compile_allocator, + named_data_map, + loaded_buffers_from_map); if (err != Error::Ok) { return err; diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index e66cb791ecb..3ea621a4d59 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ b/backends/xnnpack/runtime/XNNCompiler.h @@ -30,6 +30,7 @@ class XNNCompiler { size_t num_bytes, XNNExecutor* executor, executorch::runtime::MemoryAllocator* runtime_allocator, + const executorch::runtime::NamedDataMap* named_data_map, xnn_workspace_t workspace); }; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp 
b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 1938c5441a5..a01ba2da704 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include @@ -29,6 +29,7 @@ using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; class XnnpackBackend final : public ::executorch::runtime::BackendInterface { @@ -79,13 +80,14 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { return Error::MemoryAllocationFailed; } + const NamedDataMap* named_data_map = context.get_named_data_map(); + #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE // This is needed to serialize access to xnn_create_runtime which is not // thread safe. This can heppen when multiple threads call init() on // the same backend instance. const std::lock_guard lock(workspace_mutex_); #endif - // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the @@ -96,6 +98,7 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { processed->size(), executor, context.get_runtime_allocator(), + named_data_map, workspace_.get()); // This backend does not need its processed data after compiling the model. 
processed->Free(); diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 11cb48430ed..75074107c55 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -320,11 +320,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // unique string id used to query the offset from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. 
+ // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index b89a999bc3d..3fd9c433372 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -60,6 +60,7 @@ def define_common_targets(): "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/runtime/executor:pte_data_map" ], # XnnpackBackend.cpp needs to compile with executor as whole # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) From 5f05cf7da33593f3c91d6d65f604eccaa7464ca9 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 21:16:18 -0400 Subject: [PATCH 3/5] [ExecuTorch][Weight Sharing][XNNPACK] Serialize constant tensors into named data map (#9295) This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/9153 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/9/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/9/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/8/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/9/orig @diff-train-skip-merge Co-authored-by: Max Ren --- backends/xnnpack/_passes/TARGETS | 1 + .../_passes/fuse_batch_norm_with_conv.py | 68 ++++++++++++++----- backends/xnnpack/operators/node_visitor.py | 26 ++++--- backends/xnnpack/serialization/schema.fbs | 9 +++ .../serialization/xnnpack_graph_schema.py | 1 + .../xnnpack/utils/gen_xnnpack_constants.sh | 1 + backends/xnnpack/utils/utils.py | 16 +++++ backends/xnnpack/utils/xnnpack_constants.py | 6 +- backends/xnnpack/xnnpack_preprocess.py | 6 +- 9 files changed, 106 insertions(+), 28 deletions(-) diff --git a/backends/xnnpack/_passes/TARGETS b/backends/xnnpack/_passes/TARGETS index a199e1aab01..972980570ec 100644 --- a/backends/xnnpack/_passes/TARGETS +++ b/backends/xnnpack/_passes/TARGETS @@ -19,5 +19,6 @@ python_library( "//executorch/exir/passes:const_prop_pass", "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", + "//executorch/backends/transforms:utils", ], ) diff --git a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py index b0f4779eb4c..6f31fe698ba 100644 --- a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py +++ b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py @@ -7,13 +7,22 @@ import operator import torch +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass -from 
executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node +from executorch.backends.xnnpack.utils.utils import ( + get_param_tensor, + get_tensor_name, + is_param_node, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult +from torch.export.graph_signature import InputKind from torch.nn.utils.fusion import fuse_conv_bn_weights @@ -28,7 +37,7 @@ class FuseBatchNormWithConvPass(XNNPACKPass): def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph - counter = 0 + constant_placeholders_to_delete = set() for conv in graph.nodes: # We want to discover a chain of conv -> batch_norm. # Only proceed if the current node is a conv node, and has a single @@ -55,9 +64,11 @@ def call(self, graph_module: torch.fx.GraphModule): assert len(conv.args) == 9 conv_weight = get_param_tensor(self.exported_program, conv.args[1]) + conv_weight_name = get_tensor_name(self.exported_program, conv.args[1]) assert conv_weight is not None conv_bias = get_param_tensor(self.exported_program, conv.args[2]) + conv_bias_name = get_tensor_name(self.exported_program, conv.args[2]) # Get the parameters from the batchnorm op assert ( @@ -95,23 +106,43 @@ def call(self, graph_module: torch.fx.GraphModule): bn_bias, is_transpose, ) + fused_weight_name = (conv_weight_name + "_fused_bn").replace(".", "_") + if conv_bias_name == "": + fused_bias_name = (conv_weight_name + "_bias_fused_bn").replace( + ".", "_" + ) + else: + fused_bias_name = (conv_bias_name + "_fused_bn").replace(".", "_") # Modify the graph by updating the weight and bias of conv op # with the fused weight and bias params, and replacing all the users # of getitem(batchnorm) with the conv op. 
- with graph.inserting_before(conv): - fused_weight_name = f"_fused_with_bn_weight_{counter}" - graph_module.register_parameter(fused_weight_name, fused_weight) - fused_weight_node = graph.get_attr(fused_weight_name) - fused_bias_name = f"_fused_with_bn_bias_{counter}" - graph_module.register_parameter(fused_bias_name, fused_bias) - fused_bias_node = graph.get_attr(fused_bias_name) - - # Update the weight and bias of conv op - conv_args = list(conv.args) + ([None] if len(conv.args) == 2 else []) - conv_args[1] = fused_weight_node - conv_args[2] = fused_bias_node - conv.args = tuple(conv_args) + with graph.inserting_before(conv.args[1]): + fused_conv_weight_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_weight_name, + data=fused_weight, + ) + if fused_bias is not None: + fused_conv_bias_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_bias_name, + data=fused_bias, + ) + else: + fused_conv_bias_node = None + + conv.args = ( + conv.args[0], + fused_conv_weight_node, + fused_conv_bias_node, + *conv.args[3:], + ) + # Remove any use of batchnorm from the graph for user in bn.users.copy(): assert user.target == operator.getitem @@ -119,8 +150,13 @@ def call(self, graph_module: torch.fx.GraphModule): graph.erase_node(user) graph.erase_node(bn) + constant_placeholders_to_delete.update(conv.args[1:3] + bn.args[1:5]) - counter += 1 + if len(constant_placeholders_to_delete) > 0: + graph_module.graph.eliminate_dead_code() + for node in constant_placeholders_to_delete: + if (node is not None) and (len(node.users) == 0): + delete_constant_placeholder(self.exported_program, node) graph_module.recompile() # To Regenerate meta data and shape information, retrace module diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 0a825a94bef..ec39d287346 
100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -34,11 +34,16 @@ check_or_raise, get_input_node, get_param_tensor, + get_tensor_name, is_param_node, PERM_NCHW_TO_NHWC, ) -from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_INVALID_VALUE_ID +from executorch.backends.xnnpack.utils.xnnpack_constants import ( + UINT64_MAX, + XNN_INVALID_VALUE_ID, +) +from executorch.exir._serialize._named_data_store import NamedDataStore from torch.export import ExportedProgram XNN_TYPE_MAP = { @@ -46,8 +51,6 @@ } from executorch.backends.xnnpack.serialization.xnnpack_graph_serialize import ( - _aligned_size, - _pad_to, CONSTANT_TENSOR_ALIGNMENT, ) @@ -86,11 +89,11 @@ def __init__( self, exported_program: ExportedProgram, external_ids: Dict, - constant_data_bytes: bytearray, + named_data_store: NamedDataStore, ) -> None: self._external_ids = external_ids or {} self._exported_program = exported_program or None - self._constant_data_bytes = constant_data_bytes + self._named_data_store = named_data_store @property def external_ids(self) -> Dict: @@ -579,11 +582,16 @@ def get_serialized_buffer_index( ctypes.POINTER(array_type), ).contents - offset = len(self._constant_data_bytes) + named_key = get_tensor_name(self.exported_program, get_attr_node) + if named_key == "": + raise ValueError(f"Tensor from node: {get_attr_node} has no name") + size = const_val.untyped_storage().nbytes() - xnn_graph.constant_data.append(ConstantDataOffset(offset=offset, size=size)) - self._constant_data_bytes.extend( - _pad_to(bytes(array), _aligned_size(size, CONSTANT_TENSOR_ALIGNMENT)) + xnn_graph.constant_data.append( + ConstantDataOffset(offset=UINT64_MAX, size=size, named_key=named_key) + ) + self._named_data_store.add_named_data( + named_key, bytes(array), alignment=CONSTANT_TENSOR_ALIGNMENT ) return buffer_idx diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 
5a43481b98d..193656c30b1 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -316,11 +316,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // unique string id used to query the offset from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 3276dac7869..3cb572c66ef 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -470,6 +470,7 @@ class XValue: class ConstantDataOffset: offset: int size: int + named_key: str = "" @dataclass diff --git a/backends/xnnpack/utils/gen_xnnpack_constants.sh b/backends/xnnpack/utils/gen_xnnpack_constants.sh index 6be9d4519f3..5fa92e5b038 100644 --- a/backends/xnnpack/utils/gen_xnnpack_constants.sh +++ b/backends/xnnpack/utils/gen_xnnpack_constants.sh @@ -26,5 +26,6 @@ } > xnnpack_constants.py echo UINT32_MAX = 4294967295 >> xnnpack_constants.py +echo UINT64_MAX = 18446744073709551615 >> xnnpack_constants.py awk '/^#define\s+XNN_/ { print $2,"=",$3} ' "$1"/include/xnnpack.h >> 
xnnpack_constants.py if ! grep -qc "^XNN_" xnnpack_constants.py; then false; fi diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index b802d73c16b..fab95618807 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -131,6 +131,22 @@ def get_param_tensor( raise RuntimeError(f"unsupported param type, {node.op}.") +def get_tensor_name(exp_prog: ExportedProgram, node: torch.fx.Node) -> str: + if node is None: + return "" + if is_param(exp_prog, node): + return exp_prog.graph_signature.inputs_to_parameters[node.name] + elif is_buffer(exp_prog, node): + return exp_prog.graph_signature.inputs_to_buffers[node.name] + elif is_lifted_tensor_constant(exp_prog, node): + return exp_prog.graph_signature.inputs_to_lifted_tensor_constants[node.name] + else: + assert isinstance(node.target, str) + return node.target + + return "" + + def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]: """ Returns the source fn of the given node, return None if something goes wrong diff --git a/backends/xnnpack/utils/xnnpack_constants.py b/backends/xnnpack/utils/xnnpack_constants.py index 351cc8ad897..364819a2435 100644 --- a/backends/xnnpack/utils/xnnpack_constants.py +++ b/backends/xnnpack/utils/xnnpack_constants.py @@ -6,8 +6,11 @@ # Auto-generated by gen_xnnpack_constants.sh script. 
Do not modify UINT32_MAX = 4294967295 +UINT64_MAX = 18446744073709551615 +XNN_EXTRA_BYTES = 128 XNN_EXTRA_BYTES = 16 XNN_MAX_TENSOR_DIMS = 6 +XNN_INVALID_VALUE_ID = UINT32_MAX XNN_FLAG_HINT_SPARSE_INFERENCE = 0x00000001 XNN_FLAG_HINT_FP16_INFERENCE = 0x00000002 XNN_FLAG_FORCE_FP16_INFERENCE = 0x00000004 @@ -26,7 +29,8 @@ XNN_FLAG_YIELD_WORKERS = 0x00000010 XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER = 0x00000020 XNN_FLAG_KEEP_DIMS = 0x00000040 -XNN_EXTRA_QUANTIZATION_PARAMS = 8 +XNN_EXTRA_QUANTIZATION_PARAMS = 10 +XNN_MIN_BLOCKSIZE = 32 XNN_VALUE_FLAG_EXTERNAL_INPUT = 0x00000001 XNN_VALUE_FLAG_EXTERNAL_OUTPUT = 0x00000002 XNN_VALUE_FLAG_PERSISTENT = 0x00000004 diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 4548de4940a..84cdfd69a48 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -31,6 +31,7 @@ XNN_VALUE_FLAG_EXTERNAL_INPUT, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, ) +from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir.backend.backend_details import ( BackendDetails, @@ -103,7 +104,7 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: - + named_data_store = NamedDataStore() xnnpack_edge_compile_config = get_xnnpack_edge_compile_config() # Need to wrap EP here because xnnpack does addmm to linear @@ -162,7 +163,7 @@ def preprocess( ) constant_data_bytes = bytearray() - node_visitors = get_node_visitors(ep, node_to_external_map, constant_data_bytes) + node_visitors = get_node_visitors(ep, node_to_external_map, named_data_store) for node in graph_module.graph.nodes: if node.op == "call_function": @@ -191,4 +192,5 @@ def preprocess( xnnpack_graph, constant_data_bytes ), debug_handle_map={}, + data_store_output=named_data_store.get_named_data_store_output(), ) From d08d93894afada186d3727b24f54a32b28634914 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 22:31:14 -0400 
Subject: [PATCH 4/5] [XNNPACK][Weights Cache] Initial Weights Cache Design with NamedDataMap (#9296) This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: https://github.com/pytorch/executorch/pull/9154 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/10/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/10/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/9/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/10/orig @diff-train-skip-merge --------- Co-authored-by: Max Ren --- backends/xnnpack/runtime/XNNWeightsCache.cpp | 237 +++++++++++++++ backends/xnnpack/runtime/XNNWeightsCache.h | 164 ++++++++++ .../test/runtime/test_xnn_weights_cache.cpp | 286 ++++++++++++++++++ backends/xnnpack/test/targets.bzl | 13 + extension/testing_util/targets.bzl | 1 + schema/targets.bzl | 4 + 6 files changed, 705 insertions(+) create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.cpp create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.h create mode 100644 backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp new file mode 100644 index 00000000000..f2842851d3a --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; + +XNNWeightsCache::XNNWeightsCache() { + weights_cache_.context = this; + weights_cache_.look_up = (size_t(*)( + void*, const xnn_weights_cache_look_up_key*))XNNWeightsCache::look_up; + weights_cache_.reserve_space = + (void* (*)(void*, size_t))XNNWeightsCache::reserve_space; + weights_cache_.look_up_or_insert = + (size_t(*)(void*, const xnn_weights_cache_look_up_key*, void*, size_t)) + XNNWeightsCache::look_up_or_insert; + weights_cache_.is_finalized = (bool (*)(void*))XNNWeightsCache::is_finalized; + weights_cache_.offset_to_addr = + (void* (*)(void*, size_t))XNNWeightsCache::offset_to_addr; + weights_cache_.delete_cache = + (enum xnn_status(*)(void*))XNNWeightsCache::delete_cache; +} + +Error XNNWeightsCache::initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map) { + runtime_allocator_ = runtime_allocator; + named_data_map_ = named_data_map; + is_finalized_ = false; + + return Error::Ok; +} + +Result> XNNWeightsCache::finalize_for_runtime() { + is_finalized_ = true; + + // All data has been packed by create_runtime + // so we clear the unpacked data as it is no longer needed + for (FreeableBuffer& buffer : unpacked_data_) { + buffer.Free(); + } + unpacked_data_.clear(); + unpacked_data_to_name_.clear(); + + std::vector packed_data_names; + // update the reference count of all the packed data + // used by this runtime + for (auto& entry : name_to_packed_data_metadata_) { + if (entry.second.in_current_runtime) { + entry.second.ref_count++; + entry.second.in_current_runtime = false; + packed_data_names.push_back(entry.first); + } + } + + return packed_data_names; +} + +Result XNNWeightsCache::load_unpacked_data( + const std::string& name) { + Result named_data = 
named_data_map_->get_data(name.c_str()); + if (!named_data.ok()) { + ET_LOG(Error, "Failed to load constant data for key %s", name.c_str()); + return Error::InvalidExternalData; + } + const uint8_t* data_pointer = + static_cast(named_data.get().data()); + unpacked_data_.push_back(std::move(named_data.get())); + unpacked_data_to_name_[data_pointer] = name; + + return data_pointer; +} + +Error XNNWeightsCache::delete_packed_data( + const std::vector& packed_data_names) { + if (!is_finalized_) { + ET_LOG( + Error, + "Error, attempted to delete packed data from the cache but the cache is not finalized"); + return Error::InvalidArgument; + } + for (const std::string& name : packed_data_names) { + auto entry = name_to_packed_data_metadata_.find(name); + if (entry == name_to_packed_data_metadata_.end()) { + ET_LOG( + Error, + "Error, attempted to deleted packed data: %s, from the cache but it wasn't found", + name.c_str()); + return Error::InvalidArgument; + } else { + entry->second.ref_count--; + if (entry->second.ref_count == 0) { + void* packed_data_ptr = packed_data_ptrs_[entry->second.offset]; + // Erase the key/value from the map frees the pointer holding the packed + // data + packed_pointer_to_container_.erase(packed_data_ptr); + // remove the pointer from the packed_data_ptrs_ + packed_data_ptrs_[entry->second.offset] = nullptr; + // Erase the name to packed metadata entry + name_to_packed_data_metadata_.erase(entry->first); + } + } + } + + return Error::Ok; +} + +size_t XNNWeightsCache::look_up( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key) { + const void* unpacked_weights_ptr = cache_key->kernel; + const void* unpacked_bias_ptr = cache_key->bias; + auto entry = context->unpacked_data_to_name_.find(unpacked_weights_ptr); + + // Check if weight_pointer has been cached + if (entry == context->unpacked_data_to_name_.end()) { + return SIZE_MAX; + } + + std::string weight_bias_name = entry->second; + + // Check if bias_pointer has been 
cached + if (unpacked_bias_ptr != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(unpacked_bias_ptr); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + + // check if weight_bias_name has been packed already + auto packed_weight_entry = + context->name_to_packed_data_metadata_.find(weight_bias_name); + if (packed_weight_entry == context->name_to_packed_data_metadata_.end()) { + return SIZE_MAX; + } + packed_weight_entry->second.in_current_runtime = true; + + return packed_weight_entry->second.offset; +} + +void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) { + // MemoryAllocator* allocator = context->runtime_allocator_; + // void* reserved_pointer = allocator->allocate(n, + // context->kPackedAllocationAlignment); + + // return reserved_pointer; + std::string data_container; + data_container.resize(n + context->kPackedAllocationAlignment); + void* maybe_aligned_space = data_container.data(); + void* aligned_space = (void*)((intptr_t)maybe_aligned_space + 64 - + (intptr_t)maybe_aligned_space % 64); + + context->packed_pointer_to_container_[aligned_space] = + std::move(data_container); + return aligned_space; +} + +size_t XNNWeightsCache::look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size) { + size_t offset = context->look_up(context, cache_key); + + if (offset != SIZE_MAX) { + void* saved_ptr = context->offset_to_addr(context, offset); + if (0 == memcmp(ptr, saved_ptr, size)) { + return offset; + } + // Failure, cache is out of date + return SIZE_MAX; + } + + // Add to Cache if it is not finalized + size_t next_offset = context->packed_data_ptrs_.size(); + auto entry = context->unpacked_data_to_name_.find(cache_key->kernel); + + // Check if weight_pointer has been cached + if (entry != context->unpacked_data_to_name_.end()) { + std::string weight_bias_name = entry->second; + if 
(cache_key->bias != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(cache_key->bias); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + PackedDataMeta packed_data_metadata = { + .offset = next_offset, + .ref_count = + 0, // ref_count is only incremented after finalizing for runtime + .in_current_runtime = true}; + context->name_to_packed_data_metadata_[weight_bias_name] = + packed_data_metadata; + } else { + ET_LOG( + Info, + "Warning: Unpacked weight and bias were not registered with names, " + "this will add new cache entries for packed data and may affect performance."); + } + context->packed_data_ptrs_.push_back(ptr); + + return next_offset; +} + +bool XNNWeightsCache::is_finalized(XNNWeightsCache* context) { + return context->is_finalized_; +} + +void* XNNWeightsCache::offset_to_addr(XNNWeightsCache* context, size_t offset) { + return context->packed_data_ptrs_[offset]; +} + +enum xnn_status XNNWeightsCache::delete_cache(XNNWeightsCache* context) { + return xnn_status_success; +} + +} // namespace delegate +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h new file mode 100644 index 00000000000..bc00ac15fd0 --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; + +struct PackedDataMeta { + size_t offset; + // Count number of xnn_runtime_t this packed data is used in + size_t ref_count; + // true if this packed data was inserted or looked up for the + // current runtime being created + bool in_current_runtime; +}; + +class XNNWeightsCache { + public: + XNNWeightsCache(); + + /** + * Initializes the XNNWeightsCache for the next xnn_create_runtime + */ + Error initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map); + + /** + * Finalizes the weights cache after the weights have been packed + * in xnn_create_runtime. + * + * This should only be called after creating the runtime. 
Returns + * the name of all the packed weights used by this runtime + */ + Result> finalize_for_runtime(); + + // Taken from XNN_ALLOCATION_ALIGNMENT in xnnpack/common.h + static const size_t kPackedAllocationAlignment = 64; + + /** + * Returns XNNPACK's underlying weights_cache pointer + */ + inline xnn_weights_cache_t get() { + return (xnn_weights_cache_t)&weights_cache_; + } + + /** + * Returns the number of unpacked data + */ + inline size_t get_num_unpacked_data() { + return unpacked_data_.size(); + }; + + /** + * Returns the names of all unpacked data + */ + inline std::vector get_unpacked_data_names() { + std::vector names; + for (const auto& pair : unpacked_data_to_name_) { + names.push_back(pair.second); + } + return names; + }; + + /** + * Returns the packed data names + */ + inline std::vector get_packed_data_names() { + std::vector names; + for (const auto& pair : name_to_packed_data_metadata_) { + names.push_back(pair.first); + } + return names; + }; + + /** + * Loads unpacked named data from the NamedDataMap into this XNNWeightsCache + * and returns a pointer to the unpacked data. This unpacked data is given + * to XNNPACK's define_tensor APIs, and used as the cache key for + * look_up_or_insert. + * @param[in] name The name of the data to load + * @param[out] out the pointer to the unpacked data that was loaded + */ + Result load_unpacked_data(const std::string& name); + + /** + * Deletes the packed data associated with the names given. 
+ * Decrements the ref_count if the packed data is used by other + * models + * + */ + Error delete_packed_data(const std::vector& packed_names); + + private: + // Runtime Allocator used to reserve memory for packed weights + MemoryAllocator* runtime_allocator_; + + // Named Data Map used to load named data + const NamedDataMap* named_data_map_; + + // Map of unpacked pointers to the data name + std::unordered_map unpacked_data_to_name_; + // Map of data names to offset into the packed data + std::unordered_map name_to_packed_data_metadata_; + // Vector holding list of pointers to the packed data + std::vector packed_data_ptrs_; + // vector holding list of strings which are containers for packed_data_ptrs + std::unordered_map packed_pointer_to_container_; + // Vector hodling list of unpacked freeable buffers + std::vector unpacked_data_; + // xnnpack's weight cache provider + xnn_weights_cache_provider weights_cache_; + // whether or not the weight cache is finalized + bool is_finalized_; + + // Function pointers to override XNNPACK's default xnn_weights_cache_provider + // functions. 
+ static size_t look_up( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key); + + static void* reserve_space(XNNWeightsCache* context, size_t n); + + static size_t look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size); + + static bool is_finalized(XNNWeightsCache* context); + + static void* offset_to_addr(XNNWeightsCache* context, size_t offset); + + static enum xnn_status delete_cache(XNNWeightsCache* context); +}; + +} // namespace delegate +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp new file mode 100644 index 00000000000..ca149a67b5e --- /dev/null +++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using executorch::backends::xnnpack::delegate::XNNWeightsCache; +using executorch::extension::FileDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; +using executorch::runtime::internal::PteDataMap; + +class XNNWeightsCacheTest : public ::testing::Test { + protected: + void SetUp() override { + // Creating a NamedDataMap from scratch is a little bit convoluted, so + // we copied a lot of setup from test_pte_data_map.cpp + + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. 
+ executorch::runtime::runtime_init(); + + // Create a sample Program with only named_data and segments. Technically + // not a valid Program; only used to test the PteDataMap. + // Create named data. + std::array, 2> + named_data_arr = { + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "weight", /*segment_index=*/0), + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "bias", /*segment_index=*/1), + }; + const auto named_data = + builder_.CreateVector(named_data_arr.data(), named_data_arr.size()); + + // Create segments. + std::array, 2> + segment_arr = {// @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, /*offset=*/0, /*size=*/kSegmentSizes[0]), + // @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, + /*offset=*/kSegmentAlignment * 2, + /*size=*/kSegmentSizes[1])}; + const auto segments = + builder_.CreateVector(segment_arr.data(), segment_arr.size()); + + // Create Program. + const auto program = executorch_flatbuffer::CreateProgram( + builder_, 0, 0, 0, 0, segments, 0, 0, named_data); + + builder_.Finish(program); + program_ = executorch_flatbuffer::GetProgram(builder_.GetBufferPointer()); + + // Create sample segment data. + for (int i = 0; i < kSegmentSizes[0]; i++) { + sample_data_[i] = 1; + } + for (int i = kSegmentOffsets[1]; i < kSegmentOffsets[1] + kSegmentSizes[1]; + i++) { + sample_data_[i] = 2; + } + TempFile tf(sample_data_.data(), sizeof(sample_data_)); + + // Wrap the sample data in a loader. 
+ Result loader = + FileDataLoader::from(tf.path().c_str(), kSegmentAlignment); + ASSERT_EQ(loader.error(), Error::Ok); + data_map_loader_ = + std::make_unique(std::move(loader.get())); + + Result data_map = PteDataMap::create( + data_map_loader_.get(), + 0, + program_->named_data(), + program_->segments()); + ASSERT_EQ(data_map.error(), Error::Ok); + data_map_ = std::make_unique(std::move(data_map.get())); + + memory_allocator_ = std::make_unique( + memory_allocator_data_.size(), memory_allocator_data_.data()); + + xnn_status status = xnn_initialize(nullptr); + ASSERT_EQ(status, xnn_status_success); + } + + void BuildAndRunGraphWithWeightsCache( + XNNWeightsCache& weight_cache, + const std::vector& batches, + size_t input_channels, + size_t output_channels, + float* input_data, + float* output_data) { + // Defining subgraph + xnn_subgraph_t subgraph_ptr = nullptr; + xnn_status status = xnn_create_subgraph( + /*external_value_ids=*/2, + /*flags=*/0, + &subgraph_ptr); + ASSERT_EQ(status, xnn_status_success); + std::unique_ptr subgraph( + subgraph_ptr, &xnn_delete_subgraph); + + // Define tensors + // Define input + uint32_t input_id; + std::vector input_dims(batches); + input_dims.push_back(input_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + input_dims.size(), + input_dims.data(), + nullptr, + 0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input_id); + + // Define weight + uint32_t weight_id; + Result weight_pointer = + weight_cache.load_unpacked_data("weight"); + ASSERT_TRUE(weight_pointer.ok()); + ASSERT_TRUE(weight_pointer.get() != nullptr); + std::vector weight_dims{output_channels, input_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + weight_dims.size(), + weight_dims.data(), + weight_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &weight_id); + ASSERT_EQ(status, xnn_status_success); + + // Define bias + uint32_t bias_id; + Result bias_pointer = + weight_cache.load_unpacked_data("bias"); + 
ASSERT_TRUE(bias_pointer.ok()); + std::vector bias_dims{output_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + bias_dims.size(), + bias_dims.data(), + bias_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &bias_id); + + // Define output tensor + uint32_t output_id; + std::vector output_dims(batches); + output_dims.push_back(output_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + output_dims.size(), + output_dims.data(), + nullptr, + 1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id); + + // create xecond fully connected + status = xnn_define_fully_connected( + subgraph_ptr, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + input_id, + weight_id, + bias_id, + output_id, + 0); + // Create and Pack Weights + xnn_runtime_t runtime_ptr = nullptr; + status = xnn_create_runtime_v3( + subgraph_ptr, weight_cache.get(), nullptr, 0, &runtime_ptr); + Result> packed_weights_added = + weight_cache.finalize_for_runtime(); + ASSERT_TRUE(packed_weights_added.ok()); + ASSERT_EQ(packed_weights_added.get().size(), 1); + ASSERT_EQ(packed_weights_added.get()[0], "weightbias"); + + auto runtime = std::unique_ptr( + runtime_ptr, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{0, input_data}, + xnn_external_value{1, output_data}, + }; + + status = xnn_reshape_runtime(runtime.get()); + status = + xnn_setup_runtime_v2(runtime.get(), external.size(), external.data()); + + ASSERT_EQ(status, xnn_status_success); + status = xnn_invoke_runtime(runtime.get()); + ASSERT_EQ(status, xnn_status_success); + } + + // Program builder constants. + static constexpr int kSegmentAlignment = 16; + static constexpr std::array kSegmentSizes{384, 128}; + static constexpr std::array kSegmentOffsets{0, kSegmentAlignment * 2}; + std::array sample_data_; + + // Program builder. 
+ flatbuffers::FlatBufferBuilder builder_; + const executorch_flatbuffer::Program* program_; + + // Data loader for the sample data. + std::unique_ptr data_map_loader_; + + // PteDataMap + std::unique_ptr data_map_; + + // MemoryAllocator + std::array memory_allocator_data_; + std::unique_ptr memory_allocator_; +}; + +TEST_F(XNNWeightsCacheTest, ReusePackedWeights) { + XNNWeightsCache weight_cache; + size_t padding = 32; + + std::vector batches{1, 2, 3}; + size_t num_batches = 1; + for (size_t batch_dim : batches) { + num_batches *= batch_dim; + } + size_t input_channels = 3; + size_t output_channels = 4; + std::vector input_tensor(num_batches * input_channels + padding, 1.0f); + std::vector output_tensor(num_batches * output_channels, 0.0f); + float* input_data = input_tensor.data(); + float* output_data = output_tensor.data(); + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + ASSERT_EQ(weight_cache.get_num_unpacked_data(), 0); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + std::vector packed_data_names = + weight_cache.get_packed_data_names(); + // Packed Data Still exists because it has a ref count of 2 + ASSERT_EQ(packed_data_names.size(), 1); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + packed_data_names = weight_cache.get_packed_data_names(); + ASSERT_EQ(packed_data_names.size(), 0); +} diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index 30ce970a842..58589b70607 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -30,3 +30,16 @@ def define_common_targets(): 
"//executorch/backends/xnnpack:xnnpack_backend", ], ) + + runtime.cxx_test( + name = "test_xnn_weights_cache", + srcs = ["runtime/test_xnn_weights_cache.cpp"], + deps = [ + third_party_dep("XNNPACK"), + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/runtime/executor:pte_data_map", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/testing_util:temp_file", + "//executorch/schema:program", + ], + ) diff --git a/extension/testing_util/targets.bzl b/extension/testing_util/targets.bzl index 2b12480dfff..95b1f94d182 100644 --- a/extension/testing_util/targets.bzl +++ b/extension/testing_util/targets.bzl @@ -17,5 +17,6 @@ def define_common_targets(): "//executorch/extension/fb/ptez/decompression_methods/test/...", "//executorch/extension/fb/ptez/test/...", "//executorch/runtime/executor/test/...", + "//executorch/backends/xnnpack/test/...", ], ) diff --git a/schema/targets.bzl b/schema/targets.bzl index 40c6d8d5c8d..c0036c7500a 100644 --- a/schema/targets.bzl +++ b/schema/targets.bzl @@ -78,6 +78,10 @@ def define_common_targets(): # //executorch/runtime/executor/... "//executorch/codegen/tools/...", "//executorch/runtime/executor/...", + # Tests have a set up which uses raw flatbuffer. + # TODO will refactor these setup steps into + # testing utils in runtime/executor/... path + "//executorch/backends/xnnpack/test/...", ], exported_headers = { OUTPUT_PROGRAM_HEADER: ":{}[{}]".format(PROGRAM_GEN_RULE_NAME, OUTPUT_PROGRAM_HEADER), From 2a903f9609c2ee811604a1aff3c7995ed1a35c10 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 22:31:40 -0400 Subject: [PATCH 5/5] [XNNPACK][Weights Cache] Enable in XNNPACK (#9297) This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/9155 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/11/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/10/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/orig @diff-train-skip-merge --------- Co-authored-by: Max Ren --- backends/xnnpack/CMakeLists.txt | 13 ++++ backends/xnnpack/runtime/XNNCompiler.cpp | 72 +++++++++++++++---- backends/xnnpack/runtime/XNNCompiler.h | 10 ++- backends/xnnpack/runtime/XNNExecutor.cpp | 4 +- backends/xnnpack/runtime/XNNExecutor.h | 8 ++- backends/xnnpack/runtime/XNNPACKBackend.cpp | 42 +++++++++-- backends/xnnpack/targets.bzl | 10 ++- .../xnnpack/test/runtime/test_xnnexecutor.cpp | 3 +- 8 files changed, 131 insertions(+), 31 deletions(-) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 8b3bf3d91c1..ed0128f93f1 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE # Keeping this OFF by default due to regressions in decode and model load with # kleidi kernels option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF) + +# Turning this on cache weights between partitions and methods. If weights +# are shared across methods/partitions then this can reduce load time and +# memory usage + +# Keeping this off maintains existing behavior. 
Turning this on serializes +# execution and initialization of delegates, to be revisited +option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE + "Enable weights cache to cache and manage all packed weights" OFF) + +if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE) + add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE) +endif() if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 6a93ab73a2e..c0204831c07 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wglobal-constructors" @@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, const NamedDataMap* named_data_map, - std::vector& loaded_buffers_from_map) { + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { @@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr( return constant_data_ptr + offset; } else { const std::string& data_name = constant_data_offset->named_key()->str(); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + Result data_ptr = + weights_cache->load_unpacked_data(data_name); + if (!data_ptr.ok()) { + ET_LOG(Error, "Failed to load weights from cache"); + return nullptr; + } + return data_ptr.get(); +#else Result buffer = named_data_map->get_data(data_name.c_str()); if (!buffer.ok()) { @@ -198,8 +210,9 @@ const uint8_t* getConstantDataPtr( } const uint8_t* data_ptr = static_cast(buffer.get().data()); - loaded_buffers_from_map.push_back(std::move(buffer.get())); + freeable_buffers.push_back(std::move(buffer.get())); return data_ptr; +#endif } } } @@ -222,7 +235,8 @@ Error defineTensor( std::vector& 
output_ids, CompileAllocator& allocator, const NamedDataMap* named_data_map, - std::vector& loaded_buffers_from_map) { + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -264,7 +278,8 @@ Error defineTensor( flatbuffer_graph, constant_data_ptr, named_data_map, - loaded_buffers_from_map); + freeable_buffers, + weights_cache); xnn_status status; // The type we might have to convert to @@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator, - const NamedDataMap* named_data_map, - xnn_workspace_t workspace) { + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map) { Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; @@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel( // Invalid ids do not need to be remapped remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID); + // If weight cache is not on we hold onto all the unpacked buffers + // and we free them at the end + std::vector unpacked_buffers; + // External Ids for inputs and outputs std::vector input_ids; std::vector output_ids; Error err = Error::Ok; - std::vector loaded_buffers_from_map; for (auto value : *flatbuffer_graph->xvalues()) { err = defineTensor( subgraph.get(), @@ -2081,7 +2099,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( output_ids, compile_allocator, named_data_map, - loaded_buffers_from_map); + unpacked_buffers, + weights_cache); if (err != Error::Ok) { return err; @@ -2103,12 +2122,26 @@ ET_NODISCARD Error XNNCompiler::compileModel( xnn_runtime_t runtime_ptr = nullptr; + // XNNWeightsCache if weights cache is not enabled, then XNNWeightsCache + // just manages the unpacked 
weights until the runtime is created. +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + ET_CHECK_OR_RETURN_ERROR( + unpacked_buffers.size() == 0, + Internal, + "Weight Cache is enabled, which means unpacked buffers should be owned by the cache"); + xnn_weights_cache_t weights_cache_ptr = + weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get() + : nullptr; +#else + xnn_weights_cache_t weights_cache_ptr = nullptr; +#endif + #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE ET_CHECK_OR_RETURN_ERROR( workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); status = xnn_create_runtime_v4( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, workspace, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, @@ -2116,7 +2149,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( #else status = xnn_create_runtime_v3( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); @@ -2128,10 +2161,25 @@ ET_NODISCARD Error XNNCompiler::compileModel( "XNN Runtime creation failed with code: %s", xnn_status_to_string(status)); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + auto packed_weights_names = weights_cache->finalize_for_runtime(); + ET_CHECK_OR_RETURN_ERROR( + packed_weights_names.ok(), + Internal, + "Failed to finalize weights cache after creating the xnn runtime") +#else + for (auto& buffer : unpacked_buffers) { + buffer.Free(); + } + Result> packed_weights_names = + std::vector(); +#endif + err = executor->initialize( // NOLINT: runtime_ptr is non-null runtime_ptr, std::move(input_ids), - std::move(output_ids)); + std::move(output_ids), + std::move(packed_weights_names.get())); return err; }; diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index 3ea621a4d59..bcc87351d7d 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ 
b/backends/xnnpack/runtime/XNNCompiler.h @@ -9,11 +9,9 @@ #pragma once #include +#include #include - #include -#include -#include namespace executorch { namespace backends { @@ -29,9 +27,9 @@ class XNNCompiler { const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - executorch::runtime::MemoryAllocator* runtime_allocator, - const executorch::runtime::NamedDataMap* named_data_map, - xnn_workspace_t workspace); + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map); }; } // namespace delegate diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 1ba549bb8d7..ae7c0d66ecb 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit; ET_NODISCARD Error XNNExecutor::initialize( xnn_runtime_t runtime, std::vector&& input_ids, - std::vector&& output_ids) { + std::vector&& output_ids, + std::vector&& packed_data_names) { runtime_ = std::unique_ptr( runtime, xnn_delete_runtime); @@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize( std::sort(output_ids_.begin(), output_ids_.end()); externals_.resize(input_ids_.size() + output_ids_.size()); + packed_data_names_ = std::move(packed_data_names); return Error::Ok; } diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index 68ee18609e3..b98c902f44f 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -34,6 +34,7 @@ class XNNExecutor { std::vector input_ids_; std::vector output_ids_; std::vector externals_; + std::vector packed_data_names_; public: XNNExecutor() = default; @@ -46,6 +47,10 @@ class XNNExecutor { return output_ids_.size(); } + inline std::vector get_packed_data_names() { + return packed_data_names_; + } + /** * Initialize the XNNExecutor with a given runtime and input/output ids. 
* The input/output ids are expected to be sorted in order of their @@ -54,7 +59,8 @@ class XNNExecutor { ET_NODISCARD executorch::runtime::Error initialize( xnn_runtime_t runtime, std::vector&& input_ids, - std::vector&& output_ids); + std::vector&& output_ids, + std::vector&& packed_data_names); /** * Prepares the arguments for runtime graph execution. diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index a01ba2da704..1e2f07bd905 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -20,6 +21,7 @@ namespace executorch { namespace backends { +using executorch::backends::xnnpack::delegate::XNNWeightsCache; using executorch::runtime::ArrayRef; using executorch::runtime::Backend; using executorch::runtime::BackendExecutionContext; @@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { } const NamedDataMap* named_data_map = context.get_named_data_map(); - -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - // This is needed to serialize access to xnn_create_runtime which is not // thread safe. This can heppen when multiple threads call init() on // the same backend instance. +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE const std::lock_guard lock(workspace_mutex_); #endif + +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weight_cache(weights_cache_mutex_); + weights_cache_->initialize_for_runtime( + context.get_runtime_allocator(), named_data_map); +#endif + // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. 
NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the @@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { processed->data(), processed->size(), executor, - context.get_runtime_allocator(), - named_data_map, - workspace_.get()); + weights_cache_.get(), + workspace_.get(), + named_data_map); // This backend does not need its processed data after compiling the model. processed->Free(); @@ -125,6 +132,10 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { const std::lock_guard lock(workspace_mutex_); #endif +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache(weights_cache_mutex_); +#endif + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -145,16 +156,24 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE // This is needed to serialize access to xnn_delete_runtime which is not // thread safe. This can heppen when multiple threads call destroy() on // the same backend instance. +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE const std::lock_guard lock(workspace_mutex_); #endif + auto executor = static_cast(handle); + #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); #endif + +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache( + weights_cache_mutex_); + weights_cache_->delete_packed_data(executor->get_packed_data_names()); +#endif // XNNExecutor is not trivially destructible. Since this was constructed // manually in init(), we must destroy it manually here. 
executor->~XNNExecutor(); @@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { std::unique_ptr workspace_{ nullptr, &xnn_release_workspace}; + + // Weights cache is global to all delegate instances. + mutable std::mutex weights_cache_mutex_; + std::unique_ptr weights_cache_ = + std::make_unique(); + + // Lock Hierarchy for Mutexes: + // workspace_mutex_ + // weights_cache_mutex_ }; namespace { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 3fd9c433372..e97f1941ff7 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -6,11 +6,15 @@ def _get_preprocessor_flags(): Disable if someone explictly specified a config option, else Enable otherwise """ - if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0": - return [] + preprocessor_flags = [] + if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE") + + if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE") # Enable if not disabled through config - return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"] + return preprocessor_flags def define_common_targets(): runtime.cxx_library( diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index a5a26004b49..42d925c1253 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { }, { 1, - }), + }, + {}), Error::Ok); TensorFactory tf; auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});