From aef99e39fda769b919c94c7d2e309b8fa36aa831 Mon Sep 17 00:00:00 2001 From: Max Ren Date: Thu, 13 Mar 2025 23:59:56 -0700 Subject: [PATCH 1/5] [ExecuTorch][Weight Sharing] Track Named Data Store in EdgeProgramManager Pull Request resolved: https://github.com/pytorch/executorch/pull/9151 We enable Backends to return Named Data by adding NamedDataStoreOutput to the preprocess result. This is a completely BC change, as no backends with an implemented preprocess will see any change if nothing is explicitly implemented. For backend developers to leverage the new NamedDataStore, they can initialize a new NamedDataStore() within preprocess, add_named_data to the data store, and return the NamedDataStore.get_named_data_store_output() in the preprocess result like such: ``` def preprocess(ExportedProgram, List[CompileSpecs]) -> PreprocessResult: named_data_store = NamedDataStore() for node in exported_program.graph.nodes: named_data_store.add_named_data("name", bytes) return PreprocessResult( processed_bytes=bytes, debug_handle_map={}, data_store_output= named_data_store.get_named_data_store_output() ) ``` Under the hood, the data store output is embedded in the loweredbackendmodule, (serializing loweredbackendmodule by itself with the a named_data_store_output is still a todo). But via the EdgeProgramManager path, we add the named_data_store_outputs to the edge_program_manger's named data store to keep track of all the named data returned by backends. 
ghstack-source-id: 271732049 @exported-using-ghexport Differential Revision: [D70451660](https://our.internmc.facebook.com/intern/diff/D70451660/) --- exir/backend/backend_api.py | 1 + exir/backend/backend_details.py | 7 ++ exir/backend/test/TARGETS | 56 +++++++++ .../test/backend_with_named_data_map.py | 115 ++++++++++++++++++ .../test/test_backend_with_named_data_map.py | 83 +++++++++++++ exir/lowered_backend_module.py | 14 +++ exir/program/_program.py | 27 +++- 7 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 exir/backend/test/backend_with_named_data_map.py create mode 100644 exir/backend/test/test_backend_with_named_data_map.py diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 519f184871a..d5bd574ec5a 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -120,6 +120,7 @@ def to_backend( backend_id=backend_id, processed_bytes=preprocess_result.processed_bytes, compile_specs=compile_specs, + named_data_store_output=preprocess_result.data_store_output, ) lowered_module.meta = { "debug_handle_map": preprocess_result.debug_handle_map diff --git a/exir/backend/backend_details.py b/exir/backend/backend_details.py index bdbc1a1fafd..248d03f2b05 100644 --- a/exir/backend/backend_details.py +++ b/exir/backend/backend_details.py @@ -9,6 +9,8 @@ from typing import Dict, List, Optional, Tuple, Union +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput + from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -24,6 +26,11 @@ class PreprocessResult: debug_handle_map: Optional[Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]]] = ( None ) + # Data Store output created from NamedDataStore. + + # Named Data store contains all the named data that is stored in the PTE file, + # but retrieveable by delegates via the NamedDataMap at runtime. 
+ data_store_output: Optional[NamedDataStoreOutput] = None """ diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index b453f4c722a..f0ba618936d 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -38,6 +38,62 @@ python_library( ], ) +python_library( + name = "backend_with_named_data_map", + srcs = [ + "backend_with_named_data_map.py", + ], + visibility = [ + "//executorch/...", + "//executorch/test/...", + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + ], +) + +python_unittest( + name = "test_backend_with_named_data_map", + srcs = [ + "test_backend_with_named_data_map.py", + ], + visibility = [ + "//executorch/...", + "//executorch/test/...", + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ":backend_with_named_data_map", + ], +) + python_library( name = "qnn_backend_demo", srcs = [ diff --git a/exir/backend/test/backend_with_named_data_map.py 
b/exir/backend/test/backend_with_named_data_map.py new file mode 100644 index 00000000000..47dbc294133 --- /dev/null +++ b/exir/backend/test/backend_with_named_data_map.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, final, List, Tuple + +import torch +from executorch.exir._serialize._named_data_store import NamedDataStore + +from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, +) + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_control_flow_submodules +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.operator_support import OperatorSupportBase + + +# Backend details are final (cannot be subclassed). 
+@final +class BackendWithNamedDataMap(BackendDetails): + """ + Test Backend for Named Data Map Functionality + + This backend returns no processed_bytes, instead it uses + the named data store and serializes the name of the op + as the key and the data as its code value + """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + op_codes = { + exir_ops.edge.aten.sin.default: 0, + exir_ops.edge.aten.add.Tensor: 1, + exir_ops.edge.aten.sub.Tensor: 2, + exir_ops.edge.aten.mul.Tensor: 3, + exir_ops.edge.aten.div.Tensor: 4, + } + ndm = NamedDataStore() + for node in edge_program.graph.nodes: + if node.op == "call_function": + if node.target in op_codes.keys(): + ndm.add_named_data( + node.target.__name__, bytes(op_codes[node.target]) + ) + + return PreprocessResult( + processed_bytes=bytes(b""), + debug_handle_map={}, + data_store_output=ndm.get_named_data_store_output(), + ) + + +class SimpleOperatorSupport(OperatorSupportBase): + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + exir_ops.edge.aten.sin.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.div.Tensor, + ] + + +@final +class BackendWithNDMPartitioner(Partitioner): + def __init__(self) -> None: + self._op_support = SimpleOperatorSupport() + self.backend_id = BackendWithNamedDataMap.__name__ + + def _partition_gm( + self, graph_module: torch.fx.GraphModule, id_start: int = 0 + ) -> Tuple[int, Dict[str, DelegationSpec]]: + partition_tags: Dict[str, DelegationSpec] = {} + partition_list = generate_pattern_op_partitions( + graph_module, op_support=self._op_support + ) + + num_partitions_in_gm = len(partition_list) + for partition in partition_list: + curr_par_id = partition.id or 0 + delegation_tag = f"tag_{curr_par_id + id_start}" + for node in partition.nodes: + 
node.meta["delegation_tag"] = delegation_tag + delegation_spec = DelegationSpec(self.backend_id, []) + partition_tags[delegation_tag] = delegation_spec + + start_idx_for_submodules = num_partitions_in_gm + for _, submodule, _ in get_control_flow_submodules(graph_module): + start_idx_for_submodules, ret_partition_tags = self._partition_gm( + submodule, start_idx_for_submodules + ) + partition_tags.update(ret_partition_tags) + + return start_idx_for_submodules, partition_tags + + def partition(self, edge_program: ExportedProgram) -> PartitionResult: + _, partition_tags = self._partition_gm(edge_program.graph_module) + return PartitionResult( + tagged_exported_program=edge_program, + partition_tags=partition_tags, + ) diff --git a/exir/backend/test/test_backend_with_named_data_map.py b/exir/backend/test/test_backend_with_named_data_map.py new file mode 100644 index 00000000000..cc7aad641f0 --- /dev/null +++ b/exir/backend/test/test_backend_with_named_data_map.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch + +from executorch.exir import to_edge +from executorch.exir.backend.backend_api import to_backend + +from executorch.exir.backend.test.backend_with_named_data_map import ( + BackendWithNamedDataMap, + BackendWithNDMPartitioner, +) + + +class TestBackendWithNamedDataMap(unittest.TestCase): + def test_lowered_backend_module_has_output(self): + class M(torch.nn.Module): + def forward(self, x): + return x + x + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),))) + lowered = to_backend( + BackendWithNamedDataMap.__name__, ep.exported_program(), [] + ) + + buffer_entries = lowered.named_data_store_output.buffers + self.assertTrue(len(buffer_entries) == 1) + stored_data = lowered.named_data_store_output.pte_data + + self.assertTrue("aten.add.Tensor" in stored_data) + self.assertTrue(buffer_entries[0].buffer == bytes(1)) + + def test_named_data_with_partitioner(self): + class M(torch.nn.Module): + def forward(self, x): + y = x + x + y = torch.cos(y) + y = y + y + y = torch.sin(y) + return y - y + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),))) + ep.to_backend(BackendWithNDMPartitioner()) + + ndm_output = ep._named_data_store.get_named_data_store_output() + buffer_entries = ndm_output.buffers + stored_data = ndm_output.pte_data + self.assertEqual(len(buffer_entries), 3) + self.assertTrue("aten.add.Tensor" in stored_data) + self.assertTrue("aten.sub.Tensor" in stored_data) + self.assertTrue("aten.sin.default" in stored_data) + + def test_named_data_with_control_flow(self): + class M(torch.nn.Module): + def true_branch(self, x): + y = x * x + y = torch.cos(y) + return torch.sin(y) + + def false_branch(self, x): + return torch.sin(x) + + def forward(self, x, y): + z = x / y + z = torch.cond(z.sum() > 0, self.true_branch, self.false_branch, [x]) + return z - z + + ep = to_edge(torch.export.export(M(), (torch.randn(1, 2), torch.randn(1, 2)))) + ep.to_backend(BackendWithNDMPartitioner()) + + ndm_output = 
ep._named_data_store.get_named_data_store_output() + buffer_entries = ndm_output.buffers + stored_data = ndm_output.pte_data + self.assertEqual(len(buffer_entries), 4) + self.assertTrue("aten.sub.Tensor" in stored_data) + self.assertTrue("aten.div.Tensor" in stored_data) + self.assertTrue("aten.sin.default" in stored_data) + self.assertTrue("aten.mul.Tensor" in stored_data) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index dde6a397d9a..ed155555ef5 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -14,6 +14,7 @@ import torch import torch.utils._pytree as pytree from executorch.exir._serialize import _serialize_pte_binary +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.delegate import executorch_call_delegate, get_lowered_module_name from executorch.exir.emit import emit_program @@ -62,6 +63,9 @@ class LoweredBackendModule(torch.nn.Module): CompileSpec ] # A list of backend-specific objects with static metadata to configure the "compilation" process. 
_original_exported_program: ExportedProgram # The original EXIR module + _named_data_store_output: Optional[ + NamedDataStoreOutput + ] # Named Data serialized by the backend def __init__( self, @@ -69,12 +73,14 @@ def __init__( backend_id: str, processed_bytes: bytes, compile_specs: List[CompileSpec], + named_data_store_output: Optional[NamedDataStoreOutput] = None, ) -> None: super().__init__() self._original_exported_program = edge_program self._backend_id = backend_id self._processed_bytes = processed_bytes self._compile_specs = compile_specs + self._named_data_store_output = named_data_store_output # pyre-ignore def __deepcopy__(self, memo: Optional[Dict[int, Any]]) -> "LoweredBackendModule": @@ -134,6 +140,13 @@ def original_module(self) -> ExportedProgram: """ return self._original_exported_program + @property + def named_data_store_output(self) -> Optional[NamedDataStoreOutput]: + """ + Returns the Named Data Store Output + """ + return self._named_data_store_output + # TODO(chenlai): consolidate the seriailization config with serialize_to_flatbuffer api def buffer( self, @@ -154,6 +167,7 @@ def buffer( segment_alignment=segment_alignment, constant_tensor_alignment=constant_tensor_alignment, delegate_alignment=delegate_alignment, + named_data=self.named_data_store_output, ) ) return out diff --git a/exir/program/_program.py b/exir/program/_program.py index ed9dace34d1..c00c003263f 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -26,6 +26,7 @@ from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.partitioner import Partitioner from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig +from executorch.exir.delegate import executorch_call_delegate, is_lowered_module from executorch.exir.emit import emit_program, EmitterOutput from executorch.exir.emit._emitter import _DelegateDebugIdentifierMap from executorch.exir.error import ExportError @@ -1304,6 +1305,7 @@ def __init__( 
constant_methods: Optional[Dict[str, Any]] = None, compile_config: Optional[EdgeCompileConfig] = None, ops_set_to_not_decompose: Optional[List[torch._ops.OpOverload]] = None, + named_data_store: Optional[NamedDataStore] = None, ): """ Should not be called directly by users. User should use :func:'to_edge' instead. @@ -1327,7 +1329,7 @@ def __init__( self._edge_programs: Dict[str, ExportedProgram] = edge_programs self._config_methods = constant_methods - self._named_data_store = NamedDataStore() + self._named_data_store = named_data_store or NamedDataStore() @property def methods(self) -> Set[str]: @@ -1437,9 +1439,30 @@ def to_backend( for name, program in self._edge_programs.items(): new_edge_programs[name] = to_backend(program, partitioner) + # collected all the named data into the named data store for deduplication + def collect_named_data_store_outputs( + graph_module: torch.fx.GraphModule, + ) -> None: + for node in graph_module.graph.nodes: + if node.target == executorch_call_delegate: + lbm = getattr(graph_module, node.args[0].name) + assert is_lowered_module(lbm) + data_store_output = lbm.named_data_store_output + if data_store_output is not None: + self._named_data_store.merge_named_data_store(data_store_output) + + for _, submod, _ in get_control_flow_submodules(graph_module): + collect_named_data_store_outputs(submod) + + for _, program in new_edge_programs.items(): + collect_named_data_store_outputs(program.graph_module) + config = EdgeCompileConfig(_check_ir_validity=False) return EdgeProgramManager( - new_edge_programs, copy.deepcopy(self._config_methods), config + new_edge_programs, + copy.deepcopy(self._config_methods), + config, + named_data_store=self._named_data_store, ) @et_logger("to_executorch") From 6ab001983591f4fcd25809b174bf9f6a917bac4f Mon Sep 17 00:00:00 2001 From: Max Ren Date: Thu, 13 Mar 2025 23:59:57 -0700 Subject: [PATCH 2/5] [ExecuTorch][Weight Sharing][XNNPACK] load named data map data for xnnpack Pull Request resolved: 
https://github.com/pytorch/executorch/pull/9152 If data is serialized into the NamedDataMap, then we overload getConstantDataPtr to retrieve the data from the named data map. This should be done in a Backwards Compatible way. Meaning if no data is serialized into the named data map, then we are still loading the data from the flatbuffer payload. Since the runtime change here is being made before the AoT changes, All CI on this diff by itself should test that the changes made here are backwards compatitble. Note: We do not resolve Runtime Memory usage at this point. WeightCache will be implemented in the next diff. Meaning If we load via the same key across different methods, we still pack twice and allocate two instances for the packed weights. ghstack-source-id: 271732048 @exported-using-ghexport Differential Revision: [D70315209](https://our.internmc.facebook.com/intern/diff/D70315209/) --- backends/xnnpack/runtime/XNNCompiler.cpp | 56 +++++++++++++++---- backends/xnnpack/runtime/XNNCompiler.h | 1 + backends/xnnpack/runtime/XNNPACKBackend.cpp | 7 ++- .../xnnpack/serialization/runtime_schema.fbs | 9 +++ backends/xnnpack/targets.bzl | 1 + 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 8d8e9a13152..6a93ab73a2e 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #pragma clang diagnostic ignored "-Wmissing-prototypes" @@ -22,7 +22,9 @@ namespace xnnpack { namespace delegate { using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; /* @@ -48,6 +50,7 @@ class CompileAllocator { using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; using GraphPtr = const 
fb_xnnpack::XNNGraph*; +using ConstantDataOffsetPtr = const fb_xnnpack::ConstantDataOffset*; using DataType = fb_xnnpack::XNNDatatype; // Type for define node function. This is the function signature @@ -162,7 +165,9 @@ data associated with the tensor value, then returns nullptr. const uint8_t* getConstantDataPtr( const fb_xnnpack::XNNTensorValue* tensor_value, GraphPtr flatbuffer_graph, - const uint8_t* constant_data_ptr) { + const uint8_t* constant_data_ptr, + const NamedDataMap* named_data_map, + std::vector& loaded_buffers_from_map) { auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { @@ -171,10 +176,31 @@ const uint8_t* getConstantDataPtr( const auto& constant_buffer = *flatbuffer_graph->constant_buffer(); return constant_buffer[buffer_idx]->storage()->data(); } else { - const auto& constant_data_offsets = *flatbuffer_graph->constant_data(); - uint64_t constant_data_offset = - constant_data_offsets[buffer_idx]->offset(); - return constant_data_ptr + constant_data_offset; + ConstantDataOffsetPtr constant_data_offset = + flatbuffer_graph->constant_data()->Get(buffer_idx); + uint64_t offset = constant_data_offset->offset(); + + bool has_named_key = flatbuffers::IsFieldPresent( + constant_data_offset, fb_xnnpack::ConstantDataOffset::VT_NAMED_KEY); + // If there is no tensor name + if (!has_named_key) { + return constant_data_ptr + offset; + } else { + const std::string& data_name = constant_data_offset->named_key()->str(); + Result buffer = + named_data_map->get_data(data_name.c_str()); + if (!buffer.ok()) { + ET_LOG( + Error, + "Failed to get constant data for key %s", + data_name.c_str()); + return nullptr; + } + const uint8_t* data_ptr = + static_cast(buffer.get().data()); + loaded_buffers_from_map.push_back(std::move(buffer.get())); + return data_ptr; + } } } @@ -194,7 +220,9 @@ Error defineTensor( const uint8_t* constant_data_ptr, std::vector& input_ids, std::vector& output_ids, - CompileAllocator& allocator) { 
+ CompileAllocator& allocator, + const NamedDataMap* named_data_map, + std::vector& loaded_buffers_from_map) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -231,8 +259,12 @@ Error defineTensor( // Get Pointer to constant data from flatbuffer, if its non-constant // it is a nullptr - const uint8_t* buffer_ptr = - getConstantDataPtr(tensor_value, flatbuffer_graph, constant_data_ptr); + const uint8_t* buffer_ptr = getConstantDataPtr( + tensor_value, + flatbuffer_graph, + constant_data_ptr, + named_data_map, + loaded_buffers_from_map); xnn_status status; // The type we might have to convert to @@ -1968,6 +2000,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( size_t num_bytes, XNNExecutor* executor, MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map, xnn_workspace_t workspace) { Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; @@ -2036,6 +2069,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( std::vector input_ids; std::vector output_ids; Error err = Error::Ok; + std::vector loaded_buffers_from_map; for (auto value : *flatbuffer_graph->xvalues()) { err = defineTensor( subgraph.get(), @@ -2045,7 +2079,9 @@ ET_NODISCARD Error XNNCompiler::compileModel( constant_data, input_ids, output_ids, - compile_allocator); + compile_allocator, + named_data_map, + loaded_buffers_from_map); if (err != Error::Ok) { return err; diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index e66cb791ecb..3ea621a4d59 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ b/backends/xnnpack/runtime/XNNCompiler.h @@ -30,6 +30,7 @@ class XNNCompiler { size_t num_bytes, XNNExecutor* executor, executorch::runtime::MemoryAllocator* runtime_allocator, + const executorch::runtime::NamedDataMap* named_data_map, xnn_workspace_t workspace); }; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp 
b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 1938c5441a5..a01ba2da704 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include @@ -29,6 +29,7 @@ using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; class XnnpackBackend final : public ::executorch::runtime::BackendInterface { @@ -79,13 +80,14 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { return Error::MemoryAllocationFailed; } + const NamedDataMap* named_data_map = context.get_named_data_map(); + #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE // This is needed to serialize access to xnn_create_runtime which is not // thread safe. This can heppen when multiple threads call init() on // the same backend instance. const std::lock_guard lock(workspace_mutex_); #endif - // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the @@ -96,6 +98,7 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { processed->size(), executor, context.get_runtime_allocator(), + named_data_map, workspace_.get()); // This backend does not need its processed data after compiling the model. 
processed->Free(); diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 11cb48430ed..75074107c55 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -320,11 +320,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // unique string id used to query the offset from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. 
+ // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index b89a999bc3d..3fd9c433372 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -60,6 +60,7 @@ def define_common_targets(): "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/runtime/executor:pte_data_map" ], # XnnpackBackend.cpp needs to compile with executor as whole # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) From 5f05cf7da33593f3c91d6d65f604eccaa7464ca9 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 21:16:18 -0400 Subject: [PATCH 3/5] [ExecuTorch][Weight Sharing][XNNPACK] Serialize constant tensors into named data map (#9295) This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/9153 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/9/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/9/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/8/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/9/orig @diff-train-skip-merge Co-authored-by: Max Ren --- backends/xnnpack/_passes/TARGETS | 1 + .../_passes/fuse_batch_norm_with_conv.py | 68 ++++++++++++++----- backends/xnnpack/operators/node_visitor.py | 26 ++++--- backends/xnnpack/serialization/schema.fbs | 9 +++ .../serialization/xnnpack_graph_schema.py | 1 + .../xnnpack/utils/gen_xnnpack_constants.sh | 1 + backends/xnnpack/utils/utils.py | 16 +++++ backends/xnnpack/utils/xnnpack_constants.py | 6 +- backends/xnnpack/xnnpack_preprocess.py | 6 +- 9 files changed, 106 insertions(+), 28 deletions(-) diff --git a/backends/xnnpack/_passes/TARGETS b/backends/xnnpack/_passes/TARGETS index a199e1aab01..972980570ec 100644 --- a/backends/xnnpack/_passes/TARGETS +++ b/backends/xnnpack/_passes/TARGETS @@ -19,5 +19,6 @@ python_library( "//executorch/exir/passes:const_prop_pass", "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", + "//executorch/backends/transforms:utils", ], ) diff --git a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py index b0f4779eb4c..6f31fe698ba 100644 --- a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py +++ b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py @@ -7,13 +7,22 @@ import operator import torch +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass -from 
executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node +from executorch.backends.xnnpack.utils.utils import ( + get_param_tensor, + get_tensor_name, + is_param_node, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult +from torch.export.graph_signature import InputKind from torch.nn.utils.fusion import fuse_conv_bn_weights @@ -28,7 +37,7 @@ class FuseBatchNormWithConvPass(XNNPACKPass): def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph - counter = 0 + constant_placeholders_to_delete = set() for conv in graph.nodes: # We want to discover a chain of conv -> batch_norm. # Only proceed if the current node is a conv node, and has a single @@ -55,9 +64,11 @@ def call(self, graph_module: torch.fx.GraphModule): assert len(conv.args) == 9 conv_weight = get_param_tensor(self.exported_program, conv.args[1]) + conv_weight_name = get_tensor_name(self.exported_program, conv.args[1]) assert conv_weight is not None conv_bias = get_param_tensor(self.exported_program, conv.args[2]) + conv_bias_name = get_tensor_name(self.exported_program, conv.args[2]) # Get the parameters from the batchnorm op assert ( @@ -95,23 +106,43 @@ def call(self, graph_module: torch.fx.GraphModule): bn_bias, is_transpose, ) + fused_weight_name = (conv_weight_name + "_fused_bn").replace(".", "_") + if conv_bias_name == "": + fused_bias_name = (conv_weight_name + "_bias_fused_bn").replace( + ".", "_" + ) + else: + fused_bias_name = (conv_bias_name + "_fused_bn").replace(".", "_") # Modify the graph by updating the weight and bias of conv op # with the fused weight and bias params, and replacing all the users # of getitem(batchnorm) with the conv op. 
- with graph.inserting_before(conv): - fused_weight_name = f"_fused_with_bn_weight_{counter}" - graph_module.register_parameter(fused_weight_name, fused_weight) - fused_weight_node = graph.get_attr(fused_weight_name) - fused_bias_name = f"_fused_with_bn_bias_{counter}" - graph_module.register_parameter(fused_bias_name, fused_bias) - fused_bias_node = graph.get_attr(fused_bias_name) - - # Update the weight and bias of conv op - conv_args = list(conv.args) + ([None] if len(conv.args) == 2 else []) - conv_args[1] = fused_weight_node - conv_args[2] = fused_bias_node - conv.args = tuple(conv_args) + with graph.inserting_before(conv.args[1]): + fused_conv_weight_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_weight_name, + data=fused_weight, + ) + if fused_bias is not None: + fused_conv_bias_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_bias_name, + data=fused_bias, + ) + else: + fused_conv_bias_node = None + + conv.args = ( + conv.args[0], + fused_conv_weight_node, + fused_conv_bias_node, + *conv.args[3:], + ) + # Remove any use of batchnorm from the graph for user in bn.users.copy(): assert user.target == operator.getitem @@ -119,8 +150,13 @@ def call(self, graph_module: torch.fx.GraphModule): graph.erase_node(user) graph.erase_node(bn) + constant_placeholders_to_delete.update(conv.args[1:3] + bn.args[1:5]) - counter += 1 + if len(constant_placeholders_to_delete) > 0: + graph_module.graph.eliminate_dead_code() + for node in constant_placeholders_to_delete: + if (node is not None) and (len(node.users) == 0): + delete_constant_placeholder(self.exported_program, node) graph_module.recompile() # To Regenerate meta data and shape information, retrace module diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 0a825a94bef..ec39d287346 
100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -34,11 +34,16 @@ check_or_raise, get_input_node, get_param_tensor, + get_tensor_name, is_param_node, PERM_NCHW_TO_NHWC, ) -from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_INVALID_VALUE_ID +from executorch.backends.xnnpack.utils.xnnpack_constants import ( + UINT64_MAX, + XNN_INVALID_VALUE_ID, +) +from executorch.exir._serialize._named_data_store import NamedDataStore from torch.export import ExportedProgram XNN_TYPE_MAP = { @@ -46,8 +51,6 @@ } from executorch.backends.xnnpack.serialization.xnnpack_graph_serialize import ( - _aligned_size, - _pad_to, CONSTANT_TENSOR_ALIGNMENT, ) @@ -86,11 +89,11 @@ def __init__( self, exported_program: ExportedProgram, external_ids: Dict, - constant_data_bytes: bytearray, + named_data_store: NamedDataStore, ) -> None: self._external_ids = external_ids or {} self._exported_program = exported_program or None - self._constant_data_bytes = constant_data_bytes + self._named_data_store = named_data_store @property def external_ids(self) -> Dict: @@ -579,11 +582,16 @@ def get_serialized_buffer_index( ctypes.POINTER(array_type), ).contents - offset = len(self._constant_data_bytes) + named_key = get_tensor_name(self.exported_program, get_attr_node) + if named_key == "": + raise ValueError(f"Tensor from node: {get_attr_node} has no name") + size = const_val.untyped_storage().nbytes() - xnn_graph.constant_data.append(ConstantDataOffset(offset=offset, size=size)) - self._constant_data_bytes.extend( - _pad_to(bytes(array), _aligned_size(size, CONSTANT_TENSOR_ALIGNMENT)) + xnn_graph.constant_data.append( + ConstantDataOffset(offset=UINT64_MAX, size=size, named_key=named_key) + ) + self._named_data_store.add_named_data( + named_key, bytes(array), alignment=CONSTANT_TENSOR_ALIGNMENT ) return buffer_idx diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 
5a43481b98d..193656c30b1 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -316,11 +316,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // unique string id used to query the offset from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // are valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 3276dac7869..3cb572c66ef 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -470,6 +470,7 @@ class XValue: class ConstantDataOffset: offset: int size: int + named_key: str = "" @dataclass diff --git a/backends/xnnpack/utils/gen_xnnpack_constants.sh b/backends/xnnpack/utils/gen_xnnpack_constants.sh index 6be9d4519f3..5fa92e5b038 100644 --- a/backends/xnnpack/utils/gen_xnnpack_constants.sh +++ b/backends/xnnpack/utils/gen_xnnpack_constants.sh @@ -26,5 +26,6 @@ } > xnnpack_constants.py echo UINT32_MAX = 4294967295 >> xnnpack_constants.py +echo UINT64_MAX = 18446744073709551615 >> xnnpack_constants.py awk '/^#define\s+XNN_/ { print $2,"=",$3} ' "$1"/include/xnnpack.h >> 
xnnpack_constants.py if ! grep -qc "^XNN_" xnnpack_constants.py; then false; fi diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index b802d73c16b..fab95618807 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -131,6 +131,22 @@ def get_param_tensor( raise RuntimeError(f"unsupported param type, {node.op}.") +def get_tensor_name(exp_prog: ExportedProgram, node: torch.fx.Node) -> str: + if node is None: + return "" + if is_param(exp_prog, node): + return exp_prog.graph_signature.inputs_to_parameters[node.name] + elif is_buffer(exp_prog, node): + return exp_prog.graph_signature.inputs_to_buffers[node.name] + elif is_lifted_tensor_constant(exp_prog, node): + return exp_prog.graph_signature.inputs_to_lifted_tensor_constants[node.name] + else: + assert isinstance(node.target, str) + return node.target + + return "" + + def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]: """ Returns the source fn of the given node, return None if something goes wrong diff --git a/backends/xnnpack/utils/xnnpack_constants.py b/backends/xnnpack/utils/xnnpack_constants.py index 351cc8ad897..364819a2435 100644 --- a/backends/xnnpack/utils/xnnpack_constants.py +++ b/backends/xnnpack/utils/xnnpack_constants.py @@ -6,8 +6,11 @@ # Auto-generated by gen_xnnpack_constants.sh script. 
Do not modify UINT32_MAX = 4294967295 +UINT64_MAX = 18446744073709551615 +XNN_EXTRA_BYTES = 128 XNN_EXTRA_BYTES = 16 XNN_MAX_TENSOR_DIMS = 6 +XNN_INVALID_VALUE_ID = UINT32_MAX XNN_FLAG_HINT_SPARSE_INFERENCE = 0x00000001 XNN_FLAG_HINT_FP16_INFERENCE = 0x00000002 XNN_FLAG_FORCE_FP16_INFERENCE = 0x00000004 @@ -26,7 +29,8 @@ XNN_FLAG_YIELD_WORKERS = 0x00000010 XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER = 0x00000020 XNN_FLAG_KEEP_DIMS = 0x00000040 -XNN_EXTRA_QUANTIZATION_PARAMS = 8 +XNN_EXTRA_QUANTIZATION_PARAMS = 10 +XNN_MIN_BLOCKSIZE = 32 XNN_VALUE_FLAG_EXTERNAL_INPUT = 0x00000001 XNN_VALUE_FLAG_EXTERNAL_OUTPUT = 0x00000002 XNN_VALUE_FLAG_PERSISTENT = 0x00000004 diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 4548de4940a..84cdfd69a48 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -31,6 +31,7 @@ XNN_VALUE_FLAG_EXTERNAL_INPUT, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, ) +from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir.backend.backend_details import ( BackendDetails, @@ -103,7 +104,7 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: - + named_data_store = NamedDataStore() xnnpack_edge_compile_config = get_xnnpack_edge_compile_config() # Need to wrap EP here because xnnpack does addmm to linear @@ -162,7 +163,7 @@ def preprocess( ) constant_data_bytes = bytearray() - node_visitors = get_node_visitors(ep, node_to_external_map, constant_data_bytes) + node_visitors = get_node_visitors(ep, node_to_external_map, named_data_store) for node in graph_module.graph.nodes: if node.op == "call_function": @@ -191,4 +192,5 @@ def preprocess( xnnpack_graph, constant_data_bytes ), debug_handle_map={}, + data_store_output=named_data_store.get_named_data_store_output(), ) From d08d93894afada186d3727b24f54a32b28634914 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 22:31:14 -0400 
Subject: [PATCH 4/5] [XNNPACK][Weights Cache] Initial Weights Cache Design with NamedDataMap (#9296) This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: https://github.com/pytorch/executorch/pull/9154 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/10/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/10/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/9/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/10/orig @diff-train-skip-merge --------- Co-authored-by: Max Ren --- backends/xnnpack/runtime/XNNWeightsCache.cpp | 237 +++++++++++++++ backends/xnnpack/runtime/XNNWeightsCache.h | 164 ++++++++++ .../test/runtime/test_xnn_weights_cache.cpp | 286 ++++++++++++++++++ backends/xnnpack/test/targets.bzl | 13 + extension/testing_util/targets.bzl | 1 + schema/targets.bzl | 4 + 6 files changed, 705 insertions(+) create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.cpp create mode 100644 backends/xnnpack/runtime/XNNWeightsCache.h create mode 100644 backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp new file mode 100644 index 00000000000..f2842851d3a --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; + +XNNWeightsCache::XNNWeightsCache() { + weights_cache_.context = this; + weights_cache_.look_up = (size_t(*)( + void*, const xnn_weights_cache_look_up_key*))XNNWeightsCache::look_up; + weights_cache_.reserve_space = + (void* (*)(void*, size_t))XNNWeightsCache::reserve_space; + weights_cache_.look_up_or_insert = + (size_t(*)(void*, const xnn_weights_cache_look_up_key*, void*, size_t)) + XNNWeightsCache::look_up_or_insert; + weights_cache_.is_finalized = (bool (*)(void*))XNNWeightsCache::is_finalized; + weights_cache_.offset_to_addr = + (void* (*)(void*, size_t))XNNWeightsCache::offset_to_addr; + weights_cache_.delete_cache = + (enum xnn_status(*)(void*))XNNWeightsCache::delete_cache; +} + +Error XNNWeightsCache::initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map) { + runtime_allocator_ = runtime_allocator; + named_data_map_ = named_data_map; + is_finalized_ = false; + + return Error::Ok; +} + +Result> XNNWeightsCache::finalize_for_runtime() { + is_finalized_ = true; + + // All data has been packed by create_runtime + // so we clear the unpacked data as it is no longer needed + for (FreeableBuffer& buffer : unpacked_data_) { + buffer.Free(); + } + unpacked_data_.clear(); + unpacked_data_to_name_.clear(); + + std::vector packed_data_names; + // update the reference count of all the packed data + // used by this runtime + for (auto& entry : name_to_packed_data_metadata_) { + if (entry.second.in_current_runtime) { + entry.second.ref_count++; + entry.second.in_current_runtime = false; + packed_data_names.push_back(entry.first); + } + } + + return packed_data_names; +} + +Result XNNWeightsCache::load_unpacked_data( + const std::string& name) { + Result named_data = 
named_data_map_->get_data(name.c_str()); + if (!named_data.ok()) { + ET_LOG(Error, "Failed to load constant data for key %s", name.c_str()); + return Error::InvalidExternalData; + } + const uint8_t* data_pointer = + static_cast(named_data.get().data()); + unpacked_data_.push_back(std::move(named_data.get())); + unpacked_data_to_name_[data_pointer] = name; + + return data_pointer; +} + +Error XNNWeightsCache::delete_packed_data( + const std::vector& packed_data_names) { + if (!is_finalized_) { + ET_LOG( + Error, + "Error, attempted to delete packed data from the cache but the cache is not finalized"); + return Error::InvalidArgument; + } + for (const std::string& name : packed_data_names) { + auto entry = name_to_packed_data_metadata_.find(name); + if (entry == name_to_packed_data_metadata_.end()) { + ET_LOG( + Error, + "Error, attempted to deleted packed data: %s, from the cache but it wasn't found", + name.c_str()); + return Error::InvalidArgument; + } else { + entry->second.ref_count--; + if (entry->second.ref_count == 0) { + void* packed_data_ptr = packed_data_ptrs_[entry->second.offset]; + // Erase the key/value from the map frees the pointer holding the packed + // data + packed_pointer_to_container_.erase(packed_data_ptr); + // remove the pointer from the packed_data_ptrs_ + packed_data_ptrs_[entry->second.offset] = nullptr; + // Erase the name to packed metadata entry + name_to_packed_data_metadata_.erase(entry->first); + } + } + } + + return Error::Ok; +} + +size_t XNNWeightsCache::look_up( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key) { + const void* unpacked_weights_ptr = cache_key->kernel; + const void* unpacked_bias_ptr = cache_key->bias; + auto entry = context->unpacked_data_to_name_.find(unpacked_weights_ptr); + + // Check if weight_pointer has been cached + if (entry == context->unpacked_data_to_name_.end()) { + return SIZE_MAX; + } + + std::string weight_bias_name = entry->second; + + // Check if bias_pointer has been 
cached + if (unpacked_bias_ptr != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(unpacked_bias_ptr); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + + // check if weight_bias_name has been packed already + auto packed_weight_entry = + context->name_to_packed_data_metadata_.find(weight_bias_name); + if (packed_weight_entry == context->name_to_packed_data_metadata_.end()) { + return SIZE_MAX; + } + packed_weight_entry->second.in_current_runtime = true; + + return packed_weight_entry->second.offset; +} + +void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) { + // MemoryAllocator* allocator = context->runtime_allocator_; + // void* reserved_pointer = allocator->allocate(n, + // context->kPackedAllocationAlignment); + + // return reserved_pointer; + std::string data_container; + data_container.resize(n + context->kPackedAllocationAlignment); + void* maybe_aligned_space = data_container.data(); + void* aligned_space = (void*)((intptr_t)maybe_aligned_space + 64 - + (intptr_t)maybe_aligned_space % 64); + + context->packed_pointer_to_container_[aligned_space] = + std::move(data_container); + return aligned_space; +} + +size_t XNNWeightsCache::look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size) { + size_t offset = context->look_up(context, cache_key); + + if (offset != SIZE_MAX) { + void* saved_ptr = context->offset_to_addr(context, offset); + if (0 == memcmp(ptr, saved_ptr, size)) { + return offset; + } + // Failure, cache is out of date + return SIZE_MAX; + } + + // Add to Cache if it is not finalized + size_t next_offset = context->packed_data_ptrs_.size(); + auto entry = context->unpacked_data_to_name_.find(cache_key->kernel); + + // Check if weight_pointer has been cached + if (entry != context->unpacked_data_to_name_.end()) { + std::string weight_bias_name = entry->second; + if 
(cache_key->bias != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(cache_key->bias); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + PackedDataMeta packed_data_metadata = { + .offset = next_offset, + .ref_count = + 0, // ref_count is only incremented after finalizing for runtime + .in_current_runtime = true}; + context->name_to_packed_data_metadata_[weight_bias_name] = + packed_data_metadata; + } else { + ET_LOG( + Info, + "Warning: Unpacked weight and bias were not registered with names, " + "this will add new cache entries for packed data and may affect performance."); + } + context->packed_data_ptrs_.push_back(ptr); + + return next_offset; +} + +bool XNNWeightsCache::is_finalized(XNNWeightsCache* context) { + return context->is_finalized_; +} + +void* XNNWeightsCache::offset_to_addr(XNNWeightsCache* context, size_t offset) { + return context->packed_data_ptrs_[offset]; +} + +enum xnn_status XNNWeightsCache::delete_cache(XNNWeightsCache* context) { + return xnn_status_success; +} + +} // namespace delegate +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h new file mode 100644 index 00000000000..bc00ac15fd0 --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; + +struct PackedDataMeta { + size_t offset; + // Count number of xnn_runtime_t this packed data is used in + size_t ref_count; + // true if this packed data was inserted or looked up for the + // current runtime being created + bool in_current_runtime; +}; + +class XNNWeightsCache { + public: + XNNWeightsCache(); + + /** + * Initializes the XNNWeightsCache for the next xnn_create_runtime + */ + Error initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map); + + /** + * Finalizes the weights cache after the weights have been packed + * in xnn_create_runtime. + * + * This should only be called after creating the runtime. 
Returns + * the name of all the packed weights used by this runtime + */ + Result> finalize_for_runtime(); + + // Taken from XNN_ALLOCATION_ALIGNMENT in xnnpack/common.h + static const size_t kPackedAllocationAlignment = 64; + + /** + * Returns XNNPACK's underlying weights_cache pointer + */ + inline xnn_weights_cache_t get() { + return (xnn_weights_cache_t)&weights_cache_; + } + + /** + * Returns the number of unpacked data + */ + inline size_t get_num_unpacked_data() { + return unpacked_data_.size(); + }; + + /** + * Returns the names of all unpacked data + */ + inline std::vector get_unpacked_data_names() { + std::vector names; + for (const auto& pair : unpacked_data_to_name_) { + names.push_back(pair.second); + } + return names; + }; + + /** + * Returns the packed data names + */ + inline std::vector get_packed_data_names() { + std::vector names; + for (const auto& pair : name_to_packed_data_metadata_) { + names.push_back(pair.first); + } + return names; + }; + + /** + * Loads unpacked named data from the NamedDataMap into this XNNWeightsCache + * and returns a pointer to the unpacked data. This unpacked data is given + * to XNNPACK's define_tensor APIs, and used as the cache key for + * look_up_or_insert. + * @param[in] name The name of the data to load + * @param[out] out the pointer to the unpacked data that was loaded + */ + Result load_unpacked_data(const std::string& name); + + /** + * Deletes the packed data associated with the names given. 
+ * Decrements the ref_count if the packed data is used by other + * models + * + */ + Error delete_packed_data(const std::vector& packed_names); + + private: + // Runtime Allocator used to reserve memory for packed weights + MemoryAllocator* runtime_allocator_; + + // Named Data Map used to load named data + const NamedDataMap* named_data_map_; + + // Map of unpacked pointers to the data name + std::unordered_map unpacked_data_to_name_; + // Map of data names to offset into the packed data + std::unordered_map name_to_packed_data_metadata_; + // Vector holding list of pointers to the packed data + std::vector packed_data_ptrs_; + // vector holding list of strings which are containers for packed_data_ptrs + std::unordered_map packed_pointer_to_container_; + // Vector hodling list of unpacked freeable buffers + std::vector unpacked_data_; + // xnnpack's weight cache provider + xnn_weights_cache_provider weights_cache_; + // whether or not the weight cache is finalized + bool is_finalized_; + + // Function pointers to override XNNPACK's default xnn_weights_cache_provider + // functions. 
+ static size_t look_up( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key); + + static void* reserve_space(XNNWeightsCache* context, size_t n); + + static size_t look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size); + + static bool is_finalized(XNNWeightsCache* context); + + static void* offset_to_addr(XNNWeightsCache* context, size_t offset); + + static enum xnn_status delete_cache(XNNWeightsCache* context); +}; + +} // namespace delegate +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp new file mode 100644 index 00000000000..ca149a67b5e --- /dev/null +++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using executorch::backends::xnnpack::delegate::XNNWeightsCache; +using executorch::extension::FileDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; +using executorch::runtime::internal::PteDataMap; + +class XNNWeightsCacheTest : public ::testing::Test { + protected: + void SetUp() override { + // Creating a NamedDataMap from scratch is a little bit convoluted, so + // we copied a lot of setup from test_pte_data_map.cpp + + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. 
+ executorch::runtime::runtime_init(); + + // Create a sample Program with only named_data and segments. Technically + // not a valid Program; only used to test the PteDataMap. + // Create named data. + std::array, 2> + named_data_arr = { + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "weight", /*segment_index=*/0), + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "bias", /*segment_index=*/1), + }; + const auto named_data = + builder_.CreateVector(named_data_arr.data(), named_data_arr.size()); + + // Create segments. + std::array, 2> + segment_arr = {// @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, /*offset=*/0, /*size=*/kSegmentSizes[0]), + // @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, + /*offset=*/kSegmentAlignment * 2, + /*size=*/kSegmentSizes[1])}; + const auto segments = + builder_.CreateVector(segment_arr.data(), segment_arr.size()); + + // Create Program. + const auto program = executorch_flatbuffer::CreateProgram( + builder_, 0, 0, 0, 0, segments, 0, 0, named_data); + + builder_.Finish(program); + program_ = executorch_flatbuffer::GetProgram(builder_.GetBufferPointer()); + + // Create sample segment data. + for (int i = 0; i < kSegmentSizes[0]; i++) { + sample_data_[i] = 1; + } + for (int i = kSegmentOffsets[1]; i < kSegmentOffsets[1] + kSegmentSizes[1]; + i++) { + sample_data_[i] = 2; + } + TempFile tf(sample_data_.data(), sizeof(sample_data_)); + + // Wrap the sample data in a loader. 
+ Result loader = + FileDataLoader::from(tf.path().c_str(), kSegmentAlignment); + ASSERT_EQ(loader.error(), Error::Ok); + data_map_loader_ = + std::make_unique(std::move(loader.get())); + + Result data_map = PteDataMap::create( + data_map_loader_.get(), + 0, + program_->named_data(), + program_->segments()); + ASSERT_EQ(data_map.error(), Error::Ok); + data_map_ = std::make_unique(std::move(data_map.get())); + + memory_allocator_ = std::make_unique( + memory_allocator_data_.size(), memory_allocator_data_.data()); + + xnn_status status = xnn_initialize(nullptr); + ASSERT_EQ(status, xnn_status_success); + } + + void BuildAndRunGraphWithWeightsCache( + XNNWeightsCache& weight_cache, + const std::vector& batches, + size_t input_channels, + size_t output_channels, + float* input_data, + float* output_data) { + // Defining subgraph + xnn_subgraph_t subgraph_ptr = nullptr; + xnn_status status = xnn_create_subgraph( + /*external_value_ids=*/2, + /*flags=*/0, + &subgraph_ptr); + ASSERT_EQ(status, xnn_status_success); + std::unique_ptr subgraph( + subgraph_ptr, &xnn_delete_subgraph); + + // Define tensors + // Define input + uint32_t input_id; + std::vector input_dims(batches); + input_dims.push_back(input_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + input_dims.size(), + input_dims.data(), + nullptr, + 0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input_id); + + // Define weight + uint32_t weight_id; + Result weight_pointer = + weight_cache.load_unpacked_data("weight"); + ASSERT_TRUE(weight_pointer.ok()); + ASSERT_TRUE(weight_pointer.get() != nullptr); + std::vector weight_dims{output_channels, input_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + weight_dims.size(), + weight_dims.data(), + weight_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &weight_id); + ASSERT_EQ(status, xnn_status_success); + + // Define bias + uint32_t bias_id; + Result bias_pointer = + weight_cache.load_unpacked_data("bias"); + 
ASSERT_TRUE(bias_pointer.ok()); + std::vector bias_dims{output_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + bias_dims.size(), + bias_dims.data(), + bias_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &bias_id); + + // Define output tensor + uint32_t output_id; + std::vector output_dims(batches); + output_dims.push_back(output_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + output_dims.size(), + output_dims.data(), + nullptr, + 1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id); + + // create xecond fully connected + status = xnn_define_fully_connected( + subgraph_ptr, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + input_id, + weight_id, + bias_id, + output_id, + 0); + // Create and Pack Weights + xnn_runtime_t runtime_ptr = nullptr; + status = xnn_create_runtime_v3( + subgraph_ptr, weight_cache.get(), nullptr, 0, &runtime_ptr); + Result> packed_weights_added = + weight_cache.finalize_for_runtime(); + ASSERT_TRUE(packed_weights_added.ok()); + ASSERT_EQ(packed_weights_added.get().size(), 1); + ASSERT_EQ(packed_weights_added.get()[0], "weightbias"); + + auto runtime = std::unique_ptr( + runtime_ptr, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{0, input_data}, + xnn_external_value{1, output_data}, + }; + + status = xnn_reshape_runtime(runtime.get()); + status = + xnn_setup_runtime_v2(runtime.get(), external.size(), external.data()); + + ASSERT_EQ(status, xnn_status_success); + status = xnn_invoke_runtime(runtime.get()); + ASSERT_EQ(status, xnn_status_success); + } + + // Program builder constants. + static constexpr int kSegmentAlignment = 16; + static constexpr std::array kSegmentSizes{384, 128}; + static constexpr std::array kSegmentOffsets{0, kSegmentAlignment * 2}; + std::array sample_data_; + + // Program builder. 
+ flatbuffers::FlatBufferBuilder builder_; + const executorch_flatbuffer::Program* program_; + + // Data loader for the sample data. + std::unique_ptr data_map_loader_; + + // PteDataMap + std::unique_ptr data_map_; + + // MemoryAllocator + std::array memory_allocator_data_; + std::unique_ptr memory_allocator_; +}; + +TEST_F(XNNWeightsCacheTest, ReusePackedWeights) { + XNNWeightsCache weight_cache; + size_t padding = 32; + + std::vector batches{1, 2, 3}; + size_t num_batches = 1; + for (size_t batch_dim : batches) { + num_batches *= batch_dim; + } + size_t input_channels = 3; + size_t output_channels = 4; + std::vector input_tensor(num_batches * input_channels + padding, 1.0f); + std::vector output_tensor(num_batches * output_channels, 0.0f); + float* input_data = input_tensor.data(); + float* output_data = output_tensor.data(); + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + ASSERT_EQ(weight_cache.get_num_unpacked_data(), 0); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + std::vector packed_data_names = + weight_cache.get_packed_data_names(); + // Packed Data Still exists because it has a ref count of 2 + ASSERT_EQ(packed_data_names.size(), 1); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + packed_data_names = weight_cache.get_packed_data_names(); + ASSERT_EQ(packed_data_names.size(), 0); +} diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index 30ce970a842..58589b70607 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -30,3 +30,16 @@ def define_common_targets(): 
"//executorch/backends/xnnpack:xnnpack_backend", ], ) + + runtime.cxx_test( + name = "test_xnn_weights_cache", + srcs = ["runtime/test_xnn_weights_cache.cpp"], + deps = [ + third_party_dep("XNNPACK"), + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/runtime/executor:pte_data_map", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/testing_util:temp_file", + "//executorch/schema:program", + ], + ) diff --git a/extension/testing_util/targets.bzl b/extension/testing_util/targets.bzl index 2b12480dfff..95b1f94d182 100644 --- a/extension/testing_util/targets.bzl +++ b/extension/testing_util/targets.bzl @@ -17,5 +17,6 @@ def define_common_targets(): "//executorch/extension/fb/ptez/decompression_methods/test/...", "//executorch/extension/fb/ptez/test/...", "//executorch/runtime/executor/test/...", + "//executorch/backends/xnnpack/test/...", ], ) diff --git a/schema/targets.bzl b/schema/targets.bzl index 40c6d8d5c8d..c0036c7500a 100644 --- a/schema/targets.bzl +++ b/schema/targets.bzl @@ -78,6 +78,10 @@ def define_common_targets(): # //executorch/runtime/executor/... "//executorch/codegen/tools/...", "//executorch/runtime/executor/...", + # Tests have a set up which uses raw flatbuffer. + # TODO will refactor these setup steps into + # testing utils in runtime/executor/... path + "//executorch/backends/xnnpack/test/...", ], exported_headers = { OUTPUT_PROGRAM_HEADER: ":{}[{}]".format(PROGRAM_GEN_RULE_NAME, OUTPUT_PROGRAM_HEADER), From 2a903f9609c2ee811604a1aff3c7995ed1a35c10 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 14 Mar 2025 22:31:40 -0400 Subject: [PATCH 5/5] [XNNPACK][Weights Cache] Enable in XNNPACK (#9297) This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/9155 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/11/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/10/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/orig @diff-train-skip-merge --------- Co-authored-by: Max Ren --- backends/xnnpack/CMakeLists.txt | 13 ++++ backends/xnnpack/runtime/XNNCompiler.cpp | 72 +++++++++++++++---- backends/xnnpack/runtime/XNNCompiler.h | 10 ++- backends/xnnpack/runtime/XNNExecutor.cpp | 4 +- backends/xnnpack/runtime/XNNExecutor.h | 8 ++- backends/xnnpack/runtime/XNNPACKBackend.cpp | 42 +++++++++-- backends/xnnpack/targets.bzl | 10 ++- .../xnnpack/test/runtime/test_xnnexecutor.cpp | 3 +- 8 files changed, 131 insertions(+), 31 deletions(-) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 8b3bf3d91c1..ed0128f93f1 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE # Keeping this OFF by default due to regressions in decode and model load with # kleidi kernels option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF) + +# Turning this on cache weights between partitions and methods. If weights +# are shared across methods/partitions then this can reduce load time and +# memory usage + +# Keeping this off maintains existing behavior. 
Turning this on serializes +# execution and initialization of delegates, to be revisited +option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE + "Enable weights cache to cache and manage all packed weights" OFF) + +if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE) + add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE) +endif() if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 6a93ab73a2e..c0204831c07 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wglobal-constructors" @@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, const NamedDataMap* named_data_map, - std::vector& loaded_buffers_from_map) { + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { @@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr( return constant_data_ptr + offset; } else { const std::string& data_name = constant_data_offset->named_key()->str(); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + Result data_ptr = + weights_cache->load_unpacked_data(data_name); + if (!data_ptr.ok()) { + ET_LOG(Error, "Failed to load weights from cache"); + return nullptr; + } + return data_ptr.get(); +#else Result buffer = named_data_map->get_data(data_name.c_str()); if (!buffer.ok()) { @@ -198,8 +210,9 @@ const uint8_t* getConstantDataPtr( } const uint8_t* data_ptr = static_cast(buffer.get().data()); - loaded_buffers_from_map.push_back(std::move(buffer.get())); + freeable_buffers.push_back(std::move(buffer.get())); return data_ptr; +#endif } } } @@ -222,7 +235,8 @@ Error defineTensor( std::vector& 
output_ids, CompileAllocator& allocator, const NamedDataMap* named_data_map, - std::vector& loaded_buffers_from_map) { + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -264,7 +278,8 @@ Error defineTensor( flatbuffer_graph, constant_data_ptr, named_data_map, - loaded_buffers_from_map); + freeable_buffers, + weights_cache); xnn_status status; // The type we might have to convert to @@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator, - const NamedDataMap* named_data_map, - xnn_workspace_t workspace) { + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map) { Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; @@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel( // Invalid ids do not need to be remapped remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID); + // If weight cache is not on we hold onto all the unpacked buffers + // and we free them at the end + std::vector unpacked_buffers; + // External Ids for inputs and outputs std::vector input_ids; std::vector output_ids; Error err = Error::Ok; - std::vector loaded_buffers_from_map; for (auto value : *flatbuffer_graph->xvalues()) { err = defineTensor( subgraph.get(), @@ -2081,7 +2099,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( output_ids, compile_allocator, named_data_map, - loaded_buffers_from_map); + unpacked_buffers, + weights_cache); if (err != Error::Ok) { return err; @@ -2103,12 +2122,26 @@ ET_NODISCARD Error XNNCompiler::compileModel( xnn_runtime_t runtime_ptr = nullptr; + // XNNWeightsCache if weights cache is not enabled, then XNNWeightsCache + // just manages the unpacked 
weights until the runtime is created. +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + ET_CHECK_OR_RETURN_ERROR( + unpacked_buffers.size() == 0, + Internal, + "Weight Cache is enabled, which means unpacked buffers should be owned by the cache"); + xnn_weights_cache_t weights_cache_ptr = + weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get() + : nullptr; +#else + xnn_weights_cache_t weights_cache_ptr = nullptr; +#endif + #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE ET_CHECK_OR_RETURN_ERROR( workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); status = xnn_create_runtime_v4( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, workspace, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, @@ -2116,7 +2149,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( #else status = xnn_create_runtime_v3( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); @@ -2128,10 +2161,25 @@ ET_NODISCARD Error XNNCompiler::compileModel( "XNN Runtime creation failed with code: %s", xnn_status_to_string(status)); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + auto packed_weights_names = weights_cache->finalize_for_runtime(); + ET_CHECK_OR_RETURN_ERROR( + packed_weights_names.ok(), + Internal, + "Failed to finalize weights cache after creating the xnn runtime") +#else + for (auto& buffer : unpacked_buffers) { + buffer.Free(); + } + Result> packed_weights_names = + std::vector(); +#endif + err = executor->initialize( // NOLINT: runtime_ptr is non-null runtime_ptr, std::move(input_ids), - std::move(output_ids)); + std::move(output_ids), + std::move(packed_weights_names.get())); return err; }; diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index 3ea621a4d59..bcc87351d7d 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ 
b/backends/xnnpack/runtime/XNNCompiler.h @@ -9,11 +9,9 @@ #pragma once #include +#include #include - #include -#include -#include namespace executorch { namespace backends { @@ -29,9 +27,9 @@ class XNNCompiler { const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - executorch::runtime::MemoryAllocator* runtime_allocator, - const executorch::runtime::NamedDataMap* named_data_map, - xnn_workspace_t workspace); + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map); }; } // namespace delegate diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 1ba549bb8d7..ae7c0d66ecb 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit; ET_NODISCARD Error XNNExecutor::initialize( xnn_runtime_t runtime, std::vector&& input_ids, - std::vector&& output_ids) { + std::vector&& output_ids, + std::vector&& packed_data_names) { runtime_ = std::unique_ptr( runtime, xnn_delete_runtime); @@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize( std::sort(output_ids_.begin(), output_ids_.end()); externals_.resize(input_ids_.size() + output_ids_.size()); + packed_data_names_ = std::move(packed_data_names); return Error::Ok; } diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index 68ee18609e3..b98c902f44f 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -34,6 +34,7 @@ class XNNExecutor { std::vector input_ids_; std::vector output_ids_; std::vector externals_; + std::vector packed_data_names_; public: XNNExecutor() = default; @@ -46,6 +47,10 @@ class XNNExecutor { return output_ids_.size(); } + inline std::vector get_packed_data_names() { + return packed_data_names_; + } + /** * Initialize the XNNExecutor with a given runtime and input/output ids. 
* The input/output ids are expected to be sorted in order of their @@ -54,7 +59,8 @@ class XNNExecutor { ET_NODISCARD executorch::runtime::Error initialize( xnn_runtime_t runtime, std::vector&& input_ids, - std::vector&& output_ids); + std::vector&& output_ids, + std::vector&& packed_data_names); /** * Prepares the arguments for runtime graph execution. diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index a01ba2da704..1e2f07bd905 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -20,6 +21,7 @@ namespace executorch { namespace backends { +using executorch::backends::xnnpack::delegate::XNNWeightsCache; using executorch::runtime::ArrayRef; using executorch::runtime::Backend; using executorch::runtime::BackendExecutionContext; @@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { } const NamedDataMap* named_data_map = context.get_named_data_map(); - -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - // This is needed to serialize access to xnn_create_runtime which is not // thread safe. This can heppen when multiple threads call init() on // the same backend instance. +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE const std::lock_guard lock(workspace_mutex_); #endif + +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weight_cache(weights_cache_mutex_); + weights_cache_->initialize_for_runtime( + context.get_runtime_allocator(), named_data_map); +#endif + // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. 
NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the @@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { processed->data(), processed->size(), executor, - context.get_runtime_allocator(), - named_data_map, - workspace_.get()); + weights_cache_.get(), + workspace_.get(), + named_data_map); // This backend does not need its processed data after compiling the model. processed->Free(); @@ -125,6 +132,10 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { const std::lock_guard lock(workspace_mutex_); #endif +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache(weights_cache_mutex_); +#endif + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -145,16 +156,24 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE // This is needed to serialize access to xnn_delete_runtime which is not // thread safe. This can heppen when multiple threads call destroy() on // the same backend instance. +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE const std::lock_guard lock(workspace_mutex_); #endif + auto executor = static_cast(handle); + #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); #endif + +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache( + weights_cache_mutex_); + weights_cache_->delete_packed_data(executor->get_packed_data_names()); +#endif // XNNExecutor is not trivially destructible. Since this was constructed // manually in init(), we must destroy it manually here. 
executor->~XNNExecutor(); @@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { std::unique_ptr workspace_{ nullptr, &xnn_release_workspace}; + + // Weights cache is global to all delegate instances. + mutable std::mutex weights_cache_mutex_; + std::unique_ptr weights_cache_ = + std::make_unique(); + + // Lock Hierarchy for Mutexes: + // workspace_mutex_ + // weights_cache_mutex_ }; namespace { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 3fd9c433372..e97f1941ff7 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -6,11 +6,15 @@ def _get_preprocessor_flags(): Disable if someone explictly specified a config option, else Enable otherwise """ - if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0": - return [] + preprocessor_flags = [] + if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE") + + if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE") # Enable if not disabled through config - return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"] + return preprocessor_flags def define_common_targets(): runtime.cxx_library( diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index a5a26004b49..42d925c1253 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { }, { 1, - }), + }, + {}), Error::Ok); TensorFactory tf; auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});