Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Generate tiled code but too much logic is in the Template
  • Loading branch information
Victor-Jung committed Mar 18, 2026
commit d8548468a1d4cfd7f1398d1d5a975d5be5e01da5
27 changes: 19 additions & 8 deletions Deeploy/Targets/XDNA2/Deployer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Logging import DEFAULT_LOGGER as log
from Deeploy.MLIRDataTypes import MLIRNodeTemplate
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint


class XDNA2Deployer(SignPropDeployer):
Expand Down Expand Up @@ -67,6 +68,10 @@ def generateMLIR(self) -> str:
Iterates over bound layers, calls each template's ``emit()``
to construct AIE operations, adds a ``runtime_sequence`` for
host-side DMA, verifies the module, and returns the MLIR text.

If tiling is enabled (patternMemoryConstraint available), passes
tiling information to templates to generate tiled transfers and
compute kernels.

Returns
-------
Expand All @@ -81,13 +86,17 @@ def generateMLIR(self) -> str:
mapper = layer.mapper
template = mapper.binder.template
op_repr = mapper.parser.operatorRepresentation

# Check if tiling is enabled by looking for patternMemoryConstraint
executionBlock = mapper.binder.executionBlock
tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None)

if not isinstance(template, MLIRNodeTemplate):
raise RuntimeError(
f"Node '{node_name}' has no MLIRNodeTemplate — "
f"only BF16 Add is supported in this release.")

nodes.append((node_name, template, op_repr))
nodes.append((node_name, template, op_repr, tilingConstraint))

if not nodes:
raise RuntimeError("No bound layers found — cannot generate MLIR.")
Expand All @@ -101,23 +110,25 @@ def _device():
shim_tile = aie_d.tile(0, 0)

# Emit each node's operations (ObjectFifos, core, kernel decls)
for node_name, template, op_repr in nodes:
log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'")
for node_name, template, op_repr, tilingConstraint in nodes:
log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" +
(" with tiling" if tilingConstraint else ""))
template.emit(op_repr,
compute_tile=compute_tile,
shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly?
shim_tile=shim_tile,
tilingConstraint=tilingConstraint) # Pass tiling info

# Runtime sequence: collect tensor types from all nodes' I/O
# For now (single-node), derive from the first node.
_, first_template, first_op_repr = nodes[0]
params = first_template.getAIEParams(first_op_repr)
_, first_template, first_op_repr, first_tilingConstraint = nodes[0]
params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint)
num_elements = params['num_elements']
tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get())

@aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def _seq(*args):
for _, template, op_repr in nodes:
template.emitRuntimeSequence(op_repr, list(args))
for _, template, op_repr, tilingConstraint in nodes:
template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint)

module = ctx.module
assert module.operation.verify(), \
Expand Down
89 changes: 87 additions & 2 deletions Deeploy/Targets/XDNA2/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,34 @@
#
# SPDX-License-Identifier: Apache-2.0

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \
NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
from Deeploy.Targets.Generic.Layers import AddLayer
from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.Generic.Parsers import AddParser
from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings
from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings

# Standard mapper for non-tiled deployment: parses ONNX Add nodes and binds
# them to the plain XDNA2 Add bindings.
XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings)

# Tiling-ready mapper for tiled deployment: same parser, but bound through the
# tiling-ready bindings so the tiler can attach memory constraints.
XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings)

# Standard operator mapping (used when tiling is disabled).
# Only 'Add' is supported in this release.
XDNA2Mapping = {
    'Add': AddLayer([XDNA2AddMapper]),
}

# Tiling-ready operator mapping (used when tiling is enabled).
XDNA2TilingMapping = {
    'Add': AddLayer([XDNA2AddTilableMapper]),
}

# Buffer classes reuse Generic templates since XDNA2Deployer manages its own
# output format (MLIR + test headers) and these templates are never rendered.

Expand Down Expand Up @@ -56,6 +71,21 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str =
super().__init__(name, Mapping, initCode, includeList)


class XDNA2AIECoreEngine(DeploymentEngine):
    """Deployment engine for the AIE compute core, preferring L1 local memory.

    Each AIE core owns 8KB of local (L1) memory for temporaries and compute;
    data is staged from L3 shared memory into L1 as needed. The preferred
    memory level is exposed via ``preferredMemoryLevel`` so the memory-aware
    platform can route tensors mapped to this engine into L1.
    """

    def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "",
                 includeList = None, preferredMemoryLevel: str = "L1") -> None:
        # Avoid a mutable default argument: substitute a fresh list when the
        # caller passed nothing.
        super().__init__(name, Mapping, initCode, [] if includeList is None else includeList)
        # Memory level queried by MemoryXDNA2Platform.getTargetMemoryLevel.
        self.preferredMemoryLevel = preferredMemoryLevel


class XDNA2Platform(DeploymentPlatform):

def __init__(self,
Expand All @@ -67,3 +97,58 @@ def __init__(self,
if engines is None:
engines = [XDNA2Engine()]
super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)


class MemoryXDNA2Platform(MemoryPlatform):
    """XDNA2 platform variant carrying a memory hierarchy for tiling.

    The hierarchy models:
    - L1: 8KB of local memory per AIE core
    - L3: shared memory for the whole AIE array
    """

    def __init__(self,
                 memoryHierarchy: MemoryHierarchy,
                 defaultTargetMemoryLevel: MemoryLevel,
                 engines = None,
                 variableBuffer = XDNA2VariableBuffer,
                 constantBuffer = XDNA2ConstantBuffer,
                 structBuffer = XDNA2StructBuffer,
                 transientBuffer = XDNA2TransientBuffer) -> None:
        # Default to a single AIE-core engine when none is supplied.
        engines = [XDNA2AIECoreEngine()] if engines is None else engines
        super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer,
                         structBuffer, transientBuffer)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        """Return the memory level a tensor of *node* should be placed in.

        Nodes assigned to an :class:`XDNA2AIECoreEngine` resolve to that
        engine's preferred level (L1 by default); everything else falls back
        to the platform's default target memory level (typically L3).
        """
        # Engine assignment is optional metadata on the node — absent means
        # "use the default level".
        assigned = getattr(node, '_engine_assignment', None)
        if isinstance(assigned, XDNA2AIECoreEngine) and hasattr(assigned, 'preferredMemoryLevel'):
            return assigned.preferredMemoryLevel
        return self.defaultTargetMemoryLevel.name


class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper):
    """Memory-level wrapper around an existing :class:`XDNA2Platform`."""

    def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy,
                 defaultTargetMemoryLevel: MemoryLevel):
        # Guard against wrapping an unrelated platform type.
        assert isinstance(platform, XDNA2Platform), \
            f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}"
        super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        """Return the memory level a tensor of *node* should be placed in.

        Mirrors MemoryXDNA2Platform: AIE-core-assigned nodes go to the
        engine's preferred level, all others to the default level.
        """
        assigned = getattr(node, '_engine_assignment', None)
        if isinstance(assigned, XDNA2AIECoreEngine) and hasattr(assigned, 'preferredMemoryLevel'):
            return assigned.preferredMemoryLevel
        return self.defaultTargetMemoryLevel.name
66 changes: 58 additions & 8 deletions Deeploy/Targets/XDNA2/Templates/AddTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from typing import TYPE_CHECKING

import numpy as np

from aie.dialects import aie as aie_d
from aie.dialects import aiex as aiex_d
from aie.dialects import arith as arith_d
Expand Down Expand Up @@ -51,19 +53,56 @@ def __init__(self):
# Parameter helpers
# ------------------------------------------------------------------

def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict:
def getAIEParams(self, operatorRepresentation: OperatorRepresentation,
tilingConstraint=None) -> dict:
"""Extract AIE parameters from the operator representation.

If tilingConstraint is available (tiling enabled), use information
from it. Otherwise fall back to fixed tile sizes.

Parameters
----------
operatorRepresentation : OperatorRepresentation
Parsed operator representation containing 'size' (total elements).
tilingConstraint : PatternMemoryConstraints, optional
Tiling solution from the solver. If provided, tile size is derived
from the tiling solution.

Returns
-------
dict
``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and
ensuring divisibility).
``num_elements``, ``tile_size`` (from tiling solution if available,
otherwise clamped to MAX_TILE_SIZE).
"""
num_elements = int(operatorRepresentation['size'])
tile_size = min(num_elements, self.MAX_TILE_SIZE)

# If tiling is enabled, extract tile size from the tiling solution
if tilingConstraint is not None:
# tilingConstraint is a PatternMemoryConstraints with nodeConstraints
nodeConstraint = tilingConstraint.nodeConstraints[0]
outputConstraints = nodeConstraint.outputTensorMemoryConstraints
if outputConstraints:
# Get the first output tensor's L1 memory constraint (tile shape)
firstOutputName = list(outputConstraints.keys())[0]
tensorConstraint = outputConstraints[firstOutputName]
# Use L1 constraint which holds the tile shape for the AIE core
if "L1" in tensorConstraint.memoryConstraints:
l1Constraint = tensorConstraint.memoryConstraints["L1"]
if l1Constraint.shape is not None:
tile_size = int(np.prod(l1Constraint.shape))
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)

if num_elements % tile_size != 0:
tile_size = 1
# Round down to the largest divisor of num_elements that fits
tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0)

return {
'num_elements': num_elements,
'tile_size': tile_size,
Expand All @@ -81,8 +120,17 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None

* ``compute_tile`` — result of ``aie_d.tile(col, row)``
* ``shim_tile`` — result of ``aie_d.tile(col, 0)``
* ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution

Parameters
----------
operatorRepresentation : OperatorRepresentation
Parsed operator representation with 'size' and other attributes
**kwargs
compute_tile, shim_tile, tilingConstraint (optional)
"""
params = self.getAIEParams(operatorRepresentation)
tilingConstraint = kwargs.get('tilingConstraint', None)
params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint)
num_elements = params['num_elements']
tile_size = params['tile_size']
num_tiles = num_elements // tile_size
Expand Down Expand Up @@ -123,7 +171,7 @@ def _core():
scf_d.yield_([])

def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation,
seq_args: list) -> None:
seq_args: list, tilingConstraint=None) -> None:
"""Emit DMA configuration inside a runtime_sequence block.

Parameters
Expand All @@ -133,8 +181,10 @@ def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation,
seq_args : list
Block arguments of the runtime_sequence (memref values for
in1, in2, out — in the order matching the ONNX graph I/O).
tilingConstraint : NodeMemoryConstraint, optional
Tiling solution from the solver (currently ignored, for future use).
"""
params = self.getAIEParams(operatorRepresentation)
params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint)
num_elements = params['num_elements']

dims = [
Expand Down
16 changes: 16 additions & 0 deletions Deeploy/Targets/XDNA2/Tiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation."""

from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings
from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings

# For the Add operator, reuse the generic BOP (Binary Operator) tile
# constraint, which handles equal-dimension binary operations, wrapping the
# plain XDNA2 Add bindings so the tiler extension can attach memory
# constraints to Add nodes.
XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(
    nodeBindings=XDNA2AddBindings,  # non-tiled bindings being made tiling-ready
    tileConstraint=AddTileConstraint()  # equal-shape binary-op tiling rule
)
2 changes: 1 addition & 1 deletion DeeployTest/deeployRunner_xdna2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
from testUtils.deeployRunner import main

if __name__ == '__main__':
    # Launch the shared Deeploy test runner with XDNA2 defaults.
    # NOTE(review): tiling_enabled=True presumably switches the deployer to the
    # tiled mapping/code-generation path — confirm against
    # testUtils.deeployRunner.main's handling of this flag.
    sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True))
sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True))
Loading