Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Generate tiled code but too much logic is in the Template
  • Loading branch information
Victor-Jung committed Mar 18, 2026
commit d8548468a1d4cfd7f1398d1d5a975d5be5e01da5
27 changes: 19 additions & 8 deletions Deeploy/Targets/XDNA2/Deployer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
from Deeploy.Logging import DEFAULT_LOGGER as log
from Deeploy.MLIRDataTypes import MLIRNodeTemplate
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint


class XDNA2Deployer(SignPropDeployer):
Expand Down Expand Up @@ -67,6 +68,10 @@ def generateMLIR(self) -> str:
Iterates over bound layers, calls each template's ``emit()``
to construct AIE operations, adds a ``runtime_sequence`` for
host-side DMA, verifies the module, and returns the MLIR text.

If tiling is enabled (patternMemoryConstraint available), passes
tiling information to templates to generate tiled transfers and
compute kernels.

Returns
-------
Expand All @@ -81,13 +86,17 @@ def generateMLIR(self) -> str:
mapper = layer.mapper
template = mapper.binder.template
op_repr = mapper.parser.operatorRepresentation

# Check if tiling is enabled by looking for patternMemoryConstraint
executionBlock = mapper.binder.executionBlock
tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None)

if not isinstance(template, MLIRNodeTemplate):
raise RuntimeError(
f"Node '{node_name}' has no MLIRNodeTemplate — "
f"only BF16 Add is supported in this release.")

nodes.append((node_name, template, op_repr))
nodes.append((node_name, template, op_repr, tilingConstraint))

if not nodes:
raise RuntimeError("No bound layers found — cannot generate MLIR.")
Expand All @@ -101,23 +110,25 @@ def _device():
shim_tile = aie_d.tile(0, 0)

# Emit each node's operations (ObjectFifos, core, kernel decls)
for node_name, template, op_repr in nodes:
log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'")
for node_name, template, op_repr, tilingConstraint in nodes:
log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" +
(" with tiling" if tilingConstraint else ""))
template.emit(op_repr,
compute_tile=compute_tile,
shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly?
shim_tile=shim_tile,
tilingConstraint=tilingConstraint) # Pass tiling info

# Runtime sequence: collect tensor types from all nodes' I/O
# For now (single-node), derive from the first node.
_, first_template, first_op_repr = nodes[0]
params = first_template.getAIEParams(first_op_repr)
_, first_template, first_op_repr, first_tilingConstraint = nodes[0]
params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint)
num_elements = params['num_elements']
tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get())

@aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def _seq(*args):
for _, template, op_repr in nodes:
template.emitRuntimeSequence(op_repr, list(args))
for _, template, op_repr, tilingConstraint in nodes:
template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint)

module = ctx.module
assert module.operation.verify(), \
Expand Down
89 changes: 87 additions & 2 deletions Deeploy/Targets/XDNA2/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,34 @@
#
# SPDX-License-Identifier: Apache-2.0

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \
NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
from Deeploy.Targets.Generic.Layers import AddLayer
from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.Generic.Parsers import AddParser
from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings
from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings

# Standard mapper for non-tiled deployment: parses ONNX Add nodes and binds
# them to the plain XDNA2 Add bindings.
XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings)

# Tiling-ready mapper for tiled deployment: same parser, but bound through the
# tiling-ready bindings so the tiler can attach memory constraints.
XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings)

# Standard operator mapping (used when tiling is disabled).
# Only 'Add' is supported in this release.
XDNA2Mapping = {
    'Add': AddLayer([XDNA2AddMapper]),
}

# Tiling-ready operator mapping (used when tiling is enabled).
XDNA2TilingMapping = {
    'Add': AddLayer([XDNA2AddTilableMapper]),
}

# Buffer classes reuse Generic templates since XDNA2Deployer manages its own
# output format (MLIR + test headers) and these templates are never rendered.

Expand Down Expand Up @@ -56,6 +71,21 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str =
super().__init__(name, Mapping, initCode, includeList)


class XDNA2AIECoreEngine(DeploymentEngine):
    """Deployment engine for the AIE compute core, preferring L1 local memory.

    Each AIE core owns 8KB of local (L1) memory for temporaries and compute;
    data is staged from L3 shared memory into L1 as needed. The preferred
    memory level is exposed via ``preferredMemoryLevel`` so the memory-aware
    platform can route tensors mapped to this engine into L1.
    """

    def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "",
                 includeList = None, preferredMemoryLevel: str = "L1") -> None:
        # Avoid a mutable default argument: substitute a fresh list when the
        # caller passed nothing.
        super().__init__(name, Mapping, initCode, [] if includeList is None else includeList)
        # Memory level queried by MemoryXDNA2Platform.getTargetMemoryLevel.
        self.preferredMemoryLevel = preferredMemoryLevel


class XDNA2Platform(DeploymentPlatform):

def __init__(self,
Expand All @@ -67,3 +97,58 @@ def __init__(self,
if engines is None:
engines = [XDNA2Engine()]
super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)


class MemoryXDNA2Platform(MemoryPlatform):
    """XDNA2 platform variant carrying a memory hierarchy for tiling.

    The hierarchy models:
    - L1: 8KB of local memory per AIE core
    - L3: shared memory for the whole AIE array
    """

    def __init__(self,
                 memoryHierarchy: MemoryHierarchy,
                 defaultTargetMemoryLevel: MemoryLevel,
                 engines = None,
                 variableBuffer = XDNA2VariableBuffer,
                 constantBuffer = XDNA2ConstantBuffer,
                 structBuffer = XDNA2StructBuffer,
                 transientBuffer = XDNA2TransientBuffer) -> None:
        # Default to a single AIE-core engine when none is supplied.
        engines = [XDNA2AIECoreEngine()] if engines is None else engines
        super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer,
                         structBuffer, transientBuffer)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        """Return the memory level a tensor of *node* should be placed in.

        Nodes assigned to an :class:`XDNA2AIECoreEngine` resolve to that
        engine's preferred level (L1 by default); everything else falls back
        to the platform's default target memory level (typically L3).
        """
        # Engine assignment is optional metadata on the node — absent means
        # "use the default level".
        assigned = getattr(node, '_engine_assignment', None)
        if isinstance(assigned, XDNA2AIECoreEngine) and hasattr(assigned, 'preferredMemoryLevel'):
            return assigned.preferredMemoryLevel
        return self.defaultTargetMemoryLevel.name


class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper):
    """Memory-level wrapper around an existing :class:`XDNA2Platform`."""

    def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy,
                 defaultTargetMemoryLevel: MemoryLevel):
        # Guard against wrapping an unrelated platform type.
        assert isinstance(platform, XDNA2Platform), \
            f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}"
        super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel)

    def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str:
        """Return the memory level a tensor of *node* should be placed in.

        Mirrors MemoryXDNA2Platform: AIE-core-assigned nodes go to the
        engine's preferred level, all others to the default level.
        """
        assigned = getattr(node, '_engine_assignment', None)
        if isinstance(assigned, XDNA2AIECoreEngine) and hasattr(assigned, 'preferredMemoryLevel'):
            return assigned.preferredMemoryLevel
        return self.defaultTargetMemoryLevel.name
66 changes: 58 additions & 8 deletions Deeploy/Targets/XDNA2/Templates/AddTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from typing import TYPE_CHECKING

import numpy as np

from aie.dialects import aie as aie_d
from aie.dialects import aiex as aiex_d
from aie.dialects import arith as arith_d
Expand Down Expand Up @@ -51,19 +53,56 @@ def __init__(self):
# Parameter helpers
# ------------------------------------------------------------------

def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict:
def getAIEParams(self, operatorRepresentation: OperatorRepresentation,
tilingConstraint=None) -> dict:
"""Extract AIE parameters from the operator representation.

If tilingConstraint is available (tiling enabled), use information
from it. Otherwise fall back to fixed tile sizes.

Parameters
----------
operatorRepresentation : OperatorRepresentation
Parsed operator representation containing 'size' (total elements).
tilingConstraint : PatternMemoryConstraints, optional
Tiling solution from the solver. If provided, tile size is derived
from the tiling solution.

Returns
-------
dict
``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and
ensuring divisibility).
``num_elements``, ``tile_size`` (from tiling solution if available,
otherwise clamped to MAX_TILE_SIZE).
"""
num_elements = int(operatorRepresentation['size'])
tile_size = min(num_elements, self.MAX_TILE_SIZE)

# If tiling is enabled, extract tile size from the tiling solution
if tilingConstraint is not None:
# tilingConstraint is a PatternMemoryConstraints with nodeConstraints
nodeConstraint = tilingConstraint.nodeConstraints[0]
outputConstraints = nodeConstraint.outputTensorMemoryConstraints
if outputConstraints:
# Get the first output tensor's L1 memory constraint (tile shape)
firstOutputName = list(outputConstraints.keys())[0]
tensorConstraint = outputConstraints[firstOutputName]
# Use L1 constraint which holds the tile shape for the AIE core
if "L1" in tensorConstraint.memoryConstraints:
l1Constraint = tensorConstraint.memoryConstraints["L1"]
if l1Constraint.shape is not None:
tile_size = int(np.prod(l1Constraint.shape))
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)
else:
tile_size = min(num_elements, self.MAX_TILE_SIZE)

if num_elements % tile_size != 0:
tile_size = 1
# Round down to the largest divisor of num_elements that fits
tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0)

return {
'num_elements': num_elements,
'tile_size': tile_size,
Expand All @@ -81,8 +120,17 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None

* ``compute_tile`` — result of ``aie_d.tile(col, row)``
* ``shim_tile`` — result of ``aie_d.tile(col, 0)``
* ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution

Parameters
----------
operatorRepresentation : OperatorRepresentation
Parsed operator representation with 'size' and other attributes
**kwargs
compute_tile, shim_tile, tilingConstraint (optional)
"""
params = self.getAIEParams(operatorRepresentation)
tilingConstraint = kwargs.get('tilingConstraint', None)
params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint)
num_elements = params['num_elements']
tile_size = params['tile_size']
num_tiles = num_elements // tile_size
Expand Down Expand Up @@ -123,7 +171,7 @@ def _core():
scf_d.yield_([])

def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation,
seq_args: list) -> None:
seq_args: list, tilingConstraint=None) -> None:
"""Emit DMA configuration inside a runtime_sequence block.

Parameters
Expand All @@ -133,8 +181,10 @@ def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation,
seq_args : list
Block arguments of the runtime_sequence (memref values for
in1, in2, out — in the order matching the ONNX graph I/O).
tilingConstraint : NodeMemoryConstraint, optional
Tiling solution from the solver (currently ignored, for future use).
"""
params = self.getAIEParams(operatorRepresentation)
params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint)
num_elements = params['num_elements']

dims = [
Expand Down
16 changes: 16 additions & 0 deletions Deeploy/Targets/XDNA2/Tiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation."""

from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings
from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings

# For the Add operator, reuse the generic BOP (Binary Operator) tile
# constraint, which handles equal-dimension binary operations, wrapping the
# plain XDNA2 Add bindings so the tiler extension can attach memory
# constraints to Add nodes.
XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(
    nodeBindings=XDNA2AddBindings,  # non-tiled bindings being made tiling-ready
    tileConstraint=AddTileConstraint()  # equal-shape binary-op tiling rule
)
2 changes: 1 addition & 1 deletion DeeployTest/deeployRunner_xdna2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
from testUtils.deeployRunner import main

if __name__ == '__main__':
    # Launch the shared Deeploy test runner with XDNA2 defaults.
    # NOTE(review): tiling_enabled=True presumably switches the deployer to the
    # tiled mapping/code-generation path — confirm against
    # testUtils.deeployRunner.main's handling of this flag.
    sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True))
sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True))
Loading