From fbfd25940f622cbb0fc71dbf5a33d7d73bb7a113 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:19:46 -0500 Subject: [PATCH 001/109] adds an immutable tree implementation --- loopy/tools.py | 224 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) diff --git a/loopy/tools.py b/loopy/tools.py index d12ff750c..78969d31f 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -30,6 +30,9 @@ from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) from sys import intern +from typing import FrozenSet, Generic, TypeVar, Iterator +from pyrsistent import PMap, pmap +from dataclasses import dataclass import logging logger = logging.getLogger(__name__) @@ -928,4 +931,225 @@ def _get_persistent_hashable_arg(arg): return wrapper + +# {{{ tree data structure + +T = TypeVar("T") + + +@dataclass(frozen=True) +class Tree(Generic[T]): + """ + An immutable tree implementation. + + .. automethod:: ancestors + .. automethod:: parent + .. automethod:: children + .. automethod:: depth + .. automethod:: rename_node + .. automethod:: move_node + + .. note:: + + Almost all the operations are implemented recursively. NOT suitable for + deep trees. At the very least if the Python implementation is CPython + this allocates a new stack frame for each iteration of the operation. + """ + _parent_to_children: "PMap[T, FrozenSet[T]]" + _child_to_parent: "PMap[T, Optional[T]]" + + @staticmethod + def from_root(root: T): + return Tree(pmap({root: frozenset()}), + pmap({root: None})) + + @property + def root(self) -> T: + guess = set(self._child_to_parent).pop() + while self.parent(guess) is not None: + guess = self.parent(guess) + + return guess + + def ancestors(self, node: T) -> "FrozenSet[T]": + """ + Returns a :class:`frozenset` of nodes that are ancestors of *node*. 
+ """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => root + return frozenset() + + parent = self._child_to_parent[node] + + return frozenset([parent]) | self.ancestors(parent) + + def parent(self, node: T) -> "Optional[T]": + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._child_to_parent[node] + + def children(self, node: T) -> "FrozenSet[T]": + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._parent_to_children[node] + + def depth(self, node: T) -> int: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => None + return 0 + + return 1 + self.depth(self.parent(node)) + + def is_root(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self.parent(node) is None + + def is_leaf(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return len(self.children(node)) == 0 + + def is_a_node(self, node: T) -> bool: + return node in self._child_to_parent + + def add_node(self, node: T, parent: T) -> "Tree[T]": + """ + Returns a :class:`Tree` with added node *node* having a parent + *parent*. + """ + if self.is_a_node(node): + raise ValueError(f"'{node}' already present in tree.") + + siblings = self._parent_to_children[parent] + + return Tree((self._parent_to_children + .set(parent, siblings | frozenset([node])) + .set(node, frozenset())), + self._child_to_parent.set(node, parent)) + + def rename_node(self, node: T, new_id: T) -> "Tree[T]": + """ + Returns a copy of *self* with *node* renamed to *new_id*. 
+ """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not present in tree.") + + if self.is_a_node(new_id): + raise ValueError(f"cannot rename to '{new_id}', as its already a part" + " of the tree.") + + parent = self.parent(node) + children = self.children(node) + + # {{{ update child to parent + + new_child_to_parent = (self._child_to_parent.discard(node) + .set(new_id, parent)) + + for child in children: + new_child_to_parent = (new_child_to_parent + .set(child, new_id)) + + # }}} + + # {{{ update parent_to_children + + new_parent_to_children = (self._parent_to_children + .discard(node) + .set(new_id, self.children(node))) + + if parent is not None: + # update the child's name in the parent's children + new_parent_to_children = (new_parent_to_children + .discard(parent) + .set(parent, ((self.children(parent) + - frozenset([node])) + | frozenset([new_id])))) + + # }}} + + return Tree(new_parent_to_children, + new_child_to_parent) + + def move_node(self, node: T, new_parent: "Optional[T]") -> "Tree[T]": + """ + Returns a copy of *self* with node *node* as a child of *new_parent*. + """ + if self.is_root(node) and new_parent is not None: + raise ValueError("Moving root not allowed.") + + if not self.is_a_node(node): + raise ValueError(f"'{node}' not a part of the tree => cannot move.") + + if not self.is_a_node(new_parent): + raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.") + + parent = self.parent(node) + siblings = self.children(parent) + parents_new_children = siblings - frozenset([node]) + new_parents_children = self.children(new_parent) | frozenset([node]) + + new_child_to_parent = self._child_to_parent.set(node, new_parent) + new_parent_to_children = (self._parent_to_children + .set(parent, parents_new_children) + .set(new_parent, new_parents_children)) + + return Tree(new_parent_to_children, + new_child_to_parent) + + def __str__(self): + """ + Stringifies the tree by using the box-drawing unicode characters. 
+ + :: + + >>> from loopy.tools import Tree + >>> tree = (Tree.from_root("Root") + ... .add_node("A", "Root") + ... .add_node("B", "Root") + ... .add_node("D", "B") + ... .add_node("E", "B") + ... .add_node("C", "A")) + + >>> print(tree) + Root + ├── A + │ └── C + └── B + ├── D + └── E + """ + def rec(node): + children_result = [rec(c) for c in self.children(node)] + + def post_process_non_last_child(child): + return ["├── " + child[0]] + [f"│ {c}" for c in child[1:]] + + def post_process_last_child(child): + return ["└── " + child[0]] + [f" {c}" for c in child[1:]] + + children_result = ([post_process_non_last_child(c) + for c in children_result[:-1]] + + [post_process_last_child(c) + for c in children_result[-1:]]) + return [str(node)] + sum(children_result, start=[]) + + return "\n".join(rec(self.root)) + + def nodes(self) -> "Iterator[T]": + return iter(self._child_to_parent.keys()) + +# }}} + # vim: fdm=marker From 05624165578e90020544e61b830b279aa21975fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:21:12 -0500 Subject: [PATCH 002/109] add helpers to figure out loop nestings from a kernel --- loopy/schedule/tools.py | 415 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 415 insertions(+) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index ee6f75e4c..70120dee9 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -21,6 +21,11 @@ """ from loopy.kernel.data import AddressSpace +from loopy.diagnostic import LoopyError +from loopy.tools import Tree +from functools import reduce +from pytools import memoize_on_first_arg +from pyrsistent import pmap # {{{ block boundary finder @@ -138,3 +143,413 @@ def get_return_from_kernel_mapping(kernel): return return_from_kernel_idxs # }}} + + +def _pull_out_loop_nest(tree, loop_nests, inames_to_pull_out): + """ + Returns a copy of *tree* that realizes *inames_to_pull_out* as loop + nesting. 
+ + :arg tree: A :class:`loopy.tools.Tree`, where each node is + :class:`frozenset` of inames representing a loop nest. For example a + tree might look like: + + :arg loop_nests: A collection of nodes in *tree* that cover + *inames_to_pull_out*. + + :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``, + where outer_loop_nest is the identifier for the new outer and inner + loop nests so that *inames_to_pull_out* is a valid nesting. + + .. note:: + + We could compute *loop_nests* within this routine's implementation, but + computing would be expensive and hence we ask the caller for this info. + + Example:: + *tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k', 'l'}) + + *inames_to_pull_out*: frozenset({'k', 'i', 'j'}) + *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})} + + Returns: + + *new_tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k'}) + └── frozenset({'l'}) + + *outer_loop_nest*: frozenset({'k'}) + *inner_loop_nest*: frozenset({'l'}) + """ + assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests) + assert inames_to_pull_out <= reduce(frozenset.union, loop_nests, frozenset()) + + # {{{ sanity check to ensure the loop nest *inames_to_pull_out* is possible + + loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest)) + + for outer, inner in zip(loop_nests[:-1], loop_nests[1:]): + if outer != tree.parent(inner): + raise LoopyError(f"Cannot schedule loop nest {inames_to_pull_out} " + f" in the nesting tree:\n{tree}") + + assert tree.depth(loop_nests[0]) == 0 + + # }}} + + innermost_loop_nest = loop_nests[-1] + new_outer_loop_nest = inames_to_pull_out - reduce(frozenset.union, + loop_nests[:-1], + frozenset()) + new_inner_loop_nest = innermost_loop_nest - inames_to_pull_out + + if new_outer_loop_nest == innermost_loop_nest: + # such a loop nesting already exists => do nothing + return tree, new_outer_loop_nest, None + + # add the outer loop to our loop nest tree + tree = 
tree.add_node(new_outer_loop_nest, + parent=tree.parent(innermost_loop_nest)) + + # rename the old loop to the inner loop + tree = tree.rename_node(innermost_loop_nest, + new_id=new_inner_loop_nest) + + # set the parent of inner loop to be the outer loop + tree = tree.move_node(new_inner_loop_nest, new_parent=new_outer_loop_nest) + + return tree, new_outer_loop_nest, new_inner_loop_nest + + +def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest): + """ + Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*. + """ + # add the inner loop to our loop nest tree + return tree.add_node(inner_loop_nest, parent=outer_loop_nest) + + +def _order_loop_nests(loop_nest_tree, + strict_priorities, + relaxed_priorities, + iname_to_tree_node_id): + """ + Returns a loop nest where all nodes in the tree are instances of + :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to + multiple loop nestings, this routine returns a unique loop nest that is + obtained after constraining *loop_nest_tree* with the constraints enforced + by *strict_priorities* and *relaxed_priorities*. + + :arg strict_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These priorities are imposed + strictly i.e. if these conditions cannot be met a + :class:`loopy.diagnostic.LoopyError` is raised. + + :arg relaxed_priorities: Expresses relaxed nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These nesting constraints are + treated as options. + + :arg iname_to_tree_node_id: A mapping from iname to the loop nesting it's a + part of. + """ + from pytools.graph import compute_topological_order as toposort + from warnings import warn + + loop_nests = set(iname_to_tree_node_id.values()) + + # flow_requirements: A mapping from the loop nest level to the nesting + # constraints applicable to it. + # Each nesting constraint is represented as a DAG.
In the DAG, if there + # exists an edge from iname 'i' -> iname 'j' => 'j' should be nested + # inside 'i'. + flow_requirements = {loop_nest: {iname: frozenset() + for iname in loop_nest} + for loop_nest in loop_nests} + + # The plan here is to populate DAGs in *flow_requirements* and then perform + # a toposort for each loop nest. + + def _update_flow_requirements(priorities, cannot_satisfy_callback): + """ + Records *priorities* in *flow_requirements* and calls + *cannot_satisfy_callback* with an appropriate error message if the + priorities cannot be met. + """ + for priority in priorities: + for outer_iname, inner_iname in zip(priority[:-1], priority[1:]): + if inner_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {inner_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + if outer_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {outer_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + flow_requirements[inner_iname_nest][outer_iname] |= {inner_iname} + else: + ancestors_of_inner_iname = (loop_nest_tree + .ancestors(inner_iname_nest)) + ancestors_of_outer_iname = (loop_nest_tree + .ancestors(outer_iname_nest)) + if outer_iname in ancestors_of_inner_iname: + # nesting constraint already satisfied => do nothing + pass + elif inner_iname in ancestors_of_outer_iname: + cannot_satisfy_callback("Cannot satisfy constraint that" + f" iname '{inner_iname}' must be" + f" nested within '{outer_iname}''.") + else: + # inner iname and outer iname are indirect family members + # => must be realized via dependencies in the linearization + # phase,
not implemented in v2-scheduler yet. + from loopy.schedule import V2SchedulerNotImplementedException + raise V2SchedulerNotImplementedException("cannot" + " schedule kernels with priority dependencies" + " between sibling loop nests") + + def _raise_loopy_err(x): + raise LoopyError(x) + + # record strict priorities + _update_flow_requirements(strict_priorities, _raise_loopy_err) + # record relaxed priorities + _update_flow_requirements(relaxed_priorities, warn) + + # ordered_loop_nests: A mapping from the unordered loop nests to their + # ordered counterparts. For example, if we had only one loop nest + # `frozenset({"i", "j", "k"})`, and the priorities added the + # constraint that "i" must be nested within "k", then `ordered_loop_nests` + # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop + # nests would now have an order. + ordered_loop_nests = {unordered_nest: toposort(flow, + key=lambda x: x) + for unordered_nest, flow in flow_requirements.items()} + + # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nests' + + assert loop_nest_tree.root == frozenset() + + new_tree = Tree.from_root("") + + old_to_new_parent = {} + + old_to_new_parent[loop_nest_tree.root] = "" + + # traversing 'tree' in a BFS fashion to create 'new_tree' + queue = list(loop_nest_tree.children(loop_nest_tree.root)) + + while queue: + current_nest = queue.pop(0) + + ordered_nest = ordered_loop_nests[current_nest] + new_tree = new_tree.add_node(ordered_nest[0], + parent=old_to_new_parent[loop_nest_tree + .parent(current_nest)]) + for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]): + new_tree = new_tree.add_node(node=new_child, parent=new_parent) + + old_to_new_parent[current_nest] = ordered_nest[-1] + + queue.extend(list(loop_nest_tree.children(current_nest))) + + # }}} + + return new_tree + + +@memoize_on_first_arg +def _get_parallel_inames(kernel): + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + +
concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + return (concurrent_inames - ilp_inames - vec_inames) + + +def _get_partial_loop_nest_tree(kernel): + """ + Returns :class:`loopy.Tree` representing the *kernel*'s loop-nests. + + Each node of the returned tree has a :class:`frozenset` of inames. + All the inames in the identifier of a parent node of a loop nest in the + tree must be nested outside all the iname in identifier of the loop nest. + + .. note:: + + This routine only takes into account the nesting dependency + constraints of :attr:`loopy.InstructionBase.within_inames` of all the + *kernel*'s instructions and the iname tags. This routine does *NOT* + include the nesting constraints imposed by the dependencies between the + instructions and the dependencies imposed by the kernel's domain tree. + """ + from loopy.kernel.data import IlpBaseTag + + # figuring the possible loop nestings minus the concurrent_inames as they + # are never realized as actual loops + iname_chains = {insn.within_inames - _get_parallel_inames(kernel) + for insn in kernel.instructions} + + root = frozenset() + tree = Tree.from_root(root) + + # mapping from iname to the innermost loop nest they are part of in *tree*. + iname_to_tree_node_id = {} + + # if there were any loop with no inames, those have been already account + # for as the root. 
+ iname_chains = iname_chains - {root} + + for iname_chain in iname_chains: + not_seen_inames = frozenset(iname for iname in iname_chain + if iname not in iname_to_tree_node_id) + seen_inames = iname_chain - not_seen_inames + + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + seen_inames) + if not_seen_inames: + # make '_not_seen_inames' nest inside the seen ones. + # example: if there is already a loop nesting "i,j,k" + # and the current iname chain is "i,j,l". Only way this is possible + # is if "l" is nested within "i,j"-loops. + tree = _add_inner_loops(tree, outer_loop, not_seen_inames) + + # {{{ update iname to node id + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + for iname in not_seen_inames: + iname_to_tree_node_id[iname] = not_seen_inames + + # }}} + + # {{{ make ILP tagged inames innermost + + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + + for iname_chain in iname_chains: + for ilp_iname in (ilp_inames & iname_chains): + # pull out other loops so that ilp_iname is the innermost + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + (iname_chain + - {ilp_iname})) + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + # }}} + + return tree + + +def _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree): + """ + Returns the mapping from the iname to the *tree*'s node that it was a part + of. + + :arg tree: A partial loop nest tree. 
+ """ + iname_to_tree_node_id = {} + for node in tree.nodes(): + assert isinstance(node, frozenset) + for iname in node: + iname_to_tree_node_id[iname] = node + + return pmap(iname_to_tree_node_id) + + +def get_loop_nest_tree(kernel): + """ + Returns ```tree``` (an instance of :class:`Tree`) representing the loop + nesting for *kernel*. Each node of ``tree`` is an instance of :class:`str` + corresponding to the inames of *kernel* that are realized as concrete + ``for-loops``. A parent node in `tree` is always nested outside all its + children. + + .. note:: + + Multiple loop nestings might exist for *kernel*, but this routine returns + one valid loop nesting. + """ + from islpy import dim_type + + tree = _get_partial_loop_nest_tree(kernel) + iname_to_tree_node_id = ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) + + strict_loop_priorities = frozenset() + + # {{{ impose constraints by the domain tree + + loop_inames = (reduce(frozenset.union, + (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - _get_parallel_inames(kernel)) + + for dom in kernel.domains: + for outer_iname in set(dom.get_var_names(dim_type.param)): + if outer_iname not in loop_inames: + continue + + for inner_iname in dom.get_var_names(dim_type.set): + if inner_iname not in loop_inames: + continue + + # either outer_iname and inner_iname should belong to the same + # loop nest level or outer should be strictly outside inner + # iname + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + strict_loop_priorities |= {(outer_iname, inner_iname)} + else: + ancestors_of_inner_iname = tree.ancestors(inner_iname_nest) + if outer_iname_nest not in ancestors_of_inner_iname: + raise LoopyError(f"Loop '{outer_iname}' cannot be nested" + f" outside '{inner_iname}'.") + + # }}} + + return _order_loop_nests(tree, + strict_loop_priorities, + kernel.loop_priority, + 
iname_to_tree_node_id) + +# vim: fdm=marker From bf7ca9aee5d4c4def6d1b7644a5e29a21e8d8132 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:22:01 -0500 Subject: [PATCH 003/109] adds loopy scheduler v2 --- loopy/schedule/__init__.py | 228 ++++++++++++++++++++++++++++++++----- 1 file changed, 197 insertions(+), 31 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 5822f44ed..92ceda2e6 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -810,8 +810,161 @@ def is_similar_to_template(insn): # {{{ scheduling algorithm -def generate_loop_schedules_internal( - sched_state, debug=None): +def _get_outermost_diverging_inames(tree, within1, within2): + """ + For loop nestings *within1* and *within2*, returns the first inames at which + the loops nests diverge in the loop nesting tree *tree*. + + :arg tree: A :class:`loopy.tools.Tree` of inames, denoting a loop nesting. + :arg within1: A :class:`frozenset` of inames. + :arg within2: A :class:`frozenset` of inames. 
+ """ + common_ancestors = (within1 & within2) | {""} + + innermost_parent = max(common_ancestors, + key=lambda k: tree.depth(k)) + iname1, = tree.children(innermost_parent) & within1 + iname2, = tree.children(innermost_parent) & within2 + + return iname1, iname2 + + +class V2SchedulerNotImplementedException(RuntimeError): + pass + + +def generate_loop_schedules_v2(kernel): + from loopy.schedule.tools import get_loop_nest_tree + from functools import reduce + from pytools.graph import compute_topological_order + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + parallel_inames = (concurrent_inames - ilp_inames - vec_inames) + + # {{{ can v2 scheduler handle?? + + if any(len(insn.conflicts_with_groups) != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction having conflicts with groups.") + + if any(insn.priority != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") + + if kernel.linearization is not None: + # cannnot handle preschedule yet + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " prescheduled kernels.") + + if ilp_inames or vec_inames: + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " loops tagged with 'ilp'/'vec' as they are not guaranteed to" + " be single entry loops.") + + # }}} + + loop_nest_tree = get_loop_nest_tree(kernel) + + # loop_inames: inames that are realized as loops. 
Concurrent inames aren't + # realized as a loop in the generated code for a loopy.TargetBase. + loop_inames = (reduce(frozenset.union, (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - parallel_inames) + + # The idea here is to build a DAG, where nodes are schedule items and if + # there exists an edge from schedule item A to schedule item B in the DAG => + # B *must* come after A in the linearized result. + + dag = {} + + # LeaveLoop(i) *must* follow EnterLoop(i) + dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)}) + for iname in loop_inames}) + dag.update({LeaveLoop(iname=iname): frozenset() + for iname in loop_inames}) + dag.update({RunInstruction(insn_id=insn.id): frozenset() + for insn in kernel.instructions}) + + # {{{ add constraints imposed by the loop nesting + + for outer_loop in loop_nest_tree.nodes(): + if outer_loop == "": + continue + + for child in loop_nest_tree.children(outer_loop): + inner_loop = child + dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)} + dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)} + + # }}} + + # {{{ add deps. b/w schedule items coming from insn. dependencies + + for insn in kernel.instructions: + insn_loop_inames = insn.within_inames & loop_inames + for dep_id in insn.depends_on: + dep = kernel.id_to_insn[dep_id] + dep_loop_inames = dep.within_inames & loop_inames + # Enforce instruction dep: + dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)} + + # {{{ register deps on loop entry/leave because of insn.
deps + + if dep_loop_inames < insn_loop_inames: + for iname in insn_loop_inames - dep_loop_inames: + dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)} + elif insn_loop_inames < dep_loop_inames: + for iname in dep_loop_inames - insn_loop_inames: + dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + elif dep_loop_inames != insn_loop_inames: + insn_iname, dep_iname = _get_outermost_diverging_inames( + loop_nest_tree, insn_loop_inames, dep_loop_inames) + dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)} + else: + pass + + # }}} + + for iname in insn_loop_inames: + # For an insn within a loop nest 'i' + # for i + # insn + # end i + # 'insn' *must* come b/w 'for i' and 'end i' + dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)} + + # }}} + + def iname_key(iname): + all_ancestors = sorted(loop_nest_tree.ancestors(iname), + key=lambda x: loop_nest_tree.depth(x)) + return ",".join(all_ancestors+[iname]) + + def key(x): + if isinstance(x, RunInstruction): + iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames), + key=lambda k: loop_nest_tree.depth(k), + default="") + result = (iname_key(iname), x.insn_id) + elif isinstance(x, (EnterLoop, LeaveLoop)): + result = (iname_key(x.iname),) + else: + raise NotImplementedError + + return result + + return compute_topological_order(dag, key=key) + + +def generate_loop_schedules_internal(sched_state, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. 
kernel = sched_state.kernel @@ -1954,6 +2107,39 @@ def generate_loop_schedules(kernel, callables_table, debug_args=None): callables_table, debug_args=debug_args) +def postprocess_schedule(kernel, callables_table, gen_sched): + from loopy.kernel import KernelState + gen_sched = convert_barrier_instructions_to_barriers( + kernel, gen_sched) + + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + logger.debug("%s: barrier insertion: global" % kernel.name) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="global", verify_only=True) + + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) + + new_kernel = kernel.copy( + linearization=gen_sched, + state=KernelState.LINEARIZED) + + from loopy.schedule.device_mapping import \ + map_schedule_onto_host_or_device + if kernel.state != KernelState.LINEARIZED: + # Device mapper only gets run once. 
+ new_kernel = map_schedule_onto_host_or_device(new_kernel) + + from loopy.schedule.tools import add_extra_args_to_schedule + return add_extra_args_to_schedule(new_kernel) + + def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): if debug_args is None: debug_args = {} @@ -1963,6 +2149,14 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") + try: + gen_sched = generate_loop_schedules_v2(kernel) + yield postprocess_schedule(kernel, callables_table, gen_sched) + return + except V2SchedulerNotImplementedException as e: + from warnings import warn + warn(f"Falling back to a slow scheduler implementation due to: {e}") + schedule_count = 0 debug = ScheduleDebugger(**debug_args) @@ -2072,35 +2266,7 @@ def print_longest_dead_end(): sched_state, debug=debug, **schedule_gen_kwargs): debug.stop() - gen_sched = convert_barrier_instructions_to_barriers( - kernel, gen_sched) - - gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, - return_dict=True) - - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="global", verify_only=True) - - logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) - - new_kernel = kernel.copy( - linearization=gen_sched, - state=KernelState.LINEARIZED) - - from loopy.schedule.device_mapping import \ - map_schedule_onto_host_or_device - if kernel.state != KernelState.LINEARIZED: - # Device mapper only gets run once. 
- new_kernel = map_schedule_onto_host_or_device(new_kernel) - - from loopy.schedule.tools import add_extra_args_to_schedule - new_kernel = add_extra_args_to_schedule(new_kernel) + new_kernel = postprocess_schedule(kernel, callables_table, gen_sched) yield new_kernel debug.start() From 8c42d10b09953ff5c03e68b1ec43f57bbc06ad52 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:22:57 -0500 Subject: [PATCH 004/109] changes in docs to account for equivalent generated codes from the same LoopKernel --- doc/tutorial.rst | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 2671de282..63b989b33 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -610,7 +610,7 @@ commonly called 'loop tiling': ... assumptions="n mod 16 = 0 and n >= 1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) - >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") + >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner") >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -1029,8 +1029,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - acc_k = 0.0f; a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1053,10 +1053,10 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... 
- if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) + acc_k = 0.0f; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { @@ -1908,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; - { - int const k_outer = 0; - + for (int i = 0; i <= 49; ++i) for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } - } + { + int const k_outer = 0; + + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } } In this kernel, when a work-item performs the second instruction it uses data From 12189b9baac15d13508ba0b996554b6258704a07 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Apr 2021 23:37:54 -0500 Subject: [PATCH 005/109] store loopy domains as LoopKernelDomains instead of list * Provides better performance via taking into account incremental updates * Adds :mod:`pyrsistent` as a dep. 
--- loopy/kernel/__init__.py | 161 +++++++++++++++++++++++++++++++++-- loopy/kernel/creation.py | 3 +- loopy/kernel/tools.py | 6 +- loopy/tools.py | 7 +- loopy/transform/data.py | 6 +- loopy/transform/fusion.py | 7 +- loopy/transform/iname.py | 24 +++--- loopy/transform/parameter.py | 7 +- loopy/transform/save.py | 4 +- 9 files changed, 190 insertions(+), 35 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 7425f0d3f..db0cb48fc 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -39,6 +39,7 @@ from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname +from pyrsistent import PClass, field, pmap, pvector from warnings import warn @@ -146,8 +147,154 @@ def SCHEDULED(): # pylint:disable=no-method-argument def _get_inames_from_domains(domains): - return frozenset().union(* - (frozenset(dom.get_var_names(dim_type.set)) for dom in domains)) + return domains.set_dims + + +class LoopKernelDomains(PClass): + """ + Records the domain information seen in a :class:`loopy.kernel.LoopKernel`. + + .. attribute:: _domains + + A :class:`pyrsistent.PVector` of :class:`islpy.BasicSet` instances + representing the :ref:`domain-tree`. + + .. attribute:: param_to_idoms + + A :class:`pyrsistent.PMap` of dim names to :class:`frozenset` of + indices of domains in which the dims appear as + :class:`islpy.dim_type.param`-type dims. + + .. attribute:: home_domain_map + + A :class:`pyrsistent.PMap` of dim names to the index of the domain in + which the dims appear as :class:`islpy.dim_type.set`-type dim. + + .. automethod:: append + .. automethod:: swap + """ + _domains = field() + param_to_idoms = field() + home_domain_map = field() + + def __getitem__(self, key): + return self._domains[key] + + def append(self, dom): + """ + Returns a copy of *self* with *dom* appended to it's domains. 
+ """ + assert dom.get_ctx() == isl.DEFAULT_CONTEXT + + param_to_idoms_update = {} + idom = len(self._domains) + + for var in dom.get_var_names(dim_type.param): + param_to_idoms_update[var] = (self.param_to_idoms.get(var, frozenset()) + | frozenset([idom])) + + hdm_update = {k: idom for k in dom.get_var_names(dim_type.set)} + + return LoopKernelDomains(_domains=self._domains.append(dom), + param_to_idoms=(self.param_to_idoms + .update(param_to_idoms_update)), + home_domain_map=(self.home_domain_map + .update(hdm_update) + )) + + def swap(self, idom, domain): + """ + Returns a copy of *self* with its *idom*-th domain replaced with + *domain*. + """ + from functools import reduce + + new_domains = self._domains.set(idom, domain) + hdm = reduce(lambda x, y: x.set(y, idom), + domain.get_var_names(dim_type.set), + reduce(lambda x, y: x.remove(y), + self._domains[idom].get_var_names(dim_type.set), + self.home_domain_map)) + + param_to_idoms = self.param_to_idoms + param_to_idoms_update = defaultdict(list) + + # {{{ remove the params of old domains + + for par in self._domains[idom].get_var_names(dim_type.param): + if param_to_idoms[par] == frozenset([idom]): + param_to_idoms = param_to_idoms.remove(par) + else: + assert idom in param_to_idoms[par] + param_to_idoms_update[par] = list(param_to_idoms[par] + - frozenset([idom])) + + # }}} + + # {{{ add the params from new_domains + + for var in domain.get_var_names(dim_type.param): + param_to_idoms_update[var].append(idom) + + # }}} + + param_to_idoms_update = {k: frozenset(v) + for k, v in param_to_idoms_update.items()} + + return LoopKernelDomains(_domains=new_domains, + home_domain_map=hdm, + param_to_idoms=(param_to_idoms + .update(param_to_idoms_update))) + + def extend(self, domains): + from functools import reduce + return reduce(lambda x, y: x.append(y), domains, self) + + def __add__(self, other): + if isinstance(other, list): + return self.extend(other) + + return NotImplemented + + def __iter__(self): + return 
iter(self._domains) + + def __len__(self): + return len(self._domains) + + def thaw(self): + from pyrsistent import thaw + return thaw(self._domains) + + @property + def set_dims(self): + return frozenset(self.home_domain_map.keys()) + + @property + def param_dims(self): + return frozenset(self.param_to_idoms.keys()) + + def update_persistent_hash(self, key_hash, key_builder): + """Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + """ + for field_name in sorted(self._pclass_fields): + key_builder.rec(key_hash, getattr(self, field_name)) + + +def make_loop_kernel_domains(domains): + param_to_idoms = defaultdict(frozenset) + for idom, dom in enumerate(domains): + for var in dom.get_var_names(dim_type.param): + param_to_idoms[var] |= frozenset([idom]) + + home_domain_map = pmap({iname: i_domain + for i_domain, dom in enumerate(domains) + for iname in dom.get_var_names(dim_type.set)}) + + return LoopKernelDomains(_domains=pvector(domains), + param_to_idoms=pmap(param_to_idoms), + home_domain_map=home_domain_map) class _not_provided: # noqa: N801 @@ -166,8 +313,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances representing the - :ref:`domain-tree`. + an instance of :class:`loopy.kernel.LoopKernelDomains`. .. 
attribute:: instructions @@ -338,6 +484,8 @@ def __init__(self, domains, instructions, args=None, assert isinstance(assumptions, isl.BasicSet) assert assumptions.is_params() + assert isinstance(domains, LoopKernelDomains) + from loopy.types import to_loopy_type index_dtype = to_loopy_type(index_dtype) if not index_dtype.is_integral(): @@ -589,10 +737,7 @@ def all_parents_per_domain(self): @memoize_method def _get_home_domain_map(self): - return { - iname: i_domain - for i_domain, dom in enumerate(self.domains) - for iname in dom.get_var_names(dim_type.set)} + return self.domains.home_domain_map def get_home_domain_index(self, iname): return self._get_home_domain_map()[iname] diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index d88c6e54a..9509a7e5e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2404,7 +2404,8 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): temporary_variables[tv.name] = tv del cse_temp_vars - domains = parse_domains(domains, defines) + from loopy.kernel import make_loop_kernel_domains + domains = make_loop_kernel_domains(parse_domains(domains, defines)) # {{{ process assumptions diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 9806fbe8d..859e1641b 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -475,11 +475,11 @@ def get_original_domain(self): return self.kernel.domains[self.leaf_domain_index] def get_domains_with(self, replacement): - result = self.kernel.domains[:] + result = self.kernel.domains if self.leaf_domain_index is not None: - result[self.leaf_domain_index] = replacement + result = result.swap(self.leaf_domain_index, replacement) else: - result.append(replacement) + result = result.append(replacement) return result diff --git a/loopy/tools.py b/loopy/tools.py index 78969d31f..b649b4b17 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -131,7 +131,12 @@ def update_for_pymbolic_expression(self, key_hash, key): else: 
PersistentHashWalkMapper(key_hash)(key) - update_for_PMap = update_for_dict # noqa: N815 + def _update_for_pyrsistent_containers(self, key_hash, key): + from pyrsistent import thaw + self.rec(key_hash, thaw(key)) + + update_for_PVector = _update_for_pyrsistent_containers # noqa: N815 + update_for_PMap = _update_for_pyrsistent_containers # noqa: N815 class PymbolicExpressionHashWrapper: diff --git a/loopy/transform/data.py b/loopy/transform/data.py index c91aee4c3..74e29dd9a 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -359,7 +359,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, # {{{ remove inames that were temporarily added by slice sweeps - new_domains = new_kernel.domains[:] + new_domains = new_kernel.domains for iname in inames_to_be_removed: home_domain_index = kernel.get_home_domain_index(iname) @@ -367,8 +367,8 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, dt, idx = domain.get_var_dict()[iname] assert dt == dim_type.set - - new_domains[home_domain_index] = domain.project_out(dt, idx, 1) + new_domains = new_domains.swap(home_domain_index, + domain.project_out(dt, idx, 1)) new_kernel = new_kernel.copy(domains=new_domains) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 6e28d9e7b..e4bed4ad8 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -133,12 +133,12 @@ def _fuse_two_kernels(kernela, kernelb): # {{{ fuse domains - new_domains = kernela.domains[:] + new_domains = kernela.domains for dom_b in kernelb.domains: i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains) if i_fuse is None: - new_domains.append(dom_b) + new_domains = new_domains.append(dom_b) else: dom_a = new_domains[i_fuse] dom_a, dom_b = isl.align_two(dom_a, dom_b) @@ -156,8 +156,7 @@ def _fuse_two_kernels(kernela, kernelb): "inames '%s'" % (",".join(shared_inames))) new_domain = dom_a & dom_b - - new_domains[i_fuse] = new_domain + new_domains = 
new_domains.swap(i_fuse, new_domain) # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 3712d678b..c939dbb45 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -269,10 +269,12 @@ def _split_iname_backend(kernel, iname_to_split, if inner_iname is None: inner_iname = vng(iname_to_split+"_inner") - new_domains = [ - _split_iname_in_set(dom, iname_to_split, inner_iname, outer_iname, - fixed_length, fixed_length_is_inner) - for dom in kernel.domains] + new_domains = kernel.domains + for idom, dom in enumerate(kernel.domains): + if iname_to_split in dom.get_var_dict(): + new_domains = new_domains.swap(idom, _split_iname_in_set( + dom, iname_to_split, inner_iname, outer_iname, fixed_length, + fixed_length_is_inner)) from pymbolic import var inner = var(inner_iname) @@ -1304,20 +1306,19 @@ def remove_unused_inames(kernel, inames=None): domains = kernel.domains for iname in unused_inames: - new_domains = [] - for dom in domains: + for idom, dom in enumerate(domains): try: dt, idx = dom.get_var_dict()[iname] except KeyError: pass else: dom = dom.project_out(dt, idx, 1) - new_domains.append(dom) - domains = new_domains + domains = domains.swap(idom, dom) - kernel = kernel.copy(domains=domains) + kernel = kernel.copy(domains=domains, + inames=new_inames) # }}} @@ -1564,7 +1565,7 @@ def parse_equation(eqn): new_inames_set = frozenset(new_inames) old_inames_set = frozenset(old_inames) - new_domains = [] + new_domains = kernel.domains for idom, dom in enumerate(kernel.domains): dom_var_dict = dom.get_var_dict() old_iname_overlap = [ @@ -1573,7 +1574,6 @@ def parse_equation(eqn): if iname in dom_var_dict] if not old_iname_overlap: - new_domains.append(dom) continue from loopy.symbolic import get_dependencies @@ -1644,7 +1644,7 @@ def parse_equation(eqn): dt, idx = dom.get_var_dict()[iname] dom = dom.project_out(dt, idx, 1) - new_domains.append(dom) + new_domains = new_domains.swap(idom, dom) # }}} diff --git 
a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 5cffdcf23..38088deda 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -24,6 +24,7 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from islpy import dim_type from loopy.translation_unit import for_each_kernel from loopy.kernel import LoopKernel @@ -89,7 +90,11 @@ def process_set(s): return s - new_domains = [process_set(dom) for dom in kernel.domains] + new_domains = kernel.domains + + for idom, dom in enumerate(kernel.domains): + if name in dom.get_var_names(dim_type.param): + new_domains = new_domains.swap(idom, process_set(dom)) from pymbolic.mapper.substitutor import make_subst_func subst_func = make_subst_func({name: value}) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 3ced06bfc..fd4f75ebf 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -599,10 +599,10 @@ def finish(self): self.updated_iname_to_tags.update(self.kernel.iname_to_tags) self.updated_temporary_variables.update(self.kernel.temporary_variables) - new_domains = list(self.kernel.domains) + new_domains = self.kernel.domains import islpy as isl if self.new_subdomain.dim(isl.dim_type.set) > 0: - new_domains.append(self.new_subdomain) + new_domains = new_domains.append(self.new_subdomain) kernel = self.kernel.copy( domains=new_domains, From a73125144b1ff4dc3922be909122517c6ffffdbb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Apr 2021 23:41:42 -0500 Subject: [PATCH 006/109] store LoopKernel.inames as InameDict instead of a dict * dict would be a dense data structure that would require lead costly LoopKernel copies. 
--- loopy/kernel/__init__.py | 106 ++++++++++++++++++++++++++++++++++---- loopy/kernel/creation.py | 8 ++- loopy/kernel/tools.py | 15 +++--- loopy/transform/fusion.py | 11 ++-- loopy/transform/iname.py | 10 ++-- 5 files changed, 119 insertions(+), 31 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index db0cb48fc..e42ab4a06 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -150,6 +150,94 @@ def _get_inames_from_domains(domains): return domains.set_dims +class InameDict(PClass): + """ + A mapping from iname names to corresponding instances of + :class:`loopy.kernel.data.Iname`. + + :attr data: An instance of :class:`pyrsistent.PMap` from iname names + to instances of :class:`~loopy.kernel.data.Iname`. + :attr all_inames: A :class:`frozenset` of names of all inames in a + :class:`LoopKernel` + + .. note:: + + * Inames that are not a part of :attr:`InameDict.data`, but are seen in + :attr`InameDict.all_inames` are realized as instances of + :class:`~loopy.kernel.data.Iname` with no tags. + + * This class was introduced to cut-down the operation and storage + overhead that comes with maintaining default instances of + :class:`~loopy.kernel.data.Iname`. + + .. automethod:: set + .. automethod:: remove + .. 
automethod:: discard + """ + data = field() + all_inames = field() + + def copy(self, data=None, all_inames=None): + if all_inames is None: + all_inames = self.all_inames + + if data is None: + data = self.data + + return InameDict(data=data, all_inames=all_inames) + + def __getitem__(self, key): + try: + return self.data[key] + except KeyError: + if key in self.all_inames: + return Iname(key, frozenset()) + else: + raise KeyError + + def set(self, key, val): + assert isinstance(val, Iname) + return self.copy(self.data.set(key, val), + self.all_inames | frozenset([val.name])) + + def remove(self, key): + if key not in self.all_inames: + raise LoopyError(f"Cannot remove unknown iname '{key}'") + + return self.copy(self.data.discard(key), + self.all_inames - frozenset([key])) + + def discard(self, key): + return self.copy(self.data.discard(key), + self.all_inames - frozenset([key])) + + def __iter__(self): + return iter(self.all_inames) + + def keys(self): + return iter(self.all_inames) + + def items(self): + return ((k, self[k]) for k in self.keys()) + + def values(self): + return (self[k] for k in self.keys()) + + def update_persistent_hash(self, key_hash, key_builder): + """Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + """ + for field_name in sorted(self._pclass_fields): + key_builder.rec(key_hash, getattr(self, field_name)) + + +def make_iname_dict(tagged_inames, all_inames): + assert set(tagged_inames) <= all_inames + assert isinstance(tagged_inames, dict) + assert isinstance(all_inames, frozenset) + return InameDict(data=pmap(tagged_inames), all_inames=all_inames) + + class LoopKernelDomains(PClass): """ Records the domain information seen in a :class:`loopy.kernel.LoopKernel`. @@ -385,9 +473,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): .. 
attribute:: inames - An instance of :class:`dict`, a mapping from the names of kernel's - inames to their corresponding instances of :class:`loopy.kernel.data.Iname`. - An entry is guaranteed to be present for each iname. + An instance of :class:`~loopy.kernel.InameDict`. .. automethod:: __call__ .. automethod:: copy @@ -470,11 +556,10 @@ def __init__(self, domains, instructions, args=None, raise LoopyError("Cannot provide both iname_to_tags and inames to " "LoopKernel.__init__") - inames = { - name: inames.get(name, Iname(name, frozenset())) - for name in _get_inames_from_domains(domains)} + inames = make_iname_dict({k: Iname(v) for k, v in iname_to_tags.items()}, + self.domain.set_dims) - assert isinstance(inames, dict) + assert isinstance(inames, InameDict) if index_dtype is None: index_dtype = np.int32 @@ -1763,10 +1848,9 @@ def get_copy_kwargs(self, **kwargs): iname_to_tags = kwargs["iname_to_tags"] domains = kwargs.get("domains", self.domains) - kwargs["inames"] = {name: Iname(name, - iname_to_tags.get(name, frozenset())) - for name in _get_inames_from_domains(domains) - } + kwargs["inames"] = make_iname_dict({k: Iname(k, v) + for k, v in iname_to_tags.items()}, + self.domains.set_dims) del kwargs["iname_to_tags"] if "domains" in kwargs: diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 9509a7e5e..2a6a98d1c 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2431,11 +2431,9 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): raise LoopyError("assumptions must be either 'str' or BasicSet") # }}} - - from loopy.kernel.data import Iname - from loopy.kernel import _get_inames_from_domains - inames = {name: Iname(name, frozenset()) - for name in _get_inames_from_domains(domains)} + from loopy.kernel import (_get_inames_from_domains, + make_loop_kernel_domains, make_iname_dict) + inames = make_iname_dict({}, _get_inames_from_domains(domains)) arg_guesser = ArgumentGuesser(domains, instructions, 
temporary_variables, substitutions, diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 859e1641b..b897dbcb1 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -822,11 +822,12 @@ def assign_axis(recursion_axis, iname, axis=None): except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. - new_inames = kernel.inames.copy() - new_inames[iname] = kernel.inames[iname].copy( - tags=frozenset(tag - for tag in kernel.inames[iname].tags - if not isinstance(tag, AutoLocalInameTagBase))) + new_inames = kernel.inames + new_inames = kernel.inames.set(iname, + kernel.inames[iname].copy(tags=frozenset( + tag + for tag in kernel.inames[iname].tags + if not isinstance(tag, AutoLocalInameTagBase)))) return assign_automatic_axes( kernel.copy(inames=new_inames), callables_table, @@ -895,8 +896,8 @@ def assign_axis(recursion_axis, iname, axis=None): frozenset(tag for tag in kernel.inames[iname].tags if not isinstance(tag, AutoLocalInameTagBase)) | new_tag_set) - new_inames = kernel.inames.copy() - new_inames[iname] = kernel.inames[iname].copy(tags=new_tags) + new_inames = kernel.inames + new_inames = new_inames.set(iname, kernel.inames[iname].copy(tags=new_tags)) return assign_automatic_axes(kernel.copy(inames=new_inames), callables_table, axis=recursion_axis, local_size=local_size) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index e4bed4ad8..b8820c43f 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -231,7 +231,8 @@ def _fuse_two_kernels(kernela, kernelb): # }}} - from loopy.kernel import LoopKernel + from loopy.kernel import LoopKernel, make_iname_dict + from pyrsistent import thaw return LoopKernel( domains=new_domains, instructions=new_instructions, @@ -244,10 +245,10 @@ def _fuse_two_kernels(kernela, kernelb): local_sizes=_merge_dicts( "local size", kernela.local_sizes, kernelb.local_sizes), temporary_variables=new_temporaries, - inames=_merge_dicts( - "inames", - 
kernela.inames, - kernelb.inames), + inames=make_iname_dict(_merge_dicts("inames", + thaw(kernela.inames.data), + thaw(kernelb.inames.data)), + new_domains.set_dims), substitutions=_merge_dicts( "substitution", kernela.substitutions, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c939dbb45..459c1eb55 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -683,8 +683,10 @@ def untag_inames(kernel, iname_to_untag, tag_type): tags_to_remove = filter_iname_tags_by_type( kernel.inames[iname_to_untag].tags, tag_type) new_inames = kernel.inames.copy() - new_inames[iname_to_untag] = kernel.inames[iname_to_untag].without_tags( - tags_to_remove, verify_existence=False) + new_inames = new_inames.set(iname_to_untag, + kernel.inames[iname_to_untag] + .without_tags(tags_to_remove, + verify_existence=False)) return kernel.copy(inames=new_inames) @@ -805,7 +807,7 @@ def parse_tag(tag): if name not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % name) - knl_inames[name] = knl_inames[name].tagged(new_tag) + knl_inames = knl_inames.set(name, knl_inames[iname].tagged(new_tag)) return kernel.copy(inames=knl_inames) @@ -1305,7 +1307,9 @@ def remove_unused_inames(kernel, inames=None): # {{{ remove them domains = kernel.domains + new_inames = kernel.inames for iname in unused_inames: + new_inames = new_inames.remove(iname) for idom, dom in enumerate(domains): try: From 26c211b139a3d5a2f9d750fadab368754e375f4a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Apr 2021 23:44:32 -0500 Subject: [PATCH 007/109] exit early strategy: calculating parents_per_domain --- loopy/kernel/__init__.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e42ab4a06..593875ea3 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -743,6 +743,13 @@ def parents_per_domain(self): tree to the root. 
""" + # {{{ exit early strategy: all domains are roots + + if self.domains.param_dims <= self.get_unwritten_value_args(): + return [None, ] * len(self.domains) + + # }}} + # The stack of iname sets records which inames are active # as we step through the linear list of domains. It also # determines the granularity of inames to be popped/decactivated @@ -806,10 +813,18 @@ def all_parents_per_domain(self): Each domains nest list walks from the leaves of the nesting tree to the root. """ - result = [] + result = [] ppd = self.parents_per_domain() - for parent in ppd: + + # {{{ exit early strategy: all domains are roots + + if set(ppd) == {None}: + return [[], ] * len(self.domains) + + # }}} + + for dom, parent in zip(self.domains, ppd): # keep walking up tree to find *all* parents dom_result = [] while parent is not None: From ddd48528778a9ca08c76033233d34ef345dc4205 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Apr 2021 23:45:00 -0500 Subject: [PATCH 008/109] exit early strategy: remove unused iname --- loopy/transform/iname.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 459c1eb55..761edfafd 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1311,6 +1311,18 @@ def remove_unused_inames(kernel, inames=None): for iname in unused_inames: new_inames = new_inames.remove(iname) + # {{{ easy update: iname is only a set dim + + if iname not in domains.param_dims: + idom = domains.home_domain_map[iname] + dom = domains[idom] + dt, idx = dom.get_var_dict()[iname] + dom = dom.project_out(dt, idx, 1) + domains = domains.swap(idom, dom) + continue + + # }}} + for idom, dom in enumerate(domains): try: dt, idx = dom.get_var_dict()[iname] From f7b0f2b968fa875541c0c316c296fca7c95d1477 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 00:38:47 -0500 Subject: [PATCH 009/109] implement LoopKernelDomains.__radd__ --- loopy/kernel/__init__.py | 27 
+++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 593875ea3..09df28f57 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -344,6 +344,33 @@ def __add__(self, other): return NotImplemented + def __radd__(self, other): + if not isinstance(other, (list, PVector)): + return NotImplemented + + if isinstance(other, list): + other = pvector(other) + + # {{{ update all domain indices + + home_domain_map = {k: v+len(other) + for k, v in self.home_domain_map.items()} + param_to_idoms = {k: frozenset(map(lambda x: x+len(other), v)) + for k, v in self.param_to_idoms.items()} + + # }}} + + for idom, dom in enumerate(other): + for dim in dom.get_var_names(dim_type.set): + home_domain_map[dim] = idom + + for dim in dom.get_var_names(dim_type.param): + param_to_idoms[dim] = idom + + return LoopKernelDomains(_domains=other+self._domains, + param_to_idoms=pmap(param_to_idoms), + home_domain_map=pmap(home_domain_map)) + def __iter__(self): return iter(self._domains) From 67a4b4a9c1a906268b6bf29f14ada3d1a3e96903 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 00:47:07 -0500 Subject: [PATCH 010/109] fixup! store LoopKernel.inames as InameDict instead of a dict --- loopy/transform/iname.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 761edfafd..913b2822a 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -807,7 +807,7 @@ def parse_tag(tag): if name not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % name) - knl_inames = knl_inames.set(name, knl_inames[iname].tagged(new_tag)) + knl_inames = knl_inames.set(name, knl_inames[name].tagged(new_tag)) return kernel.copy(inames=knl_inames) From 27364d5f44bd4acd974a3c320bc704a70ec13749 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 00:47:40 -0500 Subject: [PATCH 011/109] fixup! 
store loopy domains as LoopKernelDomains instead of list --- loopy/transform/data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 74e29dd9a..7fd099b94 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -728,10 +728,9 @@ def rename_arg_in_basic_set(dom): return dom - new_domains = [] - for dom in kernel.domains: - dom = rename_arg_in_basic_set(dom) - new_domains.append(dom) + new_domains = kernel.domains + for idom, dom in enumerate(kernel.domains): + new_domains = new_domains.swap(idom, rename_arg_in_basic_set(dom)) new_assumptions = rename_arg_in_basic_set(kernel.assumptions) From a67aa734c352b6053893ba24b48a6a0f6c9b9213 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 00:47:48 -0500 Subject: [PATCH 012/109] exit early: domain to be swapped *is* already inplace --- loopy/kernel/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 09df28f57..eba7bfe05 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -39,7 +39,7 @@ from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname -from pyrsistent import PClass, field, pmap, pvector +from pyrsistent import PClass, field, pmap, pvector, PVector from warnings import warn @@ -295,6 +295,10 @@ def swap(self, idom, domain): Returns a copy of *self* with its *idom*-th domain replaced with *domain*. 
""" + + if domain is self._domains[idom]: + return self + from functools import reduce new_domains = self._domains.set(idom, domain) From e6dde67d7711a997b9a907d249ba025a936bee69 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 16:13:46 -0500 Subject: [PATCH 013/109] simplify the logic in LoopKernelDomains.(swap|delete) --- loopy/kernel/__init__.py | 103 ++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index eba7bfe05..0de71f686 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -260,6 +260,7 @@ class LoopKernelDomains(PClass): .. automethod:: append .. automethod:: swap + .. automethod:: delete """ _domains = field() param_to_idoms = field() @@ -301,42 +302,122 @@ def swap(self, idom, domain): from functools import reduce + # {{{ swap dim names in home_domain_map + new_domains = self._domains.set(idom, domain) - hdm = reduce(lambda x, y: x.set(y, idom), + hdm = reduce(lambda acc, y: acc.set(y, idom), domain.get_var_names(dim_type.set), - reduce(lambda x, y: x.remove(y), + reduce(lambda acc, y: acc.remove(y), self._domains[idom].get_var_names(dim_type.set), self.home_domain_map)) + # }}} param_to_idoms = self.param_to_idoms - param_to_idoms_update = defaultdict(list) # {{{ remove the params of old domains + param_to_idoms_update = {} + for par in self._domains[idom].get_var_names(dim_type.param): if param_to_idoms[par] == frozenset([idom]): param_to_idoms = param_to_idoms.remove(par) else: assert idom in param_to_idoms[par] - param_to_idoms_update[par] = list(param_to_idoms[par] - - frozenset([idom])) + param_to_idoms_update[par] = param_to_idoms[par] - frozenset([idom]) + + param_to_idoms = param_to_idoms.update(param_to_idoms_update) # }}} # {{{ add the params from new_domains - for var in domain.get_var_names(dim_type.param): - param_to_idoms_update[var].append(idom) + param_to_idoms_update = {} + + for par in 
domain.get_var_names(dim_type.param): + param_to_idoms_update[par] = (param_to_idoms.get(par, frozenset()) + | frozenset([idom])) + + param_to_idoms = param_to_idoms.update(param_to_idoms_update) # }}} - param_to_idoms_update = {k: frozenset(v) - for k, v in param_to_idoms_update.items()} + return LoopKernelDomains(_domains=new_domains, + home_domain_map=hdm, + param_to_idoms=param_to_idoms) + + def delete(self, idom): + """ + Returns an instance of :class:`LoopKernelDomains` with + the domain at *idom* removed. + + .. note:: + + It would be cheaper to call :meth:`LoopKernelDomains.swap` instead + of calling :meth:`LoopKernelDomains.delete` and + :meth:`LoopKernelDomains.insert`. + """ + from functools import reduce + new_domains = self._domains.delete(idom) + + param_to_idoms = self.param_to_idoms + + # {{{ remove the params of old domains + + param_to_idoms_update = {} + for par in self._domains[idom].get_var_names(dim_type.param): + if param_to_idoms[par] == frozenset([idom]): + param_to_idoms = param_to_idoms.remove(par) + else: + assert idom in param_to_idoms[par] + param_to_idoms_update[par] = (param_to_idoms[par] + - frozenset([idom])) + + param_to_idoms = param_to_idoms.update(param_to_idoms_update) + + # }}} + + # {{{ update the indices of all domains in param_to_idoms for indices>idom + + param_to_idoms_update = {} + all_params_from_idom_plus_1 = reduce( + lambda acc, dom: acc.union(frozenset(dom.get_var_names(dim_type.param))), + self._domains[idom+1:], + frozenset()) + param_to_idoms_update = {par: frozenset(k if k < idom else k-1 + for k in param_to_idoms[par]) + for par in all_params_from_idom_plus_1} + param_to_idoms = param_to_idoms.update(param_to_idoms_update) + + # }}} + + # {{{ update the indices of all domains in home_domain_map for indices>idom + + # remove all the idom's set dims + hdm = reduce(lambda acc, x: acc.remove(x), + self._domains[idom].get_var_names(dim_type.set), + self.home_domain_map) + + hdm_update = {} + for i, dom in 
enumerate(self._domains[idom+1:], + start=idom+1): + for dim_name in dom.get_var_names(dim_type.set): + assert self.home_domain_map[dim_name] == i + hdm_update[dim_name] = i-1 + + hdm = hdm.update(hdm_update) + + # }}} return LoopKernelDomains(_domains=new_domains, home_domain_map=hdm, - param_to_idoms=(param_to_idoms - .update(param_to_idoms_update))) + param_to_idoms=param_to_idoms) + + def insert(self, idom, domain): + """ + Returns a copy of *self* with *domain* inserted at the *idom*-index in + :attr:`LoopKernel._domains`. + """ + raise NotImplementedError def extend(self, domains): from functools import reduce From 88d28350829a2bff760de78cf37628a6c49baf5b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 16:14:31 -0500 Subject: [PATCH 014/109] fixup! store loopy domains as LoopKernelDomains instead of list --- loopy/loop.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/loopy/loop.py b/loopy/loop.py index af61b7db5..c5e7d255b 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -114,16 +114,15 @@ def merge_loop_domains(kernel): # }}} - new_domains = kernel.domains[:] + new_domains = kernel.domains min_idx = min(inner_domain_idx, outer_domain_idx) max_idx = max(inner_domain_idx, outer_domain_idx) - del new_domains[max_idx] - del new_domains[min_idx] + new_domains = new_domains.delete(max_idx) outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom) - new_domains.insert(min_idx, inner_dom & outer_dom) + new_domains = new_domains.swap(min_idx, inner_dom & outer_dom) break if new_domains: From 6e5176a9359678181fab701d513f7902578840b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 16:47:29 -0500 Subject: [PATCH 015/109] uses a frozendataclass rather than pyrsistent.PClass * pyrsistent.PClass doesn't play well with pylint --- loopy/kernel/__init__.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py 
index 0de71f686..e619cc8bd 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -39,7 +39,9 @@ from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname -from pyrsistent import PClass, field, pmap, pvector, PVector +from pyrsistent import pmap, pvector, PVector, PMap +from typing import FrozenSet +from dataclasses import dataclass from warnings import warn @@ -150,7 +152,8 @@ def _get_inames_from_domains(domains): return domains.set_dims -class InameDict(PClass): +@dataclass(frozen=True) +class InameDict: """ A mapping from iname names to corresponding instances of :class:`loopy.kernel.data.Iname`. @@ -174,8 +177,8 @@ class InameDict(PClass): .. automethod:: remove .. automethod:: discard """ - data = field() - all_inames = field() + data: PMap + all_inames: FrozenSet def copy(self, data=None, all_inames=None): if all_inames is None: @@ -227,7 +230,7 @@ def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. """ - for field_name in sorted(self._pclass_fields): + for field_name in sorted(self.__dataclass_fields__): key_builder.rec(key_hash, getattr(self, field_name)) @@ -238,7 +241,8 @@ def make_iname_dict(tagged_inames, all_inames): return InameDict(data=pmap(tagged_inames), all_inames=all_inames) -class LoopKernelDomains(PClass): +@dataclass(frozen=True) +class LoopKernelDomains: """ Records the domain information seen in a :class:`loopy.kernel.LoopKernel`. @@ -262,9 +266,9 @@ class LoopKernelDomains(PClass): .. automethod:: swap .. 
automethod:: delete """ - _domains = field() - param_to_idoms = field() - home_domain_map = field() + _domains: PVector + param_to_idoms: PMap + home_domain_map: PMap def __getitem__(self, key): return self._domains[key] @@ -478,7 +482,7 @@ def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. """ - for field_name in sorted(self._pclass_fields): + for field_name in sorted(self.__dataclass_fields__): key_builder.rec(key_hash, getattr(self, field_name)) From 1e921f69f5ccef7ffd22955ef4739280cc49de8a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 16:57:15 -0500 Subject: [PATCH 016/109] [docs] expose LoopKernelDomains, InameDict * adds pyrsistent to list of intersphinx mappings --- doc/ref_kernel.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 922315685..82e015f83 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -704,6 +704,10 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to .. autoclass:: LoopKernel +.. autoclass:: loopy.kernel.LoopKernelDomains + +.. autoclass:: loopy.kernel.InameDict + .. autoclass:: KernelState :members: :undoc-members: From 8753dfc618685e677e6b8179b92cbbecbd4d069b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 17:16:26 -0500 Subject: [PATCH 017/109] [doc] minor fixes to avoid warning which docs building --- loopy/kernel/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e619cc8bd..8098f0578 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -161,11 +161,11 @@ class InameDict: :attr data: An instance of :class:`pyrsistent.PMap` from iname names to instances of :class:`~loopy.kernel.data.Iname`. 
:attr all_inames: A :class:`frozenset` of names of all inames in a - :class:`LoopKernel` + :class:`~loopy.LoopKernel` .. note:: - * Inames that are not a part of :attr:`InameDict.data`, but are seen in + * Inames that are not a part of *data*, but are seen in :attr`InameDict.all_inames` are realized as instances of :class:`~loopy.kernel.data.Iname` with no tags. @@ -244,7 +244,7 @@ def make_iname_dict(tagged_inames, all_inames): @dataclass(frozen=True) class LoopKernelDomains: """ - Records the domain information seen in a :class:`loopy.kernel.LoopKernel`. + Records the domain information seen in a :class:`loopy.LoopKernel`. .. attribute:: _domains @@ -255,16 +255,17 @@ class LoopKernelDomains: A :class:`pyrsistent.PMap` of dim names to :class:`frozenset` of indices of domains in which the dims appear as - :class:`islpy.dim_type.param`-type dims. + :attr:`islpy.dim_type.param`-type dims. .. attribute:: home_domain_map A :class:`pyrsistent.PMap` of dim names to the index of the domain in - which the dims appear as :class:`islpy.dim_type.set`-type dim. + which the dims appear as :attr:`islpy.dim_type.set`-type dim. .. automethod:: append .. automethod:: swap .. automethod:: delete + .. automethod:: insert """ _domains: PVector param_to_idoms: PMap @@ -419,7 +420,7 @@ def delete(self, idom): def insert(self, idom, domain): """ Returns a copy of *self* with *domain* inserted at the *idom*-index in - :attr:`LoopKernel._domains`. + :attr:`LoopKernelDomains._domains`. 
""" raise NotImplementedError From 9f6da99ed6e73003f4e1c8e42be79391e962025a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 17:45:34 -0500 Subject: [PATCH 018/109] prefer fields instead of __dataclass_fields__ --- loopy/kernel/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 8098f0578..17dc109e4 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -41,7 +41,7 @@ from loopy.kernel.data import filter_iname_tags_by_type, Iname from pyrsistent import pmap, pvector, PVector, PMap from typing import FrozenSet -from dataclasses import dataclass +from dataclasses import dataclass, fields from warnings import warn @@ -230,8 +230,8 @@ def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. """ - for field_name in sorted(self.__dataclass_fields__): - key_builder.rec(key_hash, getattr(self, field_name)) + for field in fields(self): + key_builder.rec(key_hash, getattr(self, field.name)) def make_iname_dict(tagged_inames, all_inames): @@ -483,8 +483,8 @@ def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. 
""" - for field_name in sorted(self.__dataclass_fields__): - key_builder.rec(key_hash, getattr(self, field_name)) + for field in fields(self): + key_builder.rec(key_hash, getattr(self, field.name)) def make_loop_kernel_domains(domains): From 932e6874494260171813f6a53d7c19f048e0928a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 19:22:19 -0500 Subject: [PATCH 019/109] computing outer params is easier than you think --- loopy/kernel/tools.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index b897dbcb1..1c6b59b0b 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2058,14 +2058,8 @@ def get_outer_params(domains): :arg domains: An instance of :class:`list` of :class:`isl.BasicSet`. """ - all_inames = set() - all_params = set() - for dom in domains: - all_inames.update(dom.get_var_names(dim_type.set)) - all_params.update(dom.get_var_names(dim_type.param)) - from loopy.tools import intern_frozenset_of_ids - return intern_frozenset_of_ids(all_params-all_inames) + return intern_frozenset_of_ids(domains.param_dims - domains.set_dims) # }}} From c6684a141811dbeffe8417f3f75f7977f3f8bb3f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Apr 2021 19:27:39 -0500 Subject: [PATCH 020/109] removes unnecessary conditional * LoopKernelDomains.swap(..) 
exits early if the obtained domain is identical --- loopy/transform/parameter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 38088deda..1f7c713fd 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -24,7 +24,6 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl -from islpy import dim_type from loopy.translation_unit import for_each_kernel from loopy.kernel import LoopKernel @@ -93,8 +92,7 @@ def process_set(s): new_domains = kernel.domains for idom, dom in enumerate(kernel.domains): - if name in dom.get_var_names(dim_type.param): - new_domains = new_domains.swap(idom, process_set(dom)) + new_domains = new_domains.swap(idom, process_set(dom)) from pymbolic.mapper.substitutor import make_subst_func subst_func = make_subst_func({name: value}) From 6102ff736f3e2b9bad0aaefcec551ab752925e73 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 18:50:59 -0500 Subject: [PATCH 021/109] removes CannotBranching restriction - there was really no good reason why this was present in the first place - allowing this is a GOOD idea as it allows to have more flexibility in defining fine-grained domains --- loopy/diagnostic.py | 4 - loopy/kernel/__init__.py | 118 ++++++++-------------------- loopy/transform/array_buffer_map.py | 6 +- 3 files changed, 35 insertions(+), 93 deletions(-) diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index c81d38c34..74dab84a8 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -78,10 +78,6 @@ class LoopyIndexError(LoopyError): pass -class CannotBranchDomainTree(LoopyError): - pass - - class TypeInferenceFailure(LoopyError): pass diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 17dc109e4..cb58f1211 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,7 +35,7 @@ from pytools import UniqueNameGenerator, 
generate_unique_names, natsorted -from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname @@ -867,57 +867,43 @@ def parents_per_domain(self): # }}} - # The stack of iname sets records which inames are active - # as we step through the linear list of domains. It also - # determines the granularity of inames to be popped/decactivated - # if we ascend a level. - - iname_set_stack = [] result = [] + hdm = self._get_home_domain_map() - from loopy.kernel.tools import is_domain_dependent_on_inames - - for dom_idx, dom in enumerate(self.domains): - inames = set(dom.get_var_names(dim_type.set)) - - # This next domain may be nested inside the previous domain. - # Or it may not, in which case we need to figure out how many - # levels of parents we need to discard in order to find the - # true parent. 
- - discard_level_count = 0 - while discard_level_count < len(iname_set_stack): - last_inames = ( - iname_set_stack[-1-discard_level_count]) - if discard_level_count + 1 < len(iname_set_stack): - last_inames = ( - last_inames - iname_set_stack[-2-discard_level_count]) - - if is_domain_dependent_on_inames(self, dom_idx, last_inames): - break - - discard_level_count += 1 - - if discard_level_count: - iname_set_stack = iname_set_stack[:-discard_level_count] + for idom, dom in enumerate(self.domains): + idom_param_vars = (frozenset(dom.get_var_names(dim_type.param)) + - self.get_unwritten_value_args()) + if len(idom_param_vars) == 0: + # idom doesn't depend on any inames/variables + # => doesn't impose any nesting criteria + result.append(None) + continue - if result: - parent = len(result)-1 - else: - parent = None + # outer_inames: inames that must be nested outside the 'set dims' + # of 'dom' + outer_inames = set() + for var in idom_param_vars: + if var in self.all_inames(): + outer_inames.add(var) + else: + writer_insns = self.writer_map()[var] + if len(writer_insns) > 1: + raise RuntimeError(f"loop bound '{var}' " + "may only be written to once") - for _i in range(discard_level_count): - assert parent is not None - parent = result[parent] + writer_insn, = writer_insns + outer_inames.update(self.insn_inames(writer_insn)) - # found this domain's parent - result.append(parent) + parent_idoms = {hdm[iname] for iname in outer_inames} - if iname_set_stack: - parent_inames = iname_set_stack[-1] + if len(parent_idoms) == 0: + result.append(None) + elif len(parent_idoms) > 1: + raise NotImplementedError("Only one parent per domain supported" + " for now.") else: - parent_inames = set() - iname_set_stack.append(parent_inames | inames) + parent_idom, = parent_idoms + result.append(parent_idom) return result @@ -945,6 +931,7 @@ def all_parents_per_domain(self): # keep walking up tree to find *all* parents dom_result = [] while parent is not None: + import pudb; pu.db 
dom_result.insert(0, parent) parent = ppd[parent] @@ -1007,49 +994,12 @@ def get_inames_domain(self, inames): @memoize_method def get_leaf_domain_indices(self, inames): - """Find the leaves of the domain tree needed to cover all inames. + """Find the leaves of the domain tree needed to cover *inames*. :arg inames: a non-mutable iterable """ - hdm = self._get_home_domain_map() - ppd = self.all_parents_per_domain() - - domain_indices = set() - - # map root -> leaf - root_to_leaf = {} - - for iname in inames: - home_domain_index = hdm[iname] - if home_domain_index in domain_indices: - # nothin' new - continue - - domain_path_to_root = [home_domain_index] + ppd[home_domain_index] - current_root = domain_path_to_root[-1] - previous_leaf = root_to_leaf.get(current_root) - - if previous_leaf is not None: - # Check that we don't branch the domain tree. - # - # Branching the domain tree is dangerous/ill-formed because - # it can introduce artificial restrictions on variables - # further up the tree. - - prev_path_to_root = set([previous_leaf] + ppd[previous_leaf]) - if not prev_path_to_root <= set(domain_path_to_root): - raise CannotBranchDomainTree("iname set '%s' requires " - "branch in domain tree (when adding '%s')" - % (", ".join(inames), iname)) - else: - # We're adding a new root. That's fine. 
- pass - - root_to_leaf[current_root] = home_domain_index - domain_indices.update(domain_path_to_root) - - return list(root_to_leaf.values()) + return frozenset(hdm[iname] for iname in inames) @memoize_method def _get_inames_domain_backend(self, inames): diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index 3c4092b74..7e94cca15 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -355,11 +355,7 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs): arg_inames.update(get_dependencies(arg)) arg_inames = frozenset(arg_inames & self.kernel.all_inames()) - from loopy.kernel import CannotBranchDomainTree - try: - usage_domain = self.kernel.get_inames_domain(arg_inames) - except CannotBranchDomainTree: - return False + usage_domain = self.kernel.get_inames_domain(arg_inames) for i in range(usage_domain.dim(dim_type.set)): iname = usage_domain.get_dim_name(dim_type.set, i) From ae3b2b07add9e3eaed8fe08e8162b9deb4a674d1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 19:30:55 -0500 Subject: [PATCH 022/109] fixup! ImmutableRecordWithoutPickingWithTargetedCopies.copy: return a copy in all cases --- loopy/kernel/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index cb58f1211..98215d57f 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -931,7 +931,6 @@ def all_parents_per_domain(self): # keep walking up tree to find *all* parents dom_result = [] while parent is not None: - import pudb; pu.db dom_result.insert(0, parent) parent = ppd[parent] From 7c491d3de56a1a2551686f41c8762196a38ec9cc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 09:14:52 -0500 Subject: [PATCH 023/109] precompute: generate fine grained domains - coarse grained domains are typically bad. 
(see gh-379) - The gist of it is that disjoint loop nests induce predicates at the entry of each other, which generally isn't the behavior expected from the domain specification - things get even crazier if a domain contains multiple inames targeting the same hw axes --- loopy/transform/precompute.py | 43 ++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 201abd470..c169214b6 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -828,7 +828,48 @@ def add_assumptions(d): # }}} - new_kernel_domains = domch.get_domains_with(mod_domain) + # {{{ splitting mod_domain + + # Splitting 'mod_domain' into 2 parts + # 1. domain contribution from the preexisting_precompute_inames, and, + # 2. domain contribution arising from new precompute inames. + # + # We do so in order to have fine grained domains. Typically precompute + # inames are tagged with hardware inames. As concluded in + # https://github.com/inducer/loopy/issues/379, it is generally a better + # idea to decouple domains containing the same hw axes inames.
+ + existing_inames_domain = mod_domain + + for name, (dt, pos) in mod_domain.get_var_dict().items(): + if name in non1_storage_axis_names: + existing_inames_domain = existing_inames_domain.eliminate(dt, pos, 1) + + new_inames_domain = mod_domain.gist(existing_inames_domain) + + for name in non1_storage_axis_names: + dt, pos = existing_inames_domain.get_var_dict()[name] + existing_inames_domain = existing_inames_domain.project_out(dt, + pos, + 1) + + for dim_name in new_inames_domain.get_var_names(isl.dim_type.set)[:]: + if dim_name in sweep_inames: + dt, pos = new_inames_domain.get_var_dict()[dim_name] + new_inames_domain = new_inames_domain.project_out(dt, pos, 1) + continue + if dim_name not in non1_storage_axis_names: + dt, pos = new_inames_domain.get_var_dict()[dim_name] + new_inames_domain = new_inames_domain.move_dims(isl.dim_type.param, + new_inames_domain + .dim(isl.dim_type + .param), + dt, pos, 1) + new_kernel_domains = domch.get_domains_with(existing_inames_domain) + new_kernel_domains = new_kernel_domains + [new_inames_domain + .drop_unused_params()] + + # }}} else: # leave kernel domains unchanged From 2716694b88afe0724d0a1a13ca303b8ab3d7c489 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 23:00:06 -0500 Subject: [PATCH 024/109] combine domains enclosing the sweep inames, pre_existing precompute-inames --- loopy/transform/precompute.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index c169214b6..6b3722628 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -698,6 +698,24 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, change_inames = expanding_inames | preexisting_precompute_inames + # {{{ combine domains containing *change_inames* + + # Why combine? + # Downstream logic in DomainChanger requires the domain to be + # manipulated to be a single domain. 
+ + domain_indices = tuple(sorted({kernel.get_home_domain_index(i) + for i in change_inames}, reverse=True)) + + combined_domain = kernel.combine_domains(domain_indices) + domains_after_combining = [dom + for idom, dom in enumerate(kernel.domains) + if idom not in domain_indices] + domains_after_combining.insert(domain_indices[-1], combined_domain) + kernel = kernel.copy(domains=domains_after_combining) + + # }}} + from loopy.kernel.tools import DomainChanger domch = DomainChanger(kernel, change_inames) From 8b89155ac62a270930db9ee11233fd3d2a3377fd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 16 May 2021 22:40:53 -0500 Subject: [PATCH 025/109] tree representation of schedule --- loopy/codegen/__init__.py | 10 ++ loopy/schedule/tree.py | 241 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 loopy/schedule/tree.py diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 98a5494b7..e5dddb752 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -478,6 +478,16 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") + # {{{ make_schedule_a_tree + + from loopy.schedule.tree import make_schedule_tree + sched_tree = make_schedule_tree(kernel) + kernel = kernel.copy(schedule=sched_tree) + print(kernel) + 1/0 + + # }}} + codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") # {{{ pre-codegen-process of non-entrypoint kernel diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py new file mode 100644 index 000000000..6cb9cf73e --- /dev/null +++ b/loopy/schedule/tree.py @@ -0,0 +1,241 @@ +import loopy.schedule as schedule +from loopy.diagnostic import LoopyError +from typing import List, Union, Any, Optional +from dataclasses import dataclass, field + + +# {{{ LoopKernel.schedule a tree + +class ScheduleNode: + """ + Abstract class for a schedule node in a 
class:`~loopy.LoopKernel`. + """ + pass + + +@dataclass +class RunInstruction(ScheduleNode): + id: str + + mapper_method: str = field(default="map_run_instruction", repr=False, init=False) + + +@dataclass +class Barrier(ScheduleNode): + """ + .. attribute:: comment + + A plain-text comment explaining why the barrier was inserted. + + .. attribute:: synchronization_kind + + ``"local"`` or ``"global"`` + + .. attribute:: mem_kind + + ``"local"`` or ``"global"`` + + .. attribute:: originating_insn_id + """ + comment: str + synchronization_kind: str + originating_insn_id: Optional[str] + + mapper_method: str = field(default="map_barrier", repr=False, init=False) + + +@dataclass +class InstructionBlock(ScheduleNode): + """ + List of instruction ids that are to be executed in sequence. An instruction + block cannot contain other blocks or loops. + + .. attribute:: children + + A list of instruction ids contained in the block. + """ + children: List[Union[Barrier, RunInstruction]] + + mapper_method: str = field(default="map_instruction_block", repr=False, + init=False) + + +@dataclass +class Loop(ScheduleNode): + """ + A loop with the induction variable *iname*. + """ + iname: str + children: List[Union[InstructionBlock, "Loop"]] + + mapper_method: str = field(default="map_loop", repr=False, init=False) + + +@dataclass +class Function(ScheduleNode): + """ + A function definition. + + .. attribute:: name + + An instance of :class:`str` + + .. attribute:: extra_args + + .. attribute:: extra_inames + """ + name: str + extra_args: List[Any] + extra_inames: List[str] + children: List[Union[InstructionBlock, Loop]] + + +@dataclass +class Schedule(ScheduleNode): + """ + Top-level schedule description. + """ + children: List[Union[Loop, InstructionBlock, Function]] + + +@dataclass +class ScheduleTreeBuilder: + """ + A builder for :class:`Schedule`. 
+ """ + + schedule: Schedule + _build_stack: List[ScheduleNode] + + @staticmethod + def new(): + sched = Schedule([]) + return ScheduleTreeBuilder(sched, [sched]) + + @property + def current_node(self): + return self._build_stack[-1] + + def make_current_node(self, node): + self._build_stack.append(node) + + def make_and_enter_function(self, name, extra_args, extra_inames): + assert isinstance(self.current_node, Schedule) + new_function = Function(name, extra_args, extra_inames, []) + self.current_node.children.append(new_function) + self.make_current_node(new_function) + + def make_and_enter_instruction_block(self): + assert isinstance(self.current_node, (Function, Loop, Schedule)) + new_block = InstructionBlock([]) + self.current_node.children.append(new_block) + self.make_current_node(new_block) + + def make_and_enter_loop(self, iname): + assert isinstance(self.current_node, (Schedule, Function, Loop)) + new_loop = Loop(iname, []) + self.current_node.children.append(new_loop) + self.make_current_node(new_loop) + + def add_run_instruction(self, insn_id): + if not isinstance(self.current_node, InstructionBlock): + self.make_and_enter_instruction_block() + + self.current_node.children.append(RunInstruction(insn_id)) + + def add_barrier(self, comment, kind, insn_id): + if not isinstance(self.current_node, InstructionBlock): + self.make_instruction_block() + + self.current_node.children.append(Barrier(comment, kind, insn_id)) + + def exit_function(self): + if isinstance(self.current_node, InstructionBlock): + self._build_stack.pop() + assert isinstance(self.current_node, Function) + return self._build_stack.pop() + + def exit_loop(self): + if isinstance(self.current_node, InstructionBlock): + self._build_stack.pop() + assert isinstance(self.current_node, Loop) + return self._build_stack.pop() + + def exit(self): + if isinstance(self.current_node, InstructionBlock): + self._build_stack.pop() + assert isinstance(self.current_node, Schedule) + return 
self._build_stack.pop() + + +def make_schedule_tree(kernel): + # bob: the schedule builder + bob = ScheduleTreeBuilder.new() + + for sched_item in kernel.schedule: + if isinstance(sched_item, schedule.CallKernel): + bob.make_and_enter_function(sched_item.kernel_name, + sched_item.extra_args, + sched_item.extra_inames) + elif isinstance(sched_item, schedule.ReturnFromKernel): + fn = bob.exit_function() + assert fn.name == sched_item.kernel_name + elif isinstance(sched_item, schedule.EnterLoop): + bob.make_and_enter_loop(sched_item.iname) + elif isinstance(sched_item, schedule.LeaveLoop): + loop = bob.exit_loop() + assert loop.iname == sched_item.iname + elif isinstance(sched_item, schedule.RunInstruction): + bob.add_run_instruction(sched_item.insn_id) + elif isinstance(sched_item, schedule.Barrier): + bob.add_barrier(sched_item.comment, + sched_item.synchronization_kind, + sched_item.originating_insn_id) + else: + raise NotImplementedError(type(sched_item)) + + return bob.exit() + +# }}} + + +class Mapper: + def __call__(self, expr, *args, **kwargs): + try: + method = getattr(self, expr.mapper_method) + except AttributeError: + raise LoopyError(f"{type(self)} cannot handle expressions of" + f" type {type(expr)}.") + + return method(expr, *args, **kwargs) + + rec = __call__ + + +class IdentityMapper(Mapper): + def map_schedule(self, expr, *args, **kwargs): + return Schedule([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_instruction_block(self, expr, *args, **kwargs): + return InstructionBlock([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_function(self, expr, *args, **kwargs): + return Function(expr.name, + expr.extra_args, + expr.extra_inames, + [self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_loop(self, expr, *args, **kwargs): + return Loop(expr.iname, + [self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_barrier(self, expr, *args, **kwargs): + return 
Barrier(expr.comment, expr.synchronization_kind, + expr.originating_insn_id) + + def map_run_instruction(self, expr, *args, **kwargs): + return RunInstruction(expr.insn_id) From 75da6a43fe78ad723943b4549a3f7865a31f387d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 May 2021 10:11:24 -0500 Subject: [PATCH 026/109] adds CombineMapper, StringifyMapper --- loopy/codegen/__init__.py | 2 +- loopy/schedule/__init__.py | 8 +++++ loopy/schedule/tree.py | 66 +++++++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e5dddb752..8395d9f8b 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -478,7 +478,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ make_schedule_a_tree + # {{{ make_schedule_tree from loopy.schedule.tree import make_schedule_tree sched_tree = make_schedule_tree(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 92ceda2e6..909c44f29 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -439,6 +439,14 @@ def format_insn(kernel, insn_id): def dump_schedule(kernel, schedule): + + from loopy.schedule.tree import Schedule + if isinstance(schedule, Schedule): + from loopy.schedule.tree import StringifyMapper + return StringifyMapper(kernel)(schedule) + + assert isinstance(schedule, list) + lines = [] indent = "" diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 6cb9cf73e..e6631bbd1 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -15,7 +15,7 @@ class ScheduleNode: @dataclass class RunInstruction(ScheduleNode): - id: str + insn_id: str mapper_method: str = field(default="map_run_instruction", repr=False, init=False) @@ -89,6 +89,8 @@ class Function(ScheduleNode): extra_inames: List[str] children: List[Union[InstructionBlock,
Loop]] + mapper_method: str = field(default="map_function", repr=False, init=False) + @dataclass class Schedule(ScheduleNode): @@ -97,6 +99,8 @@ class Schedule(ScheduleNode): """ children: List[Union[Loop, InstructionBlock, Function]] + mapper_method: str = field(default="map_schedule", repr=False, init=False) + @dataclass class ScheduleTreeBuilder: @@ -239,3 +243,63 @@ def map_barrier(self, expr, *args, **kwargs): def map_run_instruction(self, expr, *args, **kwargs): return RunInstruction(expr.insn_id) + + +class CombineMapper(Mapper): + def combine(self, values): + raise NotImplementedError + + def map_schedule(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_instruction_block(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_function(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_loop(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_barrier(self, expr, *args, **kwargs): + raise NotImplementedError + + def map_run_instruction(self, expr, *args, **kwargs): + raise NotImplementedError + + +class StringifyMapper(CombineMapper): + SHIFTWIDTH = 2 + + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return "\n".join(values) + + def _indent(self, level): + return level*self.SHIFTWIDTH*" " + + def map_function(self, expr, level=0): + return self.combine([(f"{self._indent(level)}CALL KERNEL {expr.name}(" + f"extra_args={expr.extra_args}, " + f"extra_inames={expr.extra_inames})"), + super().map_function(expr, level+1), + f"{self._indent(level)}RETURN FROM KERNEL {expr.name}"]) + + def map_run_instruction(self, expr, level=0): + from loopy.schedule import format_insn + return (f"{self._indent(level)}" + 
f"{format_insn(self.kernel, expr.insn_id)}") + + def map_barrier(self, expr, level=0): + return (f"{self._indent(level)}... {expr.kind[0]}barrier") + + def map_loop(self, expr, level=0): + return self.combine([f"{self._indent(level)}for {expr.iname}", + super().map_function(expr, level+1), + f"{self._indent(level)}end {expr.iname}"]) From b791786c0d9e68ccc2e0e0f38f758e1cf837ae58 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 May 2021 18:47:45 -0500 Subject: [PATCH 027/109] adds v1 of predicate insertion mapper --- loopy/schedule/tree.py | 184 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 183 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index e6631bbd1..852fd1ebd 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -1,7 +1,11 @@ +import pymbolic.primitives as prim import loopy.schedule as schedule from loopy.diagnostic import LoopyError -from typing import List, Union, Any, Optional +from typing import List, Union, Any, Optional, Tuple from dataclasses import dataclass, field +import islpy as isl +from islpy import dim_type +from functools import reduce # {{{ LoopKernel.schedule a tree @@ -92,6 +96,19 @@ class Function(ScheduleNode): mapper_method: str = field(default="map_function", repr=False, init=False) +class For(Loop): + iname: str + upper_bound: Union[int, prim.Expression] + lower_bound: Union[int, prim.Expression] + step: int + + mapper_method: str = field(default="map_resolved_for", repr=False, init=False) + + +class If(ScheduleNode): + condition: Union[int, bool, prim.Expression] + + @dataclass class Schedule(ScheduleNode): """ @@ -303,3 +320,168 @@ def map_loop(self, expr, level=0): return self.combine([f"{self._indent(level)}for {expr.iname}", super().map_function(expr, level+1), f"{self._indent(level)}end {expr.iname}"]) + + +def _align_and_intersect(d1, d2): + d1, d2 = isl.align_two(d1, d2) + return d1 & d2 + + +def _wrap_in_if(cond, nodes): + if cond.is_universe(): 
+ return nodes + else: + return If(cond, nodes) + + +@dataclass(frozen=True) +class PredicateInsertionContext: + implemented_domain: isl.BasicSet + gsize: Optional[Tuple[isl.PwAff, ...]] = None + lsize: Optional[Tuple[isl.PwAff, ...]] = None + + def copy(self, *, implemented_domain=None, gsize=None, lsize=None): + if implemented_domain is None: + implemented_domain = self.implemented_domain + + if gsize is None: + gsize = self.gsize + + if lsize is None: + lsize = self.lsize + + return PredicateInsertionContext(implemented_domain, gsize, lsize) + + +class PredicateInsertion(IdentityMapper): + def __init__(self, kernel): + self.kernel = kernel + + def map_schedule(self, expr): + universe = isl.BasicSet(isl.Space.create_from_names(self.kernel.isl_context, + [])) + + return super().map_schedule(expr, PredicateInsertionContext(universe)) + + def map_function(self, expr, context): + # get the implemented domain for the insn ids in this kernel + # Shouldn't be difficult to write a combine mapper for it. + gsize, lsize = self.kernel.get_grid_sizes_for_insns_ids( + gather_insn_ids_for_kernel(expr.name)) + return super().map_function(expr, context.copy(gsize=gsize, lsize=lsize)) + + def map_run_instruction(self, expr, context): + return expr + + def map_barrier(self, expr, context): + return expr + + def map_instruction_block(self, expr, context): + if all(isinstance(child, RunInstruction) for child in expr.children): + # need to add a predicate for the hardware axes usage. 
+ assert len({self.kernel.id_to_insn[child.insn_id].within_inames + for child in expr.children}) == 1 + inames = self.kernel.id_to_insn[expr.children[0].insn_id].within_inames + hw_inames = inames - set(context.implemented_domain.get_var_dict()) + if hw_inames: + raise NotImplementedError + + return InstructionBlock([self.rec(child, context) + for child in expr.children]) + else: + assert all(isinstance(child, Barrier) for child in expr.childre) + return InstructionBlock([self.rec(child, context) + for child in expr.children]) + + def map_loop(self, expr, context): + from loopy.symbolic import aff_to_expr, set_to_cond_expr + + implemented_domain = context.implemented_domain + assert implemented_domain.dim(dim_type.set) == 0 + + domain = self.kernel.get_iname_domain(expr.iname) + + # {{{ make already implemented loops as parallel; project out inner loops + + for set_dim in domain.get_var_names(dim_type.set): + dt, pos = domain.get_var_dict()[set_dim] + assert dt == dim_type.set + + if set_dim in implemented_domain.get_var_dict(): + # make outer loop's iname a param + domain = domain.move_dims(dim_type.param, + domain.dim(dim_type.param), + dt, pos, 1) + elif set_dim != expr.name: + domain = domain.project_out(dt, pos, 1) + else: + pass + + # }}} + + assert domain.dim(dim_type.set) == 1 + + domain, implemented_domain = isl.align_two(domain, + implemented_domain) + domain = domain.gist(implemented_domain) + + downstream_domain = _align_and_intersect(domain + .move_dims(dim_type.param, + domain.dim(dim_type + .param), + dim_type.set, + 0, 1), + implemented_domain) + + outer_condition = isl.align_space(domain.project_out(dim_type.set, 0), + downstream_domain).gist(downstream_domain) + + min_affs = [aff for set, aff in domain.dim_min(0)] + max_affs = [aff for set, aff in domain.dim_max(0)] + lower_bound = reduce(isl.Aff.min, min_affs, min_affs[0]) + upper_bound = reduce(isl.Aff.max, max_affs, max_affs[0]) + + inner_condition = domain.affine_hull() + step = 1 # TODO: from 
inner_condition try to guess the step + + children = [self.rec(child, (context + .copy(implemented_domain=downstream_domain))) + for child in expr.children] + + return _wrap_in_if(outer_condition, + For(aff_to_expr(lower_bound), + aff_to_expr(upper_bound), + step, + _wrap_in_if(set_to_cond_expr(inner_condition), + children))) + + +class InstructionGatherer(CombineMapper): + """ + Mapper to gather all insn ids a :class:`Function`. + """ + def __init__(self, function_name): + self.function_name = function_name + + def combine(self, values): + assert all(isinstance(value, list) for value in values) + return sum(values, []) + + def map_function(self, expr): + if expr.name == self.function_name: + return super().map_function(expr) + else: + return [] + + def map_run_instruction(self, expr): + return [expr.insn_id] + + def map_barrier(self, expr): + if expr.originating_insn_id is not None: + return [expr.originating_insn_id] + else: + return [] + + +def gather_insn_ids_for_kernel(schedule, kernel_name): + return InstructionGatherer(kernel_name)(schedule) From d64f80802fdae69e5d924fd6e14fb0631a50ab63 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 May 2021 21:29:02 -0500 Subject: [PATCH 028/109] minor fixes; stringifies `If` correctly, applies predicate correctly --- loopy/codegen/__init__.py | 7 +-- loopy/schedule/tree.py | 110 +++++++++++++++++++++++--------------- 2 files changed, 70 insertions(+), 47 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 8395d9f8b..634d9e306 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -480,9 +480,10 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, # {{{ make_schedule_tree - from loopy.schedule.tree import make_schedule_tree - sched_tree = make_schedule_tree(kernel) - kernel = kernel.copy(schedule=sched_tree) + from loopy.schedule.tree import (make_schedule_tree, + insert_predicates_into_schedule) + kernel = make_schedule_tree(kernel) + 
kernel = insert_predicates_into_schedule(kernel) print(kernel) 1/0 diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 852fd1ebd..e20449cef 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -1,11 +1,12 @@ import pymbolic.primitives as prim import loopy.schedule as schedule -from loopy.diagnostic import LoopyError +import islpy as isl from typing import List, Union, Any, Optional, Tuple from dataclasses import dataclass, field -import islpy as isl -from islpy import dim_type from functools import reduce +from islpy import dim_type +from loopy.diagnostic import LoopyError +from loopy.kernel import KernelState # {{{ LoopKernel.schedule a tree @@ -96,17 +97,23 @@ class Function(ScheduleNode): mapper_method: str = field(default="map_function", repr=False, init=False) +@dataclass class For(Loop): iname: str - upper_bound: Union[int, prim.Expression] lower_bound: Union[int, prim.Expression] + upper_bound: Union[int, prim.Expression] step: int + children: List[Union[InstructionBlock, Loop, "If"]] - mapper_method: str = field(default="map_resolved_for", repr=False, init=False) + mapper_method: str = field(default="map_for", repr=False, init=False) +@dataclass class If(ScheduleNode): condition: Union[int, bool, prim.Expression] + children: List[Union[Loop, InstructionBlock, Function]] + + mapper_method: str = field(default="map_if", repr=False, init=False) @dataclass @@ -215,7 +222,7 @@ def make_schedule_tree(kernel): else: raise NotImplementedError(type(sched_item)) - return bob.exit() + return kernel.copy(schedule=bob.exit()) # }}} @@ -282,6 +289,14 @@ def map_loop(self, expr, *args, **kwargs): return self.combine([self.rec(child, *args, **kwargs) for child in expr.children]) + def map_for(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + + def map_if(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + def 
map_barrier(self, expr, *args, **kwargs): raise NotImplementedError @@ -318,9 +333,21 @@ def map_barrier(self, expr, level=0): def map_loop(self, expr, level=0): return self.combine([f"{self._indent(level)}for {expr.iname}", - super().map_function(expr, level+1), + super().map_loop(expr, level+1), f"{self._indent(level)}end {expr.iname}"]) + def map_for(self, expr, level=0): + return self.combine([f"{self._indent(level)}For({expr.iname}, " + f"{expr.lower_bound}, {expr.upper_bound}, " + f"{expr.step})", + super().map_for(expr, level+1), + f"{self._indent(level)}end {expr.iname}"]) + + def map_if(self, expr, level=0): + return self.combine([f"{self._indent(level)}If({expr.condition})", + super().map_if(expr, level+1), + f"{self._indent(level)}Endif"]) + def _align_and_intersect(d1, d2): d1, d2 = isl.align_two(d1, d2) @@ -328,10 +355,11 @@ def _align_and_intersect(d1, d2): def _wrap_in_if(cond, nodes): + from loopy.symbolic import set_to_cond_expr if cond.is_universe(): return nodes else: - return If(cond, nodes) + return [If(set_to_cond_expr(cond), nodes)] @dataclass(frozen=True) @@ -353,21 +381,22 @@ def copy(self, *, implemented_domain=None, gsize=None, lsize=None): return PredicateInsertionContext(implemented_domain, gsize, lsize) -class PredicateInsertion(IdentityMapper): +class PredicateInsertionMapper(IdentityMapper): def __init__(self, kernel): self.kernel = kernel def map_schedule(self, expr): - universe = isl.BasicSet(isl.Space.create_from_names(self.kernel.isl_context, - [])) + universe = isl.BasicSet.universe(isl.Space.create_from_names(self.kernel + .isl_context, + [])) return super().map_schedule(expr, PredicateInsertionContext(universe)) def map_function(self, expr, context): # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. 
- gsize, lsize = self.kernel.get_grid_sizes_for_insns_ids( - gather_insn_ids_for_kernel(expr.name)) + gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( + InstructionGatherer()(expr)) return super().map_function(expr, context.copy(gsize=gsize, lsize=lsize)) def map_run_instruction(self, expr, context): @@ -394,12 +423,12 @@ def map_instruction_block(self, expr, context): for child in expr.children]) def map_loop(self, expr, context): - from loopy.symbolic import aff_to_expr, set_to_cond_expr + from loopy.symbolic import pw_aff_to_expr implemented_domain = context.implemented_domain assert implemented_domain.dim(dim_type.set) == 0 - domain = self.kernel.get_iname_domain(expr.iname) + domain = self.kernel.get_inames_domain(expr.iname) # {{{ make already implemented loops as parallel; project out inner loops @@ -412,7 +441,7 @@ def map_loop(self, expr, context): domain = domain.move_dims(dim_type.param, domain.dim(dim_type.param), dt, pos, 1) - elif set_dim != expr.name: + elif set_dim != expr.iname: domain = domain.project_out(dt, pos, 1) else: pass @@ -433,13 +462,11 @@ def map_loop(self, expr, context): 0, 1), implemented_domain) - outer_condition = isl.align_space(domain.project_out(dim_type.set, 0), - downstream_domain).gist(downstream_domain) + outer_condition = isl.align_spaces(domain.project_out(dim_type.set, 0, 1), + downstream_domain).gist(downstream_domain) - min_affs = [aff for set, aff in domain.dim_min(0)] - max_affs = [aff for set, aff in domain.dim_max(0)] - lower_bound = reduce(isl.Aff.min, min_affs, min_affs[0]) - upper_bound = reduce(isl.Aff.max, max_affs, max_affs[0]) + lower_bound = domain.dim_min(0) + upper_bound = domain.dim_max(0) inner_condition = domain.affine_hull() step = 1 # TODO: from inner_condition try to guess the step @@ -449,39 +476,34 @@ def map_loop(self, expr, context): for child in expr.children] return _wrap_in_if(outer_condition, - For(aff_to_expr(lower_bound), - aff_to_expr(upper_bound), - step, - 
_wrap_in_if(set_to_cond_expr(inner_condition), - children))) + For(iname=expr.iname, + lower_bound=pw_aff_to_expr(lower_bound), + upper_bound=pw_aff_to_expr(upper_bound), + step=step, + children=_wrap_in_if(inner_condition, + children))) class InstructionGatherer(CombineMapper): """ - Mapper to gather all insn ids a :class:`Function`. + Mapper to gather all insn ids. """ - def __init__(self, function_name): - self.function_name = function_name - def combine(self, values): - assert all(isinstance(value, list) for value in values) - return sum(values, []) - - def map_function(self, expr): - if expr.name == self.function_name: - return super().map_function(expr) - else: - return [] + assert all(isinstance(value, frozenset) for value in values) + return reduce(frozenset.union, values, frozenset()) def map_run_instruction(self, expr): - return [expr.insn_id] + return frozenset([expr.insn_id]) def map_barrier(self, expr): if expr.originating_insn_id is not None: - return [expr.originating_insn_id] + return frozenset([expr.originating_insn_id]) else: - return [] + return frozenset() -def gather_insn_ids_for_kernel(schedule, kernel_name): - return InstructionGatherer(kernel_name)(schedule) +def insert_predicates_into_schedule(kernel): + assert kernel.state >= KernelState.LINEARIZED + assert isinstance(kernel.schedule, Schedule) + new_schedule = PredicateInsertionMapper(kernel)(kernel.schedule) + return kernel.copy(schedule=new_schedule) From eeeb1c02a2771946052e27eb29c9be6c72bbaee7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 20 May 2021 00:10:02 -0500 Subject: [PATCH 029/109] [wip] generates code for sequential loops --- loopy/codegen/__init__.py | 360 ++----------------- loopy/codegen/control.py | 10 +- loopy/codegen/instruction.py | 20 +- loopy/codegen/result.py | 512 +++++++++++++++++---------- loopy/schedule/tree.py | 6 + loopy/target/__init__.py | 40 ++- loopy/target/c/__init__.py | 115 +++--- loopy/target/c/codegen/expression.py | 49 ++- 
loopy/target/cuda.py | 19 +- loopy/target/ispc.py | 41 +-- loopy/target/numba.py | 20 +- loopy/target/opencl.py | 22 +- loopy/target/pyopencl.py | 34 +- loopy/target/python.py | 52 +-- 14 files changed, 552 insertions(+), 748 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 634d9e306..3488a2a5e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -23,9 +23,8 @@ import logging logger = logging.getLogger(__name__) -import islpy as isl +from loopy.diagnostic import LoopyError -from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord from pytools.persistent_dict import WriteOncePersistentDict @@ -133,8 +132,6 @@ def __init__(self, target, name, dtype, arg_class, # }}} -# {{{ code generation state - class UnvectorizableError(Exception): pass @@ -176,252 +173,6 @@ def __init__(self, name, c_name, arg_dtypes, result_dtypes): result_dtypes=result_dtypes) -class CodeGenerationState: - """ - .. attribute:: kernel - .. attribute:: target - .. attribute:: implemented_data_info - - a list of :class:`ImplementedDataInfo` objects. - - .. attribute:: implemented_domain - - The entire implemented domain (as an :class:`islpy.Set`) - i.e. all constraints that have been enforced so far. - - .. attribute:: implemented_predicates - - A :class:`frozenset` of predicates for which checks have been - implemented. - - .. attribute:: seen_dtypes - - set of dtypes that were encountered - - .. attribute:: seen_functions - - set of :class:`SeenFunction` instances - - .. attribute:: seen_atomic_dtypes - - .. attribute:: var_subst_map - - .. attribute:: allow_complex - - .. attribute:: vectorization_info - - None or an instance of :class:`VectorizationInfo` - - .. attribute:: is_generating_device_code - - .. attribute:: gen_program_name - - None (indicating that host code is being generated) - or the name of the device program currently being - generated. - - .. attribute:: schedule_index_end - - .. 
attribute:: callables_table - - A mapping from callable names to instances of - :class:`loopy.kernel.function_interface.InKernelCallable`. - - .. attribute:: is_entrypoint - - A :class:`bool` to indicate if the code is being generated for an - entrypoint kernel - - .. attribute:: codegen_cache_manager - - An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`. - """ - - def __init__(self, kernel, target, - implemented_data_info, implemented_domain, implemented_predicates, - seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, - allow_complex, - callables_table, - is_entrypoint, - vectorization_info=None, var_name_generator=None, - is_generating_device_code=None, - gen_program_name=None, - schedule_index_end=None, - codegen_cachemanager=None): - self.kernel = kernel - self.target = target - self.implemented_data_info = implemented_data_info - self.implemented_domain = implemented_domain - self.implemented_predicates = implemented_predicates - self.seen_dtypes = seen_dtypes - self.seen_functions = seen_functions - self.seen_atomic_dtypes = seen_atomic_dtypes - self.var_subst_map = var_subst_map.copy() - self.allow_complex = allow_complex - self.callables_table = callables_table - self.is_entrypoint = is_entrypoint - self.vectorization_info = vectorization_info - self.var_name_generator = var_name_generator - self.is_generating_device_code = is_generating_device_code - self.gen_program_name = gen_program_name - self.schedule_index_end = schedule_index_end - self.codegen_cachemanager = codegen_cachemanager - - # {{{ copy helpers - - def copy(self, kernel=None, target=None, implemented_data_info=None, - implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, is_entrypoint=None, vectorization_info=None, - is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): - - if kernel is None: - kernel = self.kernel - - if target is None: - target = self.target - - if implemented_data_info is None: - 
implemented_data_info = self.implemented_data_info - - if is_entrypoint is None: - is_entrypoint = self.is_entrypoint - - if vectorization_info is False: - vectorization_info = None - - elif vectorization_info is None: - vectorization_info = self.vectorization_info - - if is_generating_device_code is None: - is_generating_device_code = self.is_generating_device_code - - if gen_program_name is None: - gen_program_name = self.gen_program_name - - if schedule_index_end is None: - schedule_index_end = self.schedule_index_end - - return CodeGenerationState( - kernel=kernel, - target=target, - implemented_data_info=implemented_data_info, - implemented_domain=implemented_domain or self.implemented_domain, - implemented_predicates=( - implemented_predicates or self.implemented_predicates), - seen_dtypes=self.seen_dtypes, - seen_functions=self.seen_functions, - seen_atomic_dtypes=self.seen_atomic_dtypes, - var_subst_map=var_subst_map or self.var_subst_map, - allow_complex=self.allow_complex, - callables_table=self.callables_table, - is_entrypoint=is_entrypoint, - vectorization_info=vectorization_info, - var_name_generator=self.var_name_generator, - is_generating_device_code=is_generating_device_code, - gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end, - codegen_cachemanager=self.codegen_cachemanager.with_kernel(kernel), - ) - - def copy_and_assign(self, name, value): - """Make a copy of self with variable *name* fixed to *value*.""" - var_subst_map = self.var_subst_map.copy() - var_subst_map[name] = value - return self.copy(var_subst_map=var_subst_map) - - def copy_and_assign_many(self, assignments): - """Make a copy of self with *assignments* included.""" - - var_subst_map = self.var_subst_map.copy() - var_subst_map.update(assignments) - return self.copy(var_subst_map=var_subst_map) - - # }}} - - @property - def expression_to_code_mapper(self): - return self.ast_builder.get_expression_to_code_mapper(self) - - def intersect(self, other): - 
new_impl, new_other = isl.align_two(self.implemented_domain, other) - return self.copy(implemented_domain=new_impl & new_other) - - def fix(self, iname, aff): - new_impl_domain = self.implemented_domain - - impl_space = self.implemented_domain.get_space() - if iname not in impl_space.get_var_dict(): - new_impl_domain = (new_impl_domain - .add_dims(isl.dim_type.set, 1) - .set_dim_name( - isl.dim_type.set, - new_impl_domain.dim(isl.dim_type.set), - iname)) - impl_space = new_impl_domain.get_space() - - from loopy.isl_helpers import iname_rel_aff - iname_plus_lb_aff = iname_rel_aff(impl_space, iname, "==", aff) - - from loopy.symbolic import pw_aff_to_expr - cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff) - expr = pw_aff_to_expr(aff) - - new_impl_domain = new_impl_domain.add_constraint(cns) - return self.copy_and_assign(iname, expr).copy( - implemented_domain=new_impl_domain) - - def try_vectorized(self, what, func): - """If *self* is in a vectorizing state (:attr:`vectorization_info` is - not None), tries to call func (which must be a callable accepting a - single :class:`CodeGenerationState` argument). If this fails with - :exc:`UnvectorizableError`, it unrolls the vectorized loop instead. - - *func* should return a :class:`GeneratedCode` instance. 
- - :returns: :class:`GeneratedCode` - """ - - if self.vectorization_info is None: - return func(self) - - try: - return func(self) - except UnvectorizableError as e: - warn(self.kernel, "vectorize_failed", - "Vectorization of '%s' failed because '%s'" - % (what, e)) - - return self.unvectorize(func) - - def unvectorize(self, func): - vinf = self.vectorization_info - result = [] - novec_self = self.copy(vectorization_info=False) - - for i in range(vinf.length): - idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i - new_codegen_state = novec_self.fix(vinf.iname, idx_aff) - generated = func(new_codegen_state) - - if isinstance(generated, list): - result.extend(generated) - else: - result.append(generated) - - from loopy.codegen.result import merge_codegen_results - return merge_codegen_results(self, result) - - @property - def ast_builder(self): - if self.is_generating_device_code: - return self.kernel.target.get_device_ast_builder() - else: - return self.kernel.target.get_host_ast_builder() - -# }}} - - code_gen_cache = WriteOncePersistentDict( "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -459,7 +210,6 @@ class PreambleInfo(ImmutableRecord): .. attribute:: seen_dtypes .. attribute:: seen_functions .. attribute:: seen_atomic_dtypes - .. 
attribute:: codegen_state """ @@ -478,17 +228,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ make_schedule_tree - - from loopy.schedule.tree import (make_schedule_tree, - insert_predicates_into_schedule) - kernel = make_schedule_tree(kernel) - kernel = insert_predicates_into_schedule(kernel) - print(kernel) - 1/0 - - # }}} - codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") # {{{ pre-codegen-process of non-entrypoint kernel @@ -505,86 +244,36 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, # }}} - # {{{ examine arg list - - from loopy.kernel.data import ValueArg - from loopy.kernel.array import ArrayBase - - implemented_data_info = [] - - for arg in kernel.args: - is_written = arg.name in kernel.get_written_variables() - if isinstance(arg, ArrayBase): - implemented_data_info.extend( - arg.decl_info( - target, - is_written=is_written, - index_dtype=kernel.index_dtype)) - - elif isinstance(arg, ValueArg): - implemented_data_info.append(ImplementedDataInfo( - target=target, - name=arg.name, - dtype=arg.dtype, - arg_class=ValueArg, - is_written=is_written)) - - else: - raise ValueError("argument type not understood: '%s'" % type(arg)) + # {{{ make_schedule_tree - allow_complex = False - for var in kernel.args + list(kernel.temporary_variables.values()): - if var.dtype.involves_complex(): - allow_complex = True + from loopy.schedule.tree import (make_schedule_tree, + insert_predicates_into_schedule) + kernel = make_schedule_tree(kernel) + kernel = insert_predicates_into_schedule(kernel) # }}} - seen_dtypes = set() - seen_functions = set() - seen_atomic_dtypes = set() + from loopy.codegen.result import get_idis_for_kernel, CodeGenMapper + codegen_mapper = CodeGenMapper(kernel) - initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + codegen_result = codegen_mapper(kernel.schedule) - 
from loopy.codegen.tools import CodegenOperationCacheManager + seen_dtypes = (codegen_mapper.device_ast_builder.seen_dtypes + | codegen_mapper.host_ast_builder.seen_dtypes) + seen_atomic_dtypes = (codegen_mapper.device_ast_builder.seen_atomic_dtypes + | codegen_mapper.host_ast_builder.seen_atomic_dtypes) + seen_functions = (codegen_mapper.device_ast_builder.seen_functions + | codegen_mapper.host_ast_builder.seen_functions) - codegen_state = CodeGenerationState( - kernel=kernel, - target=target, - implemented_data_info=implemented_data_info, - implemented_domain=initial_implemented_domain, - implemented_predicates=frozenset(), - seen_dtypes=seen_dtypes, - seen_functions=seen_functions, - seen_atomic_dtypes=seen_atomic_dtypes, - var_subst_map={}, - allow_complex=allow_complex, - var_name_generator=kernel.get_var_name_generator(), - is_generating_device_code=False, - gen_program_name=( - target.host_program_name_prefix - + kernel.name - + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.linearization), - callables_table=callables_table, - is_entrypoint=is_entrypoint, - codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel), - ) - - from loopy.codegen.result import generate_host_or_device_program - - codegen_result = generate_host_or_device_program( - codegen_state, - schedule_index=0) - - device_code_str = codegen_result.device_code() - - from loopy.check import check_implemented_domains - assert check_implemented_domains(kernel, codegen_result.implemented_domains, - device_code_str) + # FIXME: Fix this!!! 
+ # from loopy.check import check_implemented_domains + # assert check_implemented_domains(kernel, + # codegen_result.implemented_domains, + # codegen_result.device_code()) # {{{ handle preambles - for idi in codegen_state.implemented_data_info: + for idi in get_idis_for_kernel(kernel): seen_dtypes.add(idi.dtype) for tv in kernel.temporary_variables.values(): @@ -602,7 +291,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, seen_functions=seen_functions, # a set of LoopyTypes (!) seen_atomic_dtypes=seen_atomic_dtypes, - codegen_state=codegen_state ) preamble_generators = (kernel.preamble_generators @@ -614,12 +302,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, # }}} - # For faster unpickling in the common case when implemented_domains isn't needed. - from loopy.tools import LazilyUnpicklingDict - codegen_result = codegen_result.copy( - implemented_domains=LazilyUnpicklingDict( - codegen_result.implemented_domains)) - codegen_plog.done() return codegen_result diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 8d98196f2..8adf97fa7 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,7 +23,6 @@ THE SOFTWARE. """ -from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, @@ -31,16 +30,17 @@ from loopy.diagnostic import LoopyError -def synthesize_idis_for_extra_args(kernel, schedule_index): +def synthesize_idis_for_extra_args(kernel, sched_item): """ + :arg kernel: An instance of :class:`loopy.LoopKernel`. + :arg sched_item: An instance of :class:`loopy.schedule.tree.Function`. 
:returns: A list of :class:`loopy.codegen.ImplementedDataInfo` """ - sched_item = kernel.linearization[schedule_index] - from loopy.codegen import ImplementedDataInfo from loopy.kernel.data import InameArg, AddressSpace + from loopy.schedule.tree import Function - assert isinstance(sched_item, CallKernel) + assert isinstance(sched_item, Function) idis = [] diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 713254075..7f1c91eff 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -104,20 +104,16 @@ def generate_instruction_code(codegen_state, insn): ast) -def generate_assignment_instruction_code(codegen_state, insn): - kernel = codegen_state.kernel - - ecm = codegen_state.expression_to_code_mapper - +def generate_assignment_instruction_code(kernel, insn, ast_builder, vinfo): + ecm = ast_builder.get_expression_to_code_mapper(kernel) from loopy.expression import VectorizabilityChecker # {{{ vectorization handling - if codegen_state.vectorization_info: + if vinfo is not None: if insn.atomicity: raise UnvectorizableError("atomic operation") - vinfo = codegen_state.vectorization_info vcheck = VectorizabilityChecker( kernel, vinfo.iname, vinfo.length) lhs_is_vector = vcheck(insn.assignee) @@ -158,22 +154,22 @@ def generate_assignment_instruction_code(codegen_state, insn): del lhs - result = codegen_state.ast_builder.emit_assignment(codegen_state, insn) + result = ast_builder.emit_assignment(kernel, insn) # {{{ tracing - lhs_dtype = codegen_state.kernel.get_var_descriptor(assignee_var_name).dtype + lhs_dtype = kernel.get_var_descriptor(assignee_var_name).dtype if kernel.options.trace_assignments or kernel.options.trace_assignment_values: - if codegen_state.vectorization_info and is_vector: + if vinfo and is_vector: raise UnvectorizableError("tracing does not support vectorization") from pymbolic.mapper.stringifier import PREC_NONE - lhs_code = codegen_state.expression_to_code_mapper(insn.assignee, PREC_NONE) + lhs_code 
= ecm(insn.assignee, PREC_NONE) from cgen import Statement as S # noqa - gs, ls = kernel.get_grid_size_upper_bounds(codegen_state.callables_table) + gs, ls = kernel.get_grid_size_upper_bounds(callables_table) printf_format = "{}.{}[{}][{}]: {}".format( kernel.name, diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7523c11d7..97a8efb2a 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -20,6 +20,10 @@ THE SOFTWARE. """ +from loopy.codegen import VectorizationInfo +from loopy.schedule.tree import CombineMapper +from dataclasses import dataclass +from typing import Optional, Any, List, Union from pytools import ImmutableRecord @@ -46,10 +50,6 @@ def process_preambles(preambles): .. autoclass:: GeneratedProgram .. autoclass:: CodeGenerationResult - -.. autofunction:: merge_codegen_results - -.. autofunction:: generate_host_or_device_program """ @@ -65,12 +65,6 @@ class GeneratedProgram(ImmutableRecord): Once generated, this captures the AST of the overall function definition, including the body. - - .. attribute:: body_ast - - Once generated, this captures the AST of the operative function - body (including declaration of necessary temporaries), but not - the overall function definition. """ @@ -82,11 +76,6 @@ class CodeGenerationResult(ImmutableRecord): A list of :class:`GeneratedProgram` instances intended to run on the compute device. - .. attribute:: implemented_domains - - A mapping from instruction ID to a list of :class:`islpy.Set` - objects. - .. attribute:: host_preambles .. attribute:: device_preambles @@ -99,29 +88,12 @@ class CodeGenerationResult(ImmutableRecord): a list of :class:`loopy.codegen.ImplementedDataInfo` objects. Only added at the very end of code generation. 
""" - - @staticmethod - def new(codegen_state, insn_id, ast, implemented_domain): - prg = GeneratedProgram( - name=codegen_state.gen_program_name, - is_device_program=codegen_state.is_generating_device_code, - ast=ast) - - if codegen_state.is_generating_device_code: - kwargs = { - "host_program": None, - "device_programs": [prg], - } - else: - kwargs = { - "host_program": prg, - "device_programs": [], - } - - return CodeGenerationResult( - implemented_data_info=codegen_state.implemented_data_info, - implemented_domains={insn_id: [implemented_domain]}, - **kwargs) + def __init__(self, host_program, device_programs, host_preambles=[], + device_preambles=[]): + super().__init__(host_program=host_program, + device_programs=device_programs, + host_preambles=host_preambles, + device_preambles=device_preambles) def host_code(self): preamble_codes = process_preambles(getattr(self, "host_preambles", [])) @@ -153,180 +125,328 @@ def all_code(self): + "\n\n" + str(self.host_program.ast)) - def current_program(self, codegen_state): - if codegen_state.is_generating_device_code: - if self.device_programs: - result = self.device_programs[-1] - else: - result = None - else: - result = self.host_program - - if result is None: - ast = codegen_state.ast_builder.ast_block_class([]) - result = GeneratedProgram( - name=codegen_state.gen_program_name, - is_device_program=codegen_state.is_generating_device_code, - ast=ast) - - assert result.name == codegen_state.gen_program_name - return result - - def with_new_program(self, codegen_state, program): - if codegen_state.is_generating_device_code: - assert program.name == codegen_state.gen_program_name - assert program.is_device_program - return self.copy( - device_programs=( - self.device_programs[:-1] - + - [program])) - else: - assert program.name == codegen_state.gen_program_name - assert not program.is_device_program - return self.copy(host_program=program) +# }}} - def current_ast(self, codegen_state): - return 
self.current_program(codegen_state).ast - def with_new_ast(self, codegen_state, new_ast): - return self.with_new_program( - codegen_state, - self.current_program(codegen_state).copy( - ast=new_ast)) +def get_idis_for_kernel(kernel): + """ + Returns a :class:`list` of :class:`~loopy.codegen.ImplementedDataInfo` for + *kernel*. -# }}} + :arg kernel: An instance of :class:`loopy.LoopKernel`. + """ + from loopy.kernel.data import ValueArg + from loopy.kernel.array import ArrayBase + from loopy.codegen import ImplementedDataInfo + + implemented_data_info = [] + + for arg in kernel.args: + is_written = arg.name in kernel.get_written_variables() + if isinstance(arg, ArrayBase): + implemented_data_info.extend( + arg.decl_info( + kernel.target, + is_written=is_written, + index_dtype=kernel.index_dtype)) + elif isinstance(arg, ValueArg): + implemented_data_info.append(ImplementedDataInfo( + target=kernel.target, + name=arg.name, + dtype=arg.dtype, + arg_class=ValueArg, + is_written=is_written)) + else: + raise ValueError("argument type not understood: '%s'" % type(arg)) + return implemented_data_info -# {{{ support code for AST merging -def merge_codegen_results(codegen_state, elements, collapse=True): - elements = [el for el in elements if el is not None] +@dataclass(frozen=True) +class CodeGenerationContext: + """ + A context passed around while traversing the schedule tree to generate the + target AST. 
+ """ + in_device: bool + vectorization_info: Optional[VectorizationInfo] = None - if not elements: - return CodeGenerationResult( - host_program=None, - device_programs=[], - implemented_domains={}, - implemented_data_info=codegen_state.implemented_data_info) + def copy(self, *, in_device=None, vectorization_info=None): + if in_device is None: + in_device = self.in_device - ast_els = [] - new_device_programs = [] - dev_program_names = set() - implemented_domains = {} - codegen_result = None + if vectorization_info is None: + vectorization_info = self.vectorization_info - block_cls = codegen_state.ast_builder.ast_block_class - block_scope_cls = codegen_state.ast_builder.ast_block_scope_class + return CodeGenerationContext( + in_device=in_device, + vectorization_info=vectorization_info) - for el in elements: - if isinstance(el, CodeGenerationResult): - if codegen_result is None: - codegen_result = el - else: - assert ( - el.current_program(codegen_state).name - == codegen_result.current_program(codegen_state).name) - - for insn_id, idoms in el.implemented_domains.items(): - implemented_domains.setdefault(insn_id, []).extend(idoms) - - if not codegen_state.is_generating_device_code: - for dp in el.device_programs: - if dp.name not in dev_program_names: - new_device_programs.append(dp) - dev_program_names.add(dp.name) - - cur_ast = el.current_ast(codegen_state) - if (isinstance(cur_ast, block_cls) - and not isinstance(cur_ast, block_scope_cls)): - ast_els.extend(cur_ast.contents) + +# {{{ program generation top-level + +@dataclass(frozen=True) +class CodeGenMapperAccumulator: + host_ast: List[Union[Any]] + device_ast: List[Union[GeneratedProgram, Any]] + + +class CodeGenMapper(CombineMapper): + def __init__(self, kernel): + self.kernel = kernel + self.host_ast_builder = kernel.target.get_host_ast_builder() + self.device_ast_builder = kernel.target.get_device_ast_builder() + + def combine(self, accumulators): + + def _is_a_list_of_generated_program(ast): + return 
(isinstance(ast, list) and all(isinstance(el, GeneratedProgram) + for el in ast)) + + def _is_a_list_of_ast_nodes(astb, ast): + return (isinstance(ast, list) and all(isinstance(el, astb.ast_base_class) + for el in ast)) + + # either all of them are programs or all of them are ASTs + assert (all(_is_a_list_of_generated_program(acc.device_ast) + for acc in accumulators) + or all(_is_a_list_of_ast_nodes(self.device_ast_builder, + acc.device_ast) + for acc in accumulators)) + + # for each accumulator + assert all(_is_a_list_of_ast_nodes(self.host_ast_builder, acc.host_ast) + for acc in accumulators) + + host_components = [] + dev_components = [] + + for acc in accumulators: + if acc.host_ast is not None: + host_components.extend(acc.host_ast) + + if acc.device_ast is not None: + dev_components.extend(acc.device_ast) + + return CodeGenMapperAccumulator(host_components, + dev_components) + + def map_schedule(self, expr): + from loopy.kernel.data import AddressSpace + + children_res = self.combine([self.rec(child, CodeGenerationContext(False)) + for child in expr.children]) + + for tv in self.kernel.temporary_variables.items(): + if tv.address_space == AddressSpace.GLOBAL and ( + tv.initializer is not None): + # prepend the initializer atop the code. 
+ raise NotImplementedError + + """ + for tv in sorted( + kernel.temporary_variables.values(), + key=lambda tv: tv.name): + + if tv.address_space == AddressSpace.GLOBAL and ( + tv.initializer is not None): + assert tv.read_only + + decl_info, = tv.decl_info(self.target, + index_dtype=kernel.index_dtype) + decl = self.wrap_global_constant( + self.get_temporary_decl( + codegen_state, schedule_index, tv, + decl_info)) + + if tv.initializer is not None: + decl = Initializer(decl, generate_array_literal( + codegen_state, tv, tv.initializer)) + + result.append(decl) + """ + assert all(isinstance(el, GeneratedProgram) + for el in children_res.device_ast) + + host_fn_body_ast = self.host_ast_builder.ast_block_class(children_res + .host_ast) + + idis = get_idis_for_kernel(self.kernel) + host_fn_name = (self.kernel.target.host_program_name_prefix + + self.kernel.name + + self.kernel.target.host_program_name_suffix) + host_fn_decl = (self + .host_ast_builder + .get_function_declaration(self.kernel, host_fn_name, idis, + is_generating_device_code=True)) + host_fn_ast = (self + .host_ast_builder + .get_function_definition(self.kernel, host_fn_name, idis, + host_fn_decl, host_fn_body_ast)) + + host_prog = GeneratedProgram(name=host_fn_name, is_device_program=False, + ast=host_fn_ast) + + return CodeGenerationResult(host_prog, children_res.device_ast) + + def map_function(self, expr, context): + from loopy.codegen.control import synthesize_idis_for_extra_args + assert not context.in_device + + # {{{ Host-side: call the kernel + + from loopy.schedule.tree import InstructionGatherer + gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids_as_exprs( + InstructionGatherer()(expr)) + idis = (get_idis_for_kernel(self.kernel) + + synthesize_idis_for_extra_args(self.kernel, expr)) + + dev_fn_decl = (self + .device_ast_builder + .get_function_declaration(self.kernel, expr.name, idis, + is_generating_device_code=True)) + host_ast = self.host_ast_builder.get_kernel_call(self.kernel, + 
expr.name, idis, + expr.extra_args) + + # }}} + + # {{{ Device side: Define the kernel + + dwnstrm_ctx = context.copy(in_device=True) + + dev_fn_decl = (self + .device_ast_builder + .get_function_declaration(self.kernel, expr.name, idis, + is_generating_device_code=True)) + children_res = self.combine([self.rec(child, dwnstrm_ctx) + for child in expr.children]) + dev_fn_body_ast = self.device_ast_builder.ast_block_class(children_res + .device_ast) + assert children_res.host_ast == [] + + dev_fn_ast = (self + .device_ast_builder + .get_function_definition(self.kernel, expr.name, idis, + dev_fn_decl, dev_fn_body_ast)) + + dev_prog = GeneratedProgram(name=expr.name, is_device_program=True, + ast=dev_fn_ast) + + # }}} + + return CodeGenMapperAccumulator([host_ast], [dev_prog]) + + # {{{ for loop + + def map_for(self, expr, context): + from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, + VectorizeTag, LoopedIlpTag, + ForceSequentialTag, + InOrderSequentialSequentialTag) + + unr_tags = (UnrolledIlpTag, UnrollTag) + vec_tags = (VectorizeTag, ) + seq_tags = (LoopedIlpTag, ForceSequentialTag, + InOrderSequentialSequentialTag) + ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 + + if self.kernel.iname_tags_of_type(expr.iname, unr_tags): + dwnstrm_ctx = 1/0 + raise NotImplementedError + elif self.kernel.iname_tags_of_type(expr.iname, vec_tags): + dwnstrm_ctx = 1/0 + raise NotImplementedError + else: + assert (len(self.kernel.inames[expr.iname].tags) == 0 + or self.kernel.iname_tags_of_type(expr.iname, + seq_tags)) + + dwnstrm_ctx = context.copy(vectorization_info=None) + children_res = self.combine([self.rec(child, dwnstrm_ctx) + for child in expr.children]) + loop_body = ast_builder.ast_block_class(children_res.device_ast + if context.in_device + else children_res.host_ast) + assert expr.step == 1 + loop_ast = ast_builder.emit_sequential_loop(self.kernel, expr.iname, + self.kernel.index_dtype, + expr.lower_bound, + 
expr.upper_bound, + loop_body + ) + if context.in_device: + return CodeGenMapperAccumulator(host_ast=children_res.host_ast, + device_ast=[loop_ast]) else: - ast_els.append(cur_ast) + return CodeGenMapperAccumulator(host_ast=[loop_ast], + device_ast=children_res.device_ast) + + # }}} + # {{{ If + + def map_if(self, expr, context): + ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 + children_res = self.combine([self.rec(child, context) + for child in expr.children]) + + if_body = ast_builder.ast_block_class(children_res.device_ast + if context.in_device + else children_res.host_ast) + + if_ast = ast_builder.emit_if(expr.condition, + if_body) + + if context.in_device: + return CodeGenMapperAccumulator(host_ast=children_res.host_ast, + device_ast=[if_ast]) else: - ast_els.append(el) - - if collapse and len(ast_els) == 1: - ast, = ast_els - else: - ast = block_cls(ast_els) - - kwargs = {} - if not codegen_state.is_generating_device_code: - kwargs["device_programs"] = new_device_programs - - return (codegen_result - .with_new_ast(codegen_state, ast) - .copy( - implemented_domains=implemented_domains, - implemented_data_info=codegen_state.implemented_data_info, - **kwargs)) - - -def wrap_in_if(codegen_state, condition_exprs, inner): - if condition_exprs: - from pymbolic.primitives import LogicalAnd - from pymbolic.mapper.stringifier import PREC_NONE - cur_ast = inner.current_ast(codegen_state) - return inner.with_new_ast( - codegen_state, - codegen_state.ast_builder.emit_if( - codegen_state.expression_to_code_mapper( - LogicalAnd(tuple(condition_exprs)), PREC_NONE), - cur_ast)) - - return inner + return CodeGenMapperAccumulator(host_ast=[if_ast], + device_ast=children_res.device_ast) -# }}} + # }}} + def map_barrier(self, expr, context): + # ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 + raise NotImplementedError -# {{{ program generation top-level + # {{{ instruction 
+ + def map_run_instruction(self, expr, context): + from loopy.kernel.instruction import (CallInstruction, Assignment, + CInstruction, NoOpInstruction) + from loopy.codegen.instruction import generate_assignment_instruction_code + + ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 + + insn = self.kernel.id_to_insn[expr.insn_id] + + if isinstance(insn, CallInstruction): + raise NotImplementedError + elif isinstance(insn, Assignment): + insn_ast = generate_assignment_instruction_code(self.kernel, insn, + ast_builder, + (context + .vectorization_info)) + elif isinstance(insn, CInstruction): + raise NotImplementedError + elif isinstance(insn, NoOpInstruction): + raise NotImplementedError + else: + raise NotImplementedError + + if context.in_device: + return CodeGenMapperAccumulator(host_ast=[], + device_ast=[insn_ast]) + else: + return CodeGenMapperAccumulator(host_ast=[insn_ast], + device_ast=[]) + + # }}} -def generate_host_or_device_program(codegen_state, schedule_index): - ast_builder = codegen_state.ast_builder - temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index) - - from functools import partial - - from loopy.codegen.control import build_loop_nest - if codegen_state.is_generating_device_code: - from loopy.schedule import CallKernel - assert isinstance(codegen_state.kernel.linearization[schedule_index], - CallKernel) - - from loopy.codegen.loop import set_up_hw_parallel_loops - codegen_result = set_up_hw_parallel_loops( - codegen_state, schedule_index, - next_func=partial(build_loop_nest, - schedule_index=schedule_index + 1)) - else: - codegen_result = build_loop_nest(codegen_state, schedule_index) - - if (codegen_state.is_generating_device_code - or codegen_state.is_entrypoint): - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = 
codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) - - return codegen_result + def map_loop(self, expr, context): + raise RuntimeError("Cannot handle loops. At this point every loop" + " should have been resolved as 'For' nodes.") # }}} diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index e20449cef..a5d222e48 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -507,3 +507,9 @@ def insert_predicates_into_schedule(kernel): assert isinstance(kernel.schedule, Schedule) new_schedule = PredicateInsertionMapper(kernel)(kernel.schedule) return kernel.copy(schedule=new_schedule) + + +def get_insns_in_function(kernel, name): + function, = [child for child in kernel.schedule.children + if isinstance(child, Function) and child.name == name] + return InstructionGatherer()(function) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a6357a12b..bc9c476f5 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -158,6 +158,10 @@ class ASTBuilderBase: def __init__(self, target): self.target = target + self.seen_dtypes = set() + self.seen_functions = set() + self.seen_atomic_dtypes = set() + # {{{ library @property @@ -184,12 +188,12 @@ def preamble_generators(self): def ast_module(self): raise NotImplementedError() - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): raise NotImplementedError - def get_function_declaration(self, 
codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, kernel, name, implemented_data_info, + is_generating_device_code): raise NotImplementedError def generate_top_of_body(self, codegen_state): @@ -198,13 +202,25 @@ def generate_top_of_body(self, codegen_state): def get_temporary_decls(self, codegen_state, schedule_index): raise NotImplementedError - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): raise NotImplementedError + @property + def ast_base_class(self): + raise NotImplementedError() + @property def ast_block_class(self): raise NotImplementedError() + @property + def ast_for_class(self): + raise NotImplementedError() + + @property + def ast_if_class(self): + raise NotImplementedError() + def get_expression_to_code_mapper(self, codegen_state): raise NotImplementedError() @@ -233,8 +249,8 @@ def emit_assignment(self, codegen_state, insn): def emit_multiple_assignment(self, codegen_state, insn): raise NotImplementedError() - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + def emit_sequential_loop(self, kernel, iname, iname_dtype, + lbound, ubound, inner): raise NotImplementedError() @property @@ -280,12 +296,12 @@ def __str__(self): class DummyHostASTBuilder(ASTBuilderBase): - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): return function_body - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, kernel, name, implemented_data_info, + is_generating_device_code): return None def get_temporary_decls(self, codegen_state, schedule_index): @@ -294,7 +310,7 @@ def get_temporary_decls(self, codegen_state, schedule_index): def 
get_expression_to_code_mapper(self, codegen_state): return _DummyExpressionToCodeMapper() - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): return None @property diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 0058ad47f..78c4acf85 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -722,58 +722,11 @@ def known_callables(self): # {{{ code generation - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, - function_decl, function_body): - kernel = codegen_state.kernel - - from cgen import ( - FunctionBody, - - # Post-mid-2016 cgens have 'Collection', too. - Module as Collection, - Initializer, - Line) - - result = [] - - from loopy.kernel.data import AddressSpace - from loopy.schedule import CallKernel - # We only need to write declarations for global variables with - # the first device program. `is_first_dev_prog` determines - # whether this is the first device program in the schedule. 
- is_first_dev_prog = codegen_state.is_generating_device_code - for i in range(schedule_index): - if isinstance(kernel.linearization[i], CallKernel): - is_first_dev_prog = False - break - if is_first_dev_prog: - for tv in sorted( - kernel.temporary_variables.values(), - key=lambda tv: tv.name): - - if tv.address_space == AddressSpace.GLOBAL and ( - tv.initializer is not None): - assert tv.read_only - - decl_info, = tv.decl_info(self.target, - index_dtype=kernel.index_dtype) - decl = self.wrap_global_constant( - self.get_temporary_decl( - codegen_state, schedule_index, tv, - decl_info)) - - if tv.initializer is not None: - decl = Initializer(decl, generate_array_literal( - codegen_state, tv, tv.initializer)) - - result.append(decl) - + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): + from cgen import FunctionBody fbody = FunctionBody(function_decl, function_body) - if not result: - return fbody - else: - return Collection(result+[Line(), fbody]) + return fbody def idi_to_cgen_declarator(self, kernel, idi): from loopy.kernel.data import InameArg @@ -796,25 +749,25 @@ def idi_to_cgen_declarator(self, kernel, idi): else: return var_descr.get_arg_decl(self) - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, is_entrypoint): from cgen import FunctionDeclaration, Value - name = codegen_result.current_program(codegen_state).name if self.target.fortran_abi: name += "_" - if codegen_state.is_entrypoint: + if is_entrypoint: name = Value("void", name) else: name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( name, - [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info])) + [self.idi_to_cgen_declarator(kernel, idi) + for idi in implemented_data_info])) - def get_kernel_call(self, 
codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): return None def get_temporary_decls(self, codegen_state, schedule_index): @@ -952,11 +905,26 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result + @property + def ast_base_class(self): + from cgen import Generable + return Generable + @property def ast_block_class(self): from cgen import Block return Block + @property + def ast_for_class(self): + from cgen import For + return For + + @property + def ast_if_class(self): + from cgen import If + return If + @property def ast_block_scope_class(self): return ScopingBlock @@ -970,13 +938,14 @@ def ast_module(self): import cgen return cgen - def get_expression_to_code_mapper(self, codegen_state): - return self.get_expression_to_c_expression_mapper(codegen_state) + def get_expression_to_code_mapper(self, kernel, callables_table): + return self.get_expression_to_c_expression_mapper(kernel, + callables_table) - def get_expression_to_c_expression_mapper(self, codegen_state): + def get_expression_to_c_expression_mapper(self, kernel, callables_table): from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper return ExpressionToCExpressionMapper( - codegen_state, fortran_abi=self.target.fortran_abi) + kernel, callables_table, self, fortran_abi=self.target.fortran_abi) def get_c_expression_to_code_mapper(self): from loopy.target.c.codegen.expression import CExpressionToCodeMapper @@ -1053,13 +1022,13 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): return arg_decl - def emit_assignment(self, codegen_state, insn): - kernel = codegen_state.kernel - ecm = codegen_state.expression_to_code_mapper + def emit_assignment(self, kernel, insn): + + ecm = self.get_expression_to_code_mapper(kernel) assignee_var_name, = insn.assignee_var_names() - lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_var = 
kernel.get_var_descriptor(assignee_var_name) lhs_dtype = lhs_var.dtype if insn.atomicity is not None: @@ -1087,15 +1056,15 @@ def emit_assignment(self, codegen_state, insn): needed_dtype=lhs_dtype)) elif isinstance(lhs_atomicity, AtomicInit): - codegen_state.seen_atomic_dtypes.add(lhs_dtype) - return codegen_state.ast_builder.emit_atomic_init( + self.seen_atomic_dtypes.add(lhs_dtype) + return self.emit_atomic_init( codegen_state, lhs_atomicity, lhs_var, insn.assignee, insn.expression, lhs_dtype, rhs_type_context) elif isinstance(lhs_atomicity, AtomicUpdate): - codegen_state.seen_atomic_dtypes.add(lhs_dtype) - return codegen_state.ast_builder.emit_atomic_update( + self.seen_atomic_dtypes.add(lhs_dtype) + return self.emit_atomic_update( codegen_state, lhs_atomicity, lhs_var, insn.assignee, insn.expression, lhs_dtype, rhs_type_context) @@ -1160,9 +1129,9 @@ def emit_multiple_assignment(self, codegen_state, insn): CExpression(self.get_c_expression_to_code_mapper(), in_knl_callable_as_call)) - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): - ecm = codegen_state.expression_to_code_mapper + def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, + inner): + ecm = self.get_expression_to_code_mapper(kernel) from pymbolic import var from pymbolic.primitives import Comparison diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index f54007e7e..f2b59349f 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -57,16 +57,22 @@ class ExpressionToCExpressionMapper(IdentityMapper): Mapper that converts a loopy-semantic expression to a C-semantic expression with typecasts, appropriate arithmetic semantic mapping, etc. 
""" - def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): - self.kernel = codegen_state.kernel - self.codegen_state = codegen_state + def __init__(self, kernel, callables_table, ast_builder, + vectorization_info=None, fortran_abi=False, + type_inf_mapper=None): + self.kernel = kernel + self.callables_table = callables_table + self.ast_builder = ast_builder if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, - self.codegen_state.callables_table) + callables_table) + self.type_inf_mapper = type_inf_mapper - self.allow_complex = codegen_state.allow_complex + # TODO: rewire mapper methods so that we don't store vectorization_info + # as a state, but instead pass it as an argument to each mapper method + self.vectorization_info = vectorization_info self.fortran_abi = fortran_abi @@ -74,13 +80,14 @@ def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None): def with_assignments(self, names_to_vars): type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars) - return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper) + return type(self)(self.kernel, self.vectorization_info, + self.fortran_abi, type_inf_mapper) def infer_type(self, expr): result = self.type_inf_mapper(expr) assert isinstance(result, LoopyType) - self.codegen_state.seen_dtypes.add(result) + self.ast_builder.seen_dtypes.add(result) return result def find_array(self, expr): @@ -124,7 +131,7 @@ def __call__(self, expr, prec=None, type_context=None, needed_dtype=None): assert prec == PREC_NONE return CExpression( - self.codegen_state.ast_builder.get_c_expression_to_code_mapper(), + self.ast_builder.get_c_expression_to_code_mapper(), self.rec(expr, type_context, needed_dtype)) # }}} @@ -135,17 +142,7 @@ def map_variable(self, expr, type_context): def postproc(x): return x - if expr.name in self.codegen_state.var_subst_map: - if self.kernel.options.annotate_inames: - return var( - "/* {} */ {}".format( - expr.name, - 
self.rec(self.codegen_state.var_subst_map[expr.name], - type_context))) - else: - return self.rec(self.codegen_state.var_subst_map[expr.name], - type_context) - elif expr.name in self.kernel.arg_dict: + if expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] from loopy.kernel.array import ArrayBase if isinstance(arg, ArrayBase): @@ -176,7 +173,7 @@ def postproc(x): or temporary.address_space == AddressSpace.GLOBAL): postproc = lambda x: x[0] # noqa - result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name) + result = self.kernel.mangle_symbol(self.ast_builder, expr.name) if result is not None: _, c_name = result return postproc(var(c_name)) @@ -209,15 +206,13 @@ def make_var(name): ary = self.find_array(expr) from loopy.kernel.array import get_access_info - from pymbolic import evaluate from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) access_info = get_access_info(self.kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + expr, self.vectorization_info) from loopy.kernel.data import ( ImageArg, ArrayArg, TemporaryVariable, ConstantArg) @@ -278,7 +273,7 @@ def make_var(name): self.kernel, self.rec(subscript, "i"))) if access_info.vector_index is not None: - return self.codegen_state.ast_builder.add_vector_access( + return self.ast_builder.add_vector_access( result, access_info.vector_index) else: return result @@ -341,7 +336,7 @@ def map_integer_div_operator(self, base_func_name, op_func, expr, type_context): def seen_func(name): from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( + self.ast_builder.seen_functions.add( SeenFunction( name, f"{name}_{suffix}", (result_dtype, result_dtype), @@ -398,7 +393,7 @@ def map_comparison(self, expr, type_context): self.rec(expr.right, inner_type_context)) def map_type_cast(self, expr, 
type_context): - registry = self.codegen_state.ast_builder.target.get_dtype_registry() + registry = self.ast_builder.target.get_dtype_registry() cast = var("(%s)" % registry.dtype_to_ctype(expr.type)) return cast(self.rec(expr.child, type_context)) @@ -511,7 +506,7 @@ def map_power(self, expr, type_context): func_name = ("loopy_pow_" f"{tgt_dtype.numpy_dtype}_{exponent_dtype.numpy_dtype}") - self.codegen_state.seen_functions.add( + self.ast_builder.seen_functions.add( SeenFunction( "int_pow", func_name, (tgt_dtype, exponent_dtype), diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 08413b615..13102b654 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -321,10 +321,12 @@ def known_callables(self): # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - fdecl = super().get_function_declaration( - codegen_state, codegen_result, schedule_index) + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, is_entrypoint): + fdecl = super().get_function_declaration(kernel, name, + implemented_data_info, + is_generating_device_code) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) @@ -337,12 +339,9 @@ def get_function_declaration(self, codegen_state, codegen_result, from cgen import Extern fdecl = Extern("C", fdecl) - from loopy.schedule import get_insn_ids_for_block_at - _, local_grid_size = \ - codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at( - codegen_state.kernel.linearization, schedule_index), - codegen_state.callables_table) + from loopy.schedule.tree import get_insns_in_function + _, local_grid_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insns_in_function(kernel, name), callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/ispc.py 
b/loopy/target/ispc.py index 3c6ff52b0..e46a10150 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -201,12 +201,12 @@ def get_dtype_registry(self): class ISPCASTBuilder(CFamilyASTBuilder): - def _arg_names_and_decls(self, codegen_state): - implemented_data_info = codegen_state.implemented_data_info + def _arg_names_and_decls(self, kernel, implemented_data_info): + implemented_data_info = implemented_data_info arg_names = [iai.name for iai in implemented_data_info] arg_decls = [ - self.idi_to_cgen_declarator(codegen_state.kernel, idi) + self.idi_to_cgen_declarator(kernel, idi) for idi in implemented_data_info] # {{{ occa compatibility hackery @@ -230,16 +230,14 @@ def _arg_names_and_decls(self, codegen_state): # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - name = codegen_result.current_program(codegen_state).name - + def get_function_declaration(self, name, kernel, implemented_data_info, + is_generating_device_code): from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + arg_names, arg_decls = self._arg_names_and_decls(kernel) - if codegen_state.is_generating_device_code: + if is_generating_device_code: result = ISPCTask( FunctionDeclaration( Value("void", name), @@ -255,19 +253,24 @@ def get_function_declaration(self, codegen_state, codegen_result, # }}} - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): - ecm = self.get_expression_to_code_mapper(codegen_state) + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + ecm = self.get_expression_to_code_mapper(kernel) + from loopy.schedule.tree import get_insns_in_function from pymbolic.mapper.stringifier import PREC_NONE - result = [] from cgen import Statement as S, Block + + gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insns_in_function(kernel, name)) + + result = [] 
if lsize: result.append( S( "assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE))) - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + arg_names, arg_decls = self._arg_names_and_decls(kernel) from cgen.ispc import ISPCLaunch result.append( @@ -283,7 +286,7 @@ def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): # {{{ code generation guts def get_expression_to_c_expression_mapper(self, codegen_state): - return ExprToISPCExprMapper(codegen_state) + return ExprToISPCExprMapper(codegen_state, self) def add_vector_access(self, access_expr, index): return access_expr[index] @@ -485,17 +488,15 @@ def emit_assignment(self, codegen_state, insn): from cgen import Assign return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): - ecm = codegen_state.expression_to_code_mapper - + def emit_sequential_loop(self, kernel, iname, iname_dtype, + lbound, ubound, inner): from loopy.target.c import POD - from pymbolic.mapper.stringifier import PREC_NONE from cgen import For, InlineInitializer - from cgen.ispc import ISPCUniform + ecm = self.get_expression_to_code_mapper(kernel) + return For( InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 2df81ec1f..bdca2739c 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -47,28 +47,32 @@ def preamble_generators(self): _base_numba_preamble_generator ]) - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, - function_decl, function_body): + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): assert function_decl is None from genpy import Function return Function( - codegen_result.current_program(codegen_state).name, - [idi.name for idi in codegen_state.implemented_data_info], + name, + [idi.name for idi in implemented_data_info], 
function_body, decorators=self.get_python_function_decorators()) def get_python_function_decorators(self): return () - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + ecm = self.get_expression_to_code_mapper(kernel) from pymbolic.mapper.stringifier import PREC_NONE from genpy import Statement - ecm = self.get_expression_to_code_mapper(codegen_state) - implemented_data_info = codegen_state.implemented_data_info + ecm = self.get_expression_to_code_mapper(kernel) + implemented_data_info = implemented_data_info + + from loopy.schedule.tree import get_insns_in_function + gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insns_in_function(kernel, name)) return Statement( "{}[{}, {}]({})".format( diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 89710c023..700d0c5fc 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -599,14 +599,18 @@ def preamble_generators(self): # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - fdecl = super().get_function_declaration( - codegen_state, codegen_result, schedule_index) + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, is_entrypoint): + assert is_generating_device_code + + fdecl = super().get_function_declaration(kernel, name, + implemented_data_info, + is_generating_device_code) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.is_entrypoint: + if not is_entrypoint: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature return fdecl @@ -616,11 +620,9 @@ def get_function_declaration(self, codegen_state, codegen_result, from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) - from loopy.schedule import 
get_insn_ids_for_block_at - _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at( - codegen_state.kernel.linearization, schedule_index), - codegen_state.callables_table) + from loopy.schedule.tree import get_insns_in_function + _, local_sizes = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insns_in_function(kernel, name), callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 06ff41908..6d17a1226 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -661,18 +661,18 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # {{{ code generation guts - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): from loopy.kernel.data import TemporaryVariable args = ( ["_lpy_cl_kernels", "queue"] - + [idi.name for idi in codegen_state.implemented_data_info + + [idi.name for idi in implemented_data_info if not issubclass(idi.arg_class, TemporaryVariable)] + ["wait_for=None", "allocator=None"]) from genpy import (For, Function, Suite, Return, Line, Statement as S) return Function( - codegen_result.current_program(codegen_state).name, + name, args, Suite([ Line(), @@ -684,22 +684,22 @@ def get_function_definition(self, codegen_state, codegen_result, For("_tv", "_global_temporaries", # free global temporaries S("_tv.release()")) - ] if self._get_global_temporaries(codegen_state) else [] + ] if self._get_global_temporaries(kernel) else [] ) + [ Line(), Return("_lpy_evt"), ])) - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, kernel, name, implemented_data_info, + is_generating_device_code): # no such thing in Python return None - def 
_get_global_temporaries(self, codegen_state): + def _get_global_temporaries(self, kernel): from loopy.kernel.data import AddressSpace return sorted( - (tv for tv in codegen_state.kernel.temporary_variables.values() + (tv for tv in kernel.temporary_variables.values() if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) @@ -766,22 +766,26 @@ def alloc_nbytes(tv): return code_lines - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): - ecm = self.get_expression_to_code_mapper(codegen_state) + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + ecm = self.get_expression_to_code_mapper(kernel) + + from loopy.schedule.tree import get_insns_in_function + gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insns_in_function(kernel, name)) if not gsize: gsize = (1,) if not lsize: lsize = (1,) - all_args = codegen_state.implemented_data_info + extra_args + all_args = implemented_data_info + extra_args value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ generate_value_arg_setup( - codegen_state.kernel, + kernel, all_args) arry_arg_code = generate_array_arg_setup( - codegen_state.kernel, + kernel, all_args, arg_idx_to_cl_arg_idx) @@ -855,7 +859,7 @@ def preamble_generators(self): # }}} def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToPyOpenCLCExpressionMapper(codegen_state) + return ExpressionToPyOpenCLCExpressionMapper(codegen_state, self) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index 15ddc4679..5068fb3f8 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -35,15 +35,17 @@ # {{{ expression to code class ExpressionToPythonMapper(StringifyMapper): - def __init__(self, codegen_state, type_inf_mapper=None): - self.kernel = codegen_state.kernel - self.codegen_state = codegen_state + def __init__(self, kernel, type_inf_mapper=None): + self.kernel = kernel if type_inf_mapper is None: type_inf_mapper = 
TypeReader(self.kernel, self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper + self.seen_functions = set() + self.seen_dtypes = set() + def handle_unsupported_expression(self, victim, enclosing_prec): return Mapper.handle_unsupported_expression(self, victim, enclosing_prec) @@ -56,12 +58,6 @@ def map_constant(self, expr, enclosing_prec): return repr(expr) def map_variable(self, expr, enclosing_prec): - if expr.name in self.codegen_state.var_subst_map: - # Unimplemented: annotate_inames - return str(self.rec( - self.codegen_state.var_subst_map[expr.name], - enclosing_prec)) - if expr.name in self.kernel.all_inames(): return super().map_variable( expr, enclosing_prec) @@ -161,20 +157,19 @@ def ast_module(self): import genpy return genpy - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, kernel, name, implemented_data_info, + is_generating_device_code): return None - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, - function_decl, function_body): + def get_function_definition(self, kernel, name, implemented_data_info, + function_decl, function_body): assert function_decl is None from genpy import Function return Function( - codegen_result.current_program(codegen_state).name, - [idi.name for idi in codegen_state.implemented_data_info], + name, + [idi.name for idi in implemented_data_info], function_body) def get_temporary_decls(self, codegen_state, schedule_index): @@ -204,13 +199,28 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result - def get_expression_to_code_mapper(self, codegen_state): - return ExpressionToPythonMapper(codegen_state) + def get_expression_to_code_mapper(self, kernel): + return ExpressionToPythonMapper(kernel) + + @property + def ast_base_class(self): + from genpy import Generable + return Generable @property def ast_block_class(self): return Suite + @property + def ast_for_class(self): + from 
genpy import For + return For + + @property + def ast_if_class(self): + from genpyt import If + return If + @property def ast_block_scope_class(self): # Once a new version of genpy is released, switch to this: @@ -218,9 +228,9 @@ def ast_block_scope_class(self): # and delete the implementation above. return Collection - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): - ecm = codegen_state.expression_to_code_mapper + def emit_sequential_loop(self, kernel, iname, iname_dtype, + lbound, ubound, inner): + ecm = self.get_expression_to_code_mapper(kernel) from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For From cdce7253ef49597db6bdff17970818295fbfbbaf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 21 May 2021 17:57:25 -0500 Subject: [PATCH 030/109] handle hw inames --- loopy/codegen/instruction.py | 7 +- loopy/codegen/result.py | 60 +++++++-- loopy/kernel/tools.py | 19 +++ loopy/schedule/tree.py | 179 ++++++++++++++++++++++----- loopy/target/__init__.py | 4 +- loopy/target/c/__init__.py | 24 ++-- loopy/target/c/codegen/expression.py | 23 +++- loopy/target/cuda.py | 4 +- loopy/target/ispc.py | 10 +- loopy/target/numba.py | 7 +- loopy/target/opencl.py | 9 +- loopy/target/pyopencl.py | 17 ++- loopy/target/python.py | 17 +-- 13 files changed, 294 insertions(+), 86 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 7f1c91eff..d743bab68 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -104,8 +104,9 @@ def generate_instruction_code(codegen_state, insn): ast) -def generate_assignment_instruction_code(kernel, insn, ast_builder, vinfo): - ecm = ast_builder.get_expression_to_code_mapper(kernel) +def generate_assignment_instruction_code(kernel, insn, ast_builder, + hw_inames_expr, vinfo): + ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr) from loopy.expression import VectorizabilityChecker # {{{ 
vectorization handling @@ -154,7 +155,7 @@ def generate_assignment_instruction_code(kernel, insn, ast_builder, vinfo): del lhs - result = ast_builder.emit_assignment(kernel, insn) + result = ast_builder.emit_assignment(kernel, insn, hw_inames_expr) # {{{ tracing diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 97a8efb2a..e229ce358 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -20,10 +20,11 @@ THE SOFTWARE. """ +import pymbolic.primitives as prim from loopy.codegen import VectorizationInfo from loopy.schedule.tree import CombineMapper from dataclasses import dataclass -from typing import Optional, Any, List, Union +from typing import Optional, Any, List, Union, Mapping from pytools import ImmutableRecord @@ -169,17 +170,23 @@ class CodeGenerationContext: target AST. """ in_device: bool + hw_iname_exprs: Mapping[str, prim.Expression] vectorization_info: Optional[VectorizationInfo] = None - def copy(self, *, in_device=None, vectorization_info=None): + def copy(self, *, in_device=None, hw_iname_exprs=None, + vectorization_info=None): if in_device is None: in_device = self.in_device + if hw_iname_exprs is None: + hw_iname_exprs = self.hw_iname_exprs + if vectorization_info is None: vectorization_info = self.vectorization_info return CodeGenerationContext( in_device=in_device, + hw_iname_exprs=hw_iname_exprs, vectorization_info=vectorization_info) @@ -234,7 +241,8 @@ def _is_a_list_of_ast_nodes(astb, ast): def map_schedule(self, expr): from loopy.kernel.data import AddressSpace - children_res = self.combine([self.rec(child, CodeGenerationContext(False)) + children_res = self.combine([self.rec(child, + CodeGenerationContext(False, {})) for child in expr.children]) for tv in self.kernel.temporary_variables.items(): @@ -313,7 +321,42 @@ def map_function(self, expr, context): # {{{ Device side: Define the kernel - dwnstrm_ctx = context.copy(in_device=True) + # {{{ record the hw_iname_exprs for downstream elements + + from functools 
import reduce + from loopy.kernel.data import (HardwareConcurrentTag, GroupIndexTag, + LocalIndexTag) + from loopy.isl_helpers import static_min_of_pw_aff + from loopy.symbolic import (GroupHardwareAxisIndex, + LocalHardwareAxisIndex, + pw_aff_to_expr) + + all_inames = reduce(frozenset.union, + (self.kernel.id_to_insn[id].within_inames + for id in InstructionGatherer()(expr)), + frozenset()) + + def _hw_iname_expr(iname): + tag, = self.kernel.iname_tags_of_type(iname, HardwareConcurrentTag) + assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) + lbound = static_min_of_pw_aff(self + .kernel.get_iname_bounds(iname) + .lower_bound_pw_aff, + constants_only=False) + + return pw_aff_to_expr(lbound) + (GroupHardwareAxisIndex(tag.axis) + if isinstance(tag, GroupIndexTag) + else + LocalHardwareAxisIndex(tag.axis)) + + hw_iname_exprs = {iname: _hw_iname_expr(iname) + for iname in all_inames + if self.kernel.iname_tags_of_type(iname, + HardwareConcurrentTag)} + + # }}} + + dwnstrm_ctx = context.copy(in_device=True, hw_iname_exprs=hw_iname_exprs) dev_fn_decl = (self .device_ast_builder @@ -373,7 +416,8 @@ def map_for(self, expr, context): self.kernel.index_dtype, expr.lower_bound, expr.upper_bound, - loop_body + loop_body, + context.hw_iname_exprs ) if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, @@ -395,8 +439,8 @@ def map_if(self, expr, context): if context.in_device else children_res.host_ast) - if_ast = ast_builder.emit_if(expr.condition, - if_body) + if_ast = ast_builder.emit_if(self.kernel, expr.condition, if_body, + context.hw_iname_exprs) if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, @@ -427,6 +471,8 @@ def map_run_instruction(self, expr, context): elif isinstance(insn, Assignment): insn_ast = generate_assignment_instruction_code(self.kernel, insn, ast_builder, + (context + .hw_iname_exprs), (context .vectorization_info)) elif isinstance(insn, CInstruction): diff --git 
a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 1c6b59b0b..365746a33 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2064,4 +2064,23 @@ def get_outer_params(domains): # }}} +def get_all_hw_inames(kernel): + """ + Returns :class:`frozenset` of all iname traversing across the target + hardware's execution grid. + """ + from loopy.kernel.data import (filter_iname_tags_by_type, HardwareConcurrentTag) + return frozenset(iname.name + for iname in kernel.inames.values() + if filter_iname_tags_by_type(iname.tags, HardwareConcurrentTag)) + + +@memoize_on_first_arg +def has_complex_dtyped_var(kernel): + """ + Returns *True* if any variable in *kernel* is complex dtyped. + """ + return any(var.dtype.involves_complex() + for var in kernel.args + list(kernel.temporary_variables.values())) + # vim: foldmethod=marker diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index a5d222e48..a4f881885 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -7,6 +7,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from loopy.kernel import KernelState +from loopy.kernel.tools import get_all_hw_inames # {{{ LoopKernel.schedule a tree @@ -148,6 +149,10 @@ def make_current_node(self, node): self._build_stack.append(node) def make_and_enter_function(self, name, extra_args, extra_inames): + if isinstance(self.current_node, InstructionBlock): + # end of instruction block + self._build_stack.pop() + assert isinstance(self.current_node, Schedule) new_function = Function(name, extra_args, extra_inames, []) self.current_node.children.append(new_function) @@ -160,6 +165,10 @@ def make_and_enter_instruction_block(self): self.make_current_node(new_block) def make_and_enter_loop(self, iname): + if isinstance(self.current_node, InstructionBlock): + # end of instruction block + self._build_stack.pop() + assert isinstance(self.current_node, (Schedule, Function, Loop)) new_loop = Loop(iname, []) self.current_node.children.append(new_loop) 
@@ -180,6 +189,7 @@ def add_barrier(self, comment, kind, insn_id): def exit_function(self): if isinstance(self.current_node, InstructionBlock): self._build_stack.pop() + assert isinstance(self.current_node, Function) return self._build_stack.pop() @@ -354,19 +364,78 @@ def _align_and_intersect(d1, d2): return d1 & d2 +def _align_and_gist(d1, d2): + d1, d2 = isl.align_two(d1, d2) + return d1.gist(d2) + + def _wrap_in_if(cond, nodes): from loopy.symbolic import set_to_cond_expr if cond.is_universe(): return nodes else: - return [If(set_to_cond_expr(cond), nodes)] + return If(set_to_cond_expr(cond), nodes) + + +def _implement_hw_axes_in_domains(implemented_domain, domain, + kernel, gsize, lsize): + """ + If *domain* contains any inames going along hardware inames account for + those in *implemented_domain*. + + :arg gsize: A tuple of :class:`isl.PwAff` denoting the size of the + + :returns: An instance of :class:`isl.BasicSet` that includes constraints + from *implemented_domain* and constraints arising from constraining + hardware inames in *domain* to their corresponding + """ + from loopy.kernel.data import HardwareConcurrentTag, GroupIndexTag, LocalIndexTag + from loopy.isl_helpers import make_slab, static_min_of_pw_aff + + all_hw_inames = get_all_hw_inames(kernel) + + for dim_name in domain.get_var_dict(): + if dim_name in all_hw_inames: + if dim_name in implemented_domain.get_var_dict(): + # this hardware dim is already implemented => ignore + continue + + tag, = kernel.iname_tags_of_type(dim_name, HardwareConcurrentTag) + assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) + + lbound = static_min_of_pw_aff(kernel + .get_iname_bounds(dim_name) + .lower_bound_pw_aff, constants_only=False) + size = (gsize[tag.axis] + + if isinstance(tag, GroupIndexTag) + + else + + lsize[tag.axis]) + + if not isinstance(size, int): + lbound, size = isl.align_two(lbound, size) + + implemented_domain = (implemented_domain + .add_dims(dim_type.param, 1) + 
.set_dim_name(dim_type.param, + implemented_domain + .dim(dim_type.param), + dim_name)) + + implemented_domain = (implemented_domain + & make_slab(implemented_domain.space, dim_name, + lbound, lbound + size)) + + return implemented_domain @dataclass(frozen=True) class PredicateInsertionContext: implemented_domain: isl.BasicSet - gsize: Optional[Tuple[isl.PwAff, ...]] = None - lsize: Optional[Tuple[isl.PwAff, ...]] = None + gsize: Optional[Tuple[prim.Expression, ...]] = None + lsize: Optional[Tuple[prim.Expression, ...]] = None def copy(self, *, implemented_domain=None, gsize=None, lsize=None): if implemented_domain is None: @@ -386,16 +455,13 @@ def __init__(self, kernel): self.kernel = kernel def map_schedule(self, expr): - universe = isl.BasicSet.universe(isl.Space.create_from_names(self.kernel - .isl_context, - [])) - - return super().map_schedule(expr, PredicateInsertionContext(universe)) + impl_domain = self.kernel.assumptions + return super().map_schedule(expr, PredicateInsertionContext(impl_domain)) def map_function(self, expr, context): # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. - gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( + gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids_as_exprs( InstructionGatherer()(expr)) return super().map_function(expr, context.copy(gsize=gsize, lsize=lsize)) @@ -406,17 +472,53 @@ def map_barrier(self, expr, context): return expr def map_instruction_block(self, expr, context): + from loopy.symbolic import set_to_cond_expr + if all(isinstance(child, RunInstruction) for child in expr.children): - # need to add a predicate for the hardware axes usage. 
assert len({self.kernel.id_to_insn[child.insn_id].within_inames for child in expr.children}) == 1 - inames = self.kernel.id_to_insn[expr.children[0].insn_id].within_inames - hw_inames = inames - set(context.implemented_domain.get_var_dict()) + assert len({self.kernel.id_to_insn[child.insn_id].predicates + for child in expr.children}) == 1 + + inames, = {self.kernel.id_to_insn[child.insn_id].within_inames + for child in expr.children} + predicates, = {self.kernel.id_to_insn[child.insn_id].predicates + for child in expr.children} + + # {{{ compute the predicates due to the hardware inames + + hw_inames = inames & get_all_hw_inames(self.kernel) + if hw_inames: - raise NotImplementedError - return InstructionBlock([self.rec(child, context) - for child in expr.children]) + impl_domain = context.implemented_domain + domain = (self.kernel.get_inames_domain(hw_inames) + .project_out_except(types=[dim_type.set], + names=hw_inames)) + impl_domain = _implement_hw_axes_in_domains(impl_domain, + domain, + self.kernel, + context.gsize, + context.lsize) + domain = (domain + .move_dims(dim_type.param, domain.dim(dim_type.param), + dim_type.set, 0, domain.dim(dim_type.set))) + unimplemented_domain = (isl.align_spaces(domain, impl_domain) + .gist(impl_domain)) + + if not unimplemented_domain.is_universe(): + predicates |= {set_to_cond_expr(unimplemented_domain)} + + # }}} + + new_insn_block = InstructionBlock([self.rec(child, context) + for child in expr.children]) + + if predicates: + from pymbolic.primitives import LogicalAnd + return If(LogicalAnd(tuple(predicates)), [new_insn_block]) + else: + return new_insn_block else: assert all(isinstance(child, Barrier) for child in expr.childre) return InstructionBlock([self.rec(child, context) @@ -424,13 +526,20 @@ def map_instruction_block(self, expr, context): def map_loop(self, expr, context): from loopy.symbolic import pw_aff_to_expr + from loopy.isl_helpers import make_slab implemented_domain = context.implemented_domain assert 
implemented_domain.dim(dim_type.set) == 0 domain = self.kernel.get_inames_domain(expr.iname) - # {{{ make already implemented loops as parallel; project out inner loops + implemented_domain = _implement_hw_axes_in_domains(implemented_domain, + domain, + self.kernel, + context.gsize, + context.lsize) + + # {{{ make already implemented loops as parameters; project out inner loops for set_dim in domain.get_var_names(dim_type.set): dt, pos = domain.get_var_dict()[set_dim] @@ -450,9 +559,10 @@ def map_loop(self, expr, context): assert domain.dim(dim_type.set) == 1 - domain, implemented_domain = isl.align_two(domain, - implemented_domain) - domain = domain.gist(implemented_domain) + domain = _align_and_gist(domain, implemented_domain) + + lower_bound = domain.dim_min(0) + upper_bound = domain.dim_max(0) downstream_domain = _align_and_intersect(domain .move_dims(dim_type.param, @@ -460,13 +570,11 @@ def map_loop(self, expr, context): .param), dim_type.set, 0, 1), - implemented_domain) - - outer_condition = isl.align_spaces(domain.project_out(dim_type.set, 0, 1), - downstream_domain).gist(downstream_domain) - - lower_bound = domain.dim_min(0) - upper_bound = domain.dim_max(0) + implemented_domain + ) + set_implemented_in_loop = make_slab(domain.space, expr.iname, + lower_bound, upper_bound+1) + outer_condition = domain.gist(set_implemented_in_loop) inner_condition = domain.affine_hull() step = 1 # TODO: from inner_condition try to guess the step @@ -475,13 +583,18 @@ def map_loop(self, expr, context): .copy(implemented_domain=downstream_domain))) for child in expr.children] - return _wrap_in_if(outer_condition, - For(iname=expr.iname, - lower_bound=pw_aff_to_expr(lower_bound), - upper_bound=pw_aff_to_expr(upper_bound), - step=step, - children=_wrap_in_if(inner_condition, - children))) + for_ = For(iname=expr.iname, + lower_bound=pw_aff_to_expr(lower_bound), + upper_bound=pw_aff_to_expr(upper_bound), + step=step, + children=_wrap_in_if(inner_condition, + children)) + + if 
outer_condition.is_universe(): + return for_ + else: + from loopy.symbolic import set_to_cond_expr + return If(set_to_cond_expr(outer_condition), [for_]) class InstructionGatherer(CombineMapper): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index bc9c476f5..30715dba8 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -221,7 +221,7 @@ def ast_for_class(self): def ast_if_class(self): raise NotImplementedError() - def get_expression_to_code_mapper(self, codegen_state): + def get_expression_to_code_mapper(self, kernel, var_subst_map): raise NotImplementedError() def add_vector_access(self, access_expr, index): @@ -307,7 +307,7 @@ def get_function_declaration(self, kernel, name, implemented_data_info, def get_temporary_decls(self, codegen_state, schedule_index): return [] - def get_expression_to_code_mapper(self, codegen_state): + def get_expression_to_code_mapper(self, kernel, var_subst_map): return _DummyExpressionToCodeMapper() def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 78c4acf85..5027cdb5a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -938,14 +938,17 @@ def ast_module(self): import cgen return cgen - def get_expression_to_code_mapper(self, kernel, callables_table): + def get_expression_to_code_mapper(self, kernel, callables_table, var_subst_map): return self.get_expression_to_c_expression_mapper(kernel, - callables_table) + callables_table, + var_subst_map) - def get_expression_to_c_expression_mapper(self, kernel, callables_table): + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map): from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper return ExpressionToCExpressionMapper( - kernel, callables_table, self, fortran_abi=self.target.fortran_abi) + kernel, callables_table, self, var_subst_map, + fortran_abi=self.target.fortran_abi) 
def get_c_expression_to_code_mapper(self): from loopy.target.c.codegen.expression import CExpressionToCodeMapper @@ -1022,9 +1025,9 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): return arg_decl - def emit_assignment(self, kernel, insn): + def emit_assignment(self, kernel, insn, var_subst_map): - ecm = self.get_expression_to_code_mapper(kernel) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) assignee_var_name, = insn.assignee_var_names() @@ -1130,8 +1133,8 @@ def emit_multiple_assignment(self, codegen_state, insn): in_knl_callable_as_call)) def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, - inner): - ecm = self.get_expression_to_code_mapper(kernel) + inner, var_subst_map): + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) from pymbolic import var from pymbolic.primitives import Comparison @@ -1173,9 +1176,10 @@ def emit_comment(self, s): def can_implement_conditionals(self): return True - def emit_if(self, condition_str, ast): + def emit_if(self, kernel, condition, ast, var_subst_map): from cgen import If - return If(condition_str, ast) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + return If(ecm(condition), ast) # }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index f2b59349f..bd29e0560 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -56,13 +56,19 @@ class ExpressionToCExpressionMapper(IdentityMapper): """ Mapper that converts a loopy-semantic expression to a C-semantic expression with typecasts, appropriate arithmetic semantic mapping, etc. + + .. attribute:: var_subst_map + + A mapping from variable name to expression it is to be substituted + with. A caller might set this to map iname to hardware iname expressions. 
""" - def __init__(self, kernel, callables_table, ast_builder, + def __init__(self, kernel, callables_table, ast_builder, var_subst_map, vectorization_info=None, fortran_abi=False, type_inf_mapper=None): self.kernel = kernel self.callables_table = callables_table self.ast_builder = ast_builder + self.var_subst_map = var_subst_map if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, @@ -80,7 +86,8 @@ def __init__(self, kernel, callables_table, ast_builder, def with_assignments(self, names_to_vars): type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars) - return type(self)(self.kernel, self.vectorization_info, + return type(self)(self.kernel, self.callables_table, self.ast_builder, + self.var_subst_map, self.vectorization_info, self.fortran_abi, type_inf_mapper) def infer_type(self, expr): @@ -142,7 +149,17 @@ def map_variable(self, expr, type_context): def postproc(x): return x - if expr.name in self.kernel.arg_dict: + if expr.name in self.var_subst_map: + if self.kernel.options.annotate_inames: + return var( + "/* {} */ {}".format( + expr.name, + self.rec(self.var_subst_map[expr.name], + type_context))) + else: + return self.rec(self.var_subst_map[expr.name], + type_context) + elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] from loopy.kernel.array import ArrayBase if isinstance(arg, ArrayBase): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 13102b654..53df744b2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -364,8 +364,8 @@ def preamble_generators(self): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToCudaCExpressionMapper(codegen_state) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + return ExpressionToCudaCExpressionMapper(kernel, self, var_subst_map) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index e46a10150..83b0936b6 100644 --- 
a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -254,7 +254,7 @@ def get_function_declaration(self, name, kernel, implemented_data_info, # }}} def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) from loopy.schedule.tree import get_insns_in_function from pymbolic.mapper.stringifier import PREC_NONE @@ -285,8 +285,8 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExprToISPCExprMapper(codegen_state, self) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + return ExprToISPCExprMapper(kernel, self, var_subst_map) def add_vector_access(self, access_expr, index): return access_expr[index] @@ -489,13 +489,13 @@ def emit_assignment(self, codegen_state, insn): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, kernel, iname, iname_dtype, - lbound, ubound, inner): + lbound, ubound, inner, var_subst_map): from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE from cgen import For, InlineInitializer from cgen.ispc import ISPCUniform - ecm = self.get_expression_to_code_mapper(kernel) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) return For( InlineInitializer( diff --git a/loopy/target/numba.py b/loopy/target/numba.py index bdca2739c..f32fad066 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -63,12 +63,11 @@ def get_python_function_decorators(self): return () def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel) from pymbolic.mapper.stringifier import PREC_NONE from genpy import Statement - ecm = self.get_expression_to_code_mapper(kernel) implemented_data_info = implemented_data_info + ecm = 
self.get_expression_to_code_mapper(kernel, var_subst_map={}) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( @@ -165,8 +164,8 @@ def preamble_generators(self): def get_python_function_decorators(self): return ("@_lpy_ncu.jit",) - def get_expression_to_code_mapper(self, codegen_state): - return NumbaCudaExpressionToPythonMapper(codegen_state) + def get_expression_to_code_mapper(self, kernel, var_subst_map): + return NumbaCudaExpressionToPythonMapper(kernel, self, var_subst_map) class NumbaCudaTarget(TargetBase): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 700d0c5fc..d75d4ec11 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -649,8 +649,8 @@ def generate_top_of_body(self, codegen_state): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToOpenCLCExpressionMapper(codegen_state) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + return ExpressionToOpenCLCExpressionMapper(kernel, self, var_subst_map) def add_vector_access(self, access_expr, index): # The 'int' avoids an 'L' suffix for long ints. 
@@ -893,8 +893,9 @@ def make_subscript(self, array, base_expr, subscript): class VolatileMemOpenCLCASTBuilder(OpenCLCASTBuilder): - def get_expression_to_c_expression_mapper(self, codegen_state): - return VolatileMemExpressionToOpenCLCExpressionMapper(codegen_state) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, + var_subst_map) class VolatileMemOpenCLTarget(OpenCLTarget): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 6d17a1226..795f1af97 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -189,6 +189,11 @@ def complex_type_name(self, dtype): else: raise RuntimeError + @property + def allow_complex(self): + from loopy.kernel.tools import has_complex_dtyped_var + return has_complex_dtyped_var(self.kernel) + def wrap_in_typecast_lazy(self, actual_type_func, needed_dtype, s): if needed_dtype.is_complex(): return self.wrap_in_typecast(actual_type_func(), needed_dtype, s) @@ -767,7 +772,7 @@ def alloc_nbytes(tv): return code_lines def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( @@ -858,9 +863,8 @@ def preamble_generators(self): # }}} - def get_expression_to_c_expression_mapper(self, codegen_state): - return ExpressionToPyOpenCLCExpressionMapper(codegen_state, self) - + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + return ExpressionToPyOpenCLCExpressionMapper(kernel, self, var_subst_map) # }}} @@ -868,10 +872,11 @@ def get_expression_to_c_expression_mapper(self, codegen_state): # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): - def get_expression_to_c_expression_mapper(self, codegen_state): + def 
get_expression_to_c_expression_mapper(self, kernel, var_subst_map): from loopy.target.opencl import \ VolatileMemExpressionToOpenCLCExpressionMapper - return VolatileMemExpressionToOpenCLCExpressionMapper(codegen_state) + return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, + var_subst_map) class VolatileMemPyOpenCLTarget(PyOpenCLTarget): diff --git a/loopy/target/python.py b/loopy/target/python.py index 5068fb3f8..1810aa426 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -35,8 +35,10 @@ # {{{ expression to code class ExpressionToPythonMapper(StringifyMapper): - def __init__(self, kernel, type_inf_mapper=None): + def __init__(self, kernel, ast_builder, var_subst_map, type_inf_mapper=None): self.kernel = kernel + self.ast_builder = ast_builder + self.var_subst_map = var_subst_map if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, @@ -199,8 +201,8 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result - def get_expression_to_code_mapper(self, kernel): - return ExpressionToPythonMapper(kernel) + def get_expression_to_code_mapper(self, kernel, var_subst_map): + return ExpressionToPythonMapper(kernel, self, var_subst_map) @property def ast_base_class(self): @@ -229,8 +231,8 @@ def ast_block_scope_class(self): return Collection def emit_sequential_loop(self, kernel, iname, iname_dtype, - lbound, ubound, inner): - ecm = self.get_expression_to_code_mapper(kernel) + lbound, ubound, inner, var_subst_map): + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For @@ -260,9 +262,10 @@ def emit_comment(self, s): def can_implement_conditionals(self): return True - def emit_if(self, condition_str, ast): + def emit_if(self, kernel, condition, ast, var_subst_map): from genpy import If - return If(condition_str, ast) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + return If(ecm(condition), ast) def 
emit_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper From 6ce94e25375e72a18f07406f78c2407680c79dec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 10:46:17 -0500 Subject: [PATCH 031/109] homogenizes instruction before performing predicate insertion --- loopy/schedule/tree.py | 203 +++++++++++++++++++++++++++++------------ 1 file changed, 145 insertions(+), 58 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index a4f881885..8ab2b529c 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -57,10 +57,8 @@ class InstructionBlock(ScheduleNode): block cannot contain other blocks or loops. .. attribute:: children - - A list of instruction ids contained in the block. """ - children: List[Union[Barrier, RunInstruction]] + children: List[Union[RunInstruction]] mapper_method: str = field(default="map_instruction_block", repr=False, init=False) @@ -72,10 +70,16 @@ class Loop(ScheduleNode): A loop with the induction variable *iname*. """ iname: str - children: List[Union[InstructionBlock, "Loop"]] + children: List[Union[InstructionBlock, "Loop", Barrier]] mapper_method: str = field(default="map_loop", repr=False, init=False) + def with_children(self, children): + """ + Returns a copy of *self* with *children* as its new children. + """ + return Loop(self.iname, children) + @dataclass class Function(ScheduleNode): @@ -93,10 +97,16 @@ class Function(ScheduleNode): name: str extra_args: List[Any] extra_inames: List[str] - children: List[Union[InstructionBlock, Loop]] + children: List[Union[InstructionBlock, Loop, Barrier]] mapper_method: str = field(default="map_function", repr=False, init=False) + def with_children(self, children): + """ + Returns a copy of *self* with *children* as its new children. 
+ """ + return Function(self.name, self.extra_args, self.extra_inames, children) + @dataclass class For(Loop): @@ -104,7 +114,7 @@ class For(Loop): lower_bound: Union[int, prim.Expression] upper_bound: Union[int, prim.Expression] step: int - children: List[Union[InstructionBlock, Loop, "If"]] + children: List[Union[InstructionBlock, Loop, "If", Barrier]] mapper_method: str = field(default="map_for", repr=False, init=False) @@ -122,10 +132,16 @@ class Schedule(ScheduleNode): """ Top-level schedule description. """ - children: List[Union[Loop, InstructionBlock, Function]] + children: List[Union[Loop, InstructionBlock, Function, Barrier]] mapper_method: str = field(default="map_schedule", repr=False, init=False) + def with_children(self, children): + """ + Returns a copy of *self* with *children* as its new children. + """ + return Schedule(children) + @dataclass class ScheduleTreeBuilder: @@ -181,9 +197,10 @@ def add_run_instruction(self, insn_id): self.current_node.children.append(RunInstruction(insn_id)) def add_barrier(self, comment, kind, insn_id): - if not isinstance(self.current_node, InstructionBlock): - self.make_instruction_block() + if isinstance(self.current_node, InstructionBlock): + self._build_stack.pop() + assert isinstance(self.current_node, (Schedule, Function, Loop)) self.current_node.children.append(Barrier(comment, kind, insn_id)) def exit_function(self): @@ -232,7 +249,8 @@ def make_schedule_tree(kernel): else: raise NotImplementedError(type(sched_item)) - return kernel.copy(schedule=bob.exit()) + kernel = kernel.copy(schedule=bob.exit()) + return kernel # }}} @@ -474,55 +492,50 @@ def map_barrier(self, expr, context): def map_instruction_block(self, expr, context): from loopy.symbolic import set_to_cond_expr - if all(isinstance(child, RunInstruction) for child in expr.children): - assert len({self.kernel.id_to_insn[child.insn_id].within_inames - for child in expr.children}) == 1 - assert len({self.kernel.id_to_insn[child.insn_id].predicates - 
for child in expr.children}) == 1 - - inames, = {self.kernel.id_to_insn[child.insn_id].within_inames - for child in expr.children} - predicates, = {self.kernel.id_to_insn[child.insn_id].predicates - for child in expr.children} - - # {{{ compute the predicates due to the hardware inames - - hw_inames = inames & get_all_hw_inames(self.kernel) - - if hw_inames: - - impl_domain = context.implemented_domain - domain = (self.kernel.get_inames_domain(hw_inames) - .project_out_except(types=[dim_type.set], - names=hw_inames)) - impl_domain = _implement_hw_axes_in_domains(impl_domain, - domain, - self.kernel, - context.gsize, - context.lsize) - domain = (domain - .move_dims(dim_type.param, domain.dim(dim_type.param), - dim_type.set, 0, domain.dim(dim_type.set))) - unimplemented_domain = (isl.align_spaces(domain, impl_domain) - .gist(impl_domain)) - - if not unimplemented_domain.is_universe(): - predicates |= {set_to_cond_expr(unimplemented_domain)} - - # }}} - - new_insn_block = InstructionBlock([self.rec(child, context) - for child in expr.children]) - - if predicates: - from pymbolic.primitives import LogicalAnd - return If(LogicalAnd(tuple(predicates)), [new_insn_block]) - else: - return new_insn_block + assert len({self.kernel.id_to_insn[child.insn_id].within_inames + for child in expr.children}) == 1 + assert len({self.kernel.id_to_insn[child.insn_id].predicates + for child in expr.children}) == 1 + + inames, = {self.kernel.id_to_insn[child.insn_id].within_inames + for child in expr.children} + predicates, = {self.kernel.id_to_insn[child.insn_id].predicates + for child in expr.children} + + # {{{ compute the predicates due to the hardware inames + + hw_inames = inames & get_all_hw_inames(self.kernel) + + if hw_inames: + + impl_domain = context.implemented_domain + domain = (self.kernel.get_inames_domain(hw_inames) + .project_out_except(types=[dim_type.set], + names=hw_inames)) + impl_domain = _implement_hw_axes_in_domains(impl_domain, + domain, + self.kernel, + 
context.gsize, + context.lsize) + domain = (domain + .move_dims(dim_type.param, domain.dim(dim_type.param), + dim_type.set, 0, domain.dim(dim_type.set))) + unimplemented_domain = (isl.align_spaces(domain, impl_domain) + .gist(impl_domain)) + + if not unimplemented_domain.is_universe(): + predicates |= {set_to_cond_expr(unimplemented_domain)} + + # }}} + + new_insn_block = InstructionBlock([self.rec(child, context) + for child in expr.children]) + + if predicates: + from pymbolic.primitives import LogicalAnd + return If(LogicalAnd(tuple(predicates)), [new_insn_block]) else: - assert all(isinstance(child, Barrier) for child in expr.childre) - return InstructionBlock([self.rec(child, context) - for child in expr.children]) + return new_insn_block def map_loop(self, expr, context): from loopy.symbolic import pw_aff_to_expr @@ -615,7 +628,81 @@ def map_barrier(self, expr): return frozenset() +class InstructionBlockHomogenizer(IdentityMapper): + """ + Splits instruction blocks into multiple instruction blocks into multiple + instruction blocks so that all the instruction blocks + """ + def __init__(self, kernel): + self.kernel = kernel + + def _are_insns_similar(self, insn1_id, insn2_id): + insn1 = self.kernel.id_to_insn[insn1_id] + insn2 = self.kernel.id_to_insn[insn2_id] + return (insn1.within_inames == insn2.within_inames + and insn1.predicates == insn2.predicates) + + def _map_insn_block_parent(self, expr): + new_children = [] + + for child in expr.children: + if isinstance(child, InstructionBlock): + # {{{ add instruction blocks containing similar run instructions + similar_run_insns = [] + + for run_insn in child.children: + if similar_run_insns: + if self._are_insns_similar(similar_run_insns[-1].insn_id, + run_insn.insn_id): + similar_run_insns.append(run_insn) + else: + new_children.append(InstructionBlock(similar_run_insns)) + similar_run_insns = [run_insn] + else: + similar_run_insns.append(run_insn) + + new_children.append(InstructionBlock(similar_run_insns)) 
+ + # }}} + else: + new_children.append(self.rec(child)) + + return expr.with_children(new_children) + + map_loop = _map_insn_block_parent + map_function = _map_insn_block_parent + map_schedule = _map_insn_block_parent + map_for = _map_insn_block_parent + map_if = _map_insn_block_parent + + +def homogenize_instruction_blocks(kernel): + """ + Returns a copy of *kernel* by splitting each instruction blocks in the + kernel's schedule into multiple instruction blocks so that all the + instructions in the updated instruction blocks have same predicates and + within_inames. + + .. note:: + + This might be a helpful transformation if the caller intends to operate + on instruction blocks with the assumption that all instructions in an + instruction block are homogenous under some criterion. + """ + # TODO: Could be generalized by taking the homogenization criterion as an + # argument. + + new_schedule = InstructionBlockHomogenizer(kernel)(kernel.schedule) + return kernel.copy(schedule=new_schedule) + + def insert_predicates_into_schedule(kernel): + # {{{ preprocessing before beginning the predicate insertion. + + kernel = homogenize_instruction_blocks(kernel) + + # }}} + assert kernel.state >= KernelState.LINEARIZED assert isinstance(kernel.schedule, Schedule) new_schedule = PredicateInsertionMapper(kernel)(kernel.schedule) From 8f9b32b9e0a81a30d221ccf16c6f12be84694f8d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 15:17:56 -0500 Subject: [PATCH 032/109] hw_iname_exprs -> iname_exprs - Something like an unroll iname might also be set with an expression that would be needed to be handled by the expression to code mapper. --- loopy/codegen/result.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index e229ce358..108fb1508 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -170,23 +170,23 @@ class CodeGenerationContext: target AST. 
""" in_device: bool - hw_iname_exprs: Mapping[str, prim.Expression] + iname_exprs: Mapping[str, prim.Expression] vectorization_info: Optional[VectorizationInfo] = None - def copy(self, *, in_device=None, hw_iname_exprs=None, + def copy(self, *, in_device=None, iname_exprs=None, vectorization_info=None): if in_device is None: in_device = self.in_device - if hw_iname_exprs is None: - hw_iname_exprs = self.hw_iname_exprs + if iname_exprs is None: + iname_exprs = self.iname_exprs if vectorization_info is None: vectorization_info = self.vectorization_info return CodeGenerationContext( in_device=in_device, - hw_iname_exprs=hw_iname_exprs, + iname_exprs=iname_exprs, vectorization_info=vectorization_info) @@ -321,7 +321,7 @@ def map_function(self, expr, context): # {{{ Device side: Define the kernel - # {{{ record the hw_iname_exprs for downstream elements + # {{{ record the iname_exprs for downstream elements from functools import reduce from loopy.kernel.data import (HardwareConcurrentTag, GroupIndexTag, @@ -349,14 +349,14 @@ def _hw_iname_expr(iname): else LocalHardwareAxisIndex(tag.axis)) - hw_iname_exprs = {iname: _hw_iname_expr(iname) + iname_exprs = {iname: _hw_iname_expr(iname) for iname in all_inames if self.kernel.iname_tags_of_type(iname, HardwareConcurrentTag)} # }}} - dwnstrm_ctx = context.copy(in_device=True, hw_iname_exprs=hw_iname_exprs) + dwnstrm_ctx = context.copy(in_device=True, iname_exprs=iname_exprs) dev_fn_decl = (self .device_ast_builder @@ -417,7 +417,7 @@ def map_for(self, expr, context): expr.lower_bound, expr.upper_bound, loop_body, - context.hw_iname_exprs + context.iname_exprs ) if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, @@ -440,7 +440,7 @@ def map_if(self, expr, context): else children_res.host_ast) if_ast = ast_builder.emit_if(self.kernel, expr.condition, if_body, - context.hw_iname_exprs) + context.iname_exprs) if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, @@ 
-472,7 +472,7 @@ def map_run_instruction(self, expr, context): insn_ast = generate_assignment_instruction_code(self.kernel, insn, ast_builder, (context - .hw_iname_exprs), + .iname_exprs), (context .vectorization_info)) elif isinstance(insn, CInstruction): From f4d992b308b2673f558946ffb3afce88621b510c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 23 May 2021 18:35:57 -0500 Subject: [PATCH 033/109] introduces a GroupedChildren as an intermediary for IdentityMapper - GroupedChildren are helpful if the IdentityMapper wishes to return multiple nodes instead of one. (Loop fission for example is one such user) --- loopy/schedule/tree.py | 96 ++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 8ab2b529c..59cd01420 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -255,6 +255,11 @@ def make_schedule_tree(kernel): # }}} +@dataclass +class GroupedChildren: + contents: List[ScheduleNode] + + class Mapper: def __call__(self, expr, *args, **kwargs): try: @@ -269,25 +274,36 @@ def __call__(self, expr, *args, **kwargs): class IdentityMapper(Mapper): + def combine(self, values): + result = [] + for val in values: + if isinstance(val, GroupedChildren): + result.extend(val.contents) + else: + assert isinstance(val, ScheduleNode) + result.append(val) + + return result + def map_schedule(self, expr, *args, **kwargs): - return Schedule([self.rec(child, *args, **kwargs) - for child in expr.children]) + return Schedule(self.combine([self.rec(child, *args, **kwargs) + for child in expr.children])) def map_instruction_block(self, expr, *args, **kwargs): - return InstructionBlock([self.rec(child, *args, **kwargs) - for child in expr.children]) + return InstructionBlock(self.combine([self.rec(child, *args, **kwargs) + for child in expr.children])) def map_function(self, expr, *args, **kwargs): return Function(expr.name, expr.extra_args, expr.extra_inames, 
- [self.rec(child, *args, **kwargs) - for child in expr.children]) + self.combine([self.rec(child, *args, **kwargs) + for child in expr.children])) def map_loop(self, expr, *args, **kwargs): return Loop(expr.iname, - [self.rec(child, *args, **kwargs) - for child in expr.children]) + self.combine([self.rec(child, *args, **kwargs) + for child in expr.children])) def map_barrier(self, expr, *args, **kwargs): return Barrier(expr.comment, expr.synchronization_kind, @@ -317,6 +333,10 @@ def map_loop(self, expr, *args, **kwargs): return self.combine([self.rec(child, *args, **kwargs) for child in expr.children]) + def map_polyhedral_loop(self, expr, *args, **kwargs): + return self.combine([self.rec(child, *args, **kwargs) + for child in expr.children]) + def map_for(self, expr, *args, **kwargs): return self.combine([self.rec(child, *args, **kwargs) for child in expr.children]) @@ -364,6 +384,11 @@ def map_loop(self, expr, level=0): super().map_loop(expr, level+1), f"{self._indent(level)}end {expr.iname}"]) + def map_polyhedral_loop(self, expr, level=0): + return self.combine([f"{self._indent(level)}PolyhedralFor({expr.domain})", + super().map_polyhedral_loop(expr, level+1), + f"{self._indent(level)}end {expr.iname}"]) + def map_for(self, expr, level=0): return self.combine([f"{self._indent(level)}For({expr.iname}, " f"{expr.lower_bound}, {expr.upper_bound}, " @@ -392,7 +417,7 @@ def _wrap_in_if(cond, nodes): if cond.is_universe(): return nodes else: - return If(set_to_cond_expr(cond), nodes) + return [If(set_to_cond_expr(cond), nodes)] def _implement_hw_axes_in_domains(implemented_domain, domain, @@ -642,38 +667,25 @@ def _are_insns_similar(self, insn1_id, insn2_id): return (insn1.within_inames == insn2.within_inames and insn1.predicates == insn2.predicates) - def _map_insn_block_parent(self, expr): - new_children = [] - - for child in expr.children: - if isinstance(child, InstructionBlock): - # {{{ add instruction blocks containing similar run instructions - 
similar_run_insns = [] - - for run_insn in child.children: - if similar_run_insns: - if self._are_insns_similar(similar_run_insns[-1].insn_id, - run_insn.insn_id): - similar_run_insns.append(run_insn) - else: - new_children.append(InstructionBlock(similar_run_insns)) - similar_run_insns = [run_insn] - else: - similar_run_insns.append(run_insn) + def map_instruction_block(self, expr): + insn_blocks = [] - new_children.append(InstructionBlock(similar_run_insns)) + similar_run_insns = [] - # }}} + for run_insn in expr.children: + if similar_run_insns: + if self._are_insns_similar(similar_run_insns[-1].insn_id, + run_insn.insn_id): + similar_run_insns.append(run_insn) + else: + insn_blocks.append(InstructionBlock(similar_run_insns)) + similar_run_insns = [run_insn] else: - new_children.append(self.rec(child)) + similar_run_insns.append(run_insn) - return expr.with_children(new_children) + insn_blocks.append(InstructionBlock(similar_run_insns)) - map_loop = _map_insn_block_parent - map_function = _map_insn_block_parent - map_schedule = _map_insn_block_parent - map_for = _map_insn_block_parent - map_if = _map_insn_block_parent + return GroupedChildren(insn_blocks) def homogenize_instruction_blocks(kernel): @@ -697,16 +709,20 @@ def homogenize_instruction_blocks(kernel): def insert_predicates_into_schedule(kernel): + assert kernel.state >= KernelState.LINEARIZED + assert isinstance(kernel.schedule, Schedule) + # {{{ preprocessing before beginning the predicate insertion. 
kernel = homogenize_instruction_blocks(kernel) # }}} - assert kernel.state >= KernelState.LINEARIZED - assert isinstance(kernel.schedule, Schedule) - new_schedule = PredicateInsertionMapper(kernel)(kernel.schedule) - return kernel.copy(schedule=new_schedule) + schedule = PolyhedronLoopifier(kernel)(kernel.schedule) + schedule = Unroller(kernel)(schedule) + schedule = PredicateInsertionMapper(kernel)(schedule) + + return kernel.copy(schedule=schedule) def get_insns_in_function(kernel, name): From 8dff587bba1391020e6556cf3f4b95000d013ed4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 23 May 2021 18:38:33 -0500 Subject: [PATCH 034/109] introduces mappers to handle unr inames --- loopy/codegen/result.py | 55 +++++----- loopy/schedule/tree.py | 220 +++++++++++++++++++++++++++------------- 2 files changed, 181 insertions(+), 94 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 108fb1508..b4727c63f 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -350,9 +350,9 @@ def _hw_iname_expr(iname): LocalHardwareAxisIndex(tag.axis)) iname_exprs = {iname: _hw_iname_expr(iname) - for iname in all_inames - if self.kernel.iname_tags_of_type(iname, - HardwareConcurrentTag)} + for iname in all_inames + if self.kernel.iname_tags_of_type(iname, + HardwareConcurrentTag)} # }}} @@ -394,36 +394,45 @@ def map_for(self, expr, context): InOrderSequentialSequentialTag) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 - if self.kernel.iname_tags_of_type(expr.iname, unr_tags): - dwnstrm_ctx = 1/0 - raise NotImplementedError - elif self.kernel.iname_tags_of_type(expr.iname, vec_tags): - dwnstrm_ctx = 1/0 + if self.kernel.iname_tags_of_type(expr.iname, vec_tags): raise NotImplementedError else: assert (len(self.kernel.inames[expr.iname].tags) == 0 or self.kernel.iname_tags_of_type(expr.iname, - seq_tags)) + seq_tags+unr_tags)) + assert expr.step == 1 + + if expr.upper_bound != 
expr.lower_bound: + dwnstrm_ctx = context.copy(vectorization_info=None) + else: + # special case: if ubound == lbound => unroll + new_iname_exprs = context.iname_exprs.copy() + new_iname_exprs[expr.iname] = expr.upper_bound + dwnstrm_ctx = context.copy(vectorization_info=None, + iname_exprs=new_iname_exprs) - dwnstrm_ctx = context.copy(vectorization_info=None) children_res = self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) - loop_body = ast_builder.ast_block_class(children_res.device_ast - if context.in_device - else children_res.host_ast) - assert expr.step == 1 - loop_ast = ast_builder.emit_sequential_loop(self.kernel, expr.iname, - self.kernel.index_dtype, - expr.lower_bound, - expr.upper_bound, - loop_body, - context.iname_exprs - ) + body_ast = (children_res.device_ast + if context.in_device + else children_res.host_ast) + + if expr.upper_bound != expr.lower_bound: + loop_body = ast_builder.ast_block_class(body_ast) + loop_ast = [ast_builder.emit_sequential_loop(self.kernel, expr.iname, + self.kernel.index_dtype, + expr.lower_bound, + expr.upper_bound, + loop_body, + context.iname_exprs)] + else: + loop_ast = body_ast + if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, - device_ast=[loop_ast]) + device_ast=loop_ast) else: - return CodeGenMapperAccumulator(host_ast=[loop_ast], + return CodeGenMapperAccumulator(host_ast=loop_ast, device_ast=children_res.device_ast) # }}} diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 59cd01420..fafc376ad 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -108,6 +108,15 @@ def with_children(self, children): return Function(self.name, self.extra_args, self.extra_inames, children) +@dataclass +class PolyhedralLoop(Loop): + iname: str + domain: isl.BasicSet + children: List[Union[InstructionBlock, Loop, "If", Barrier]] + + mapper_method: str = field(default="map_polyhedral_loop", repr=False, init=False) + + @dataclass class For(Loop): 
iname: str @@ -475,7 +484,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, @dataclass(frozen=True) -class PredicateInsertionContext: +class PolyhedronLoopifierContext: implemented_domain: isl.BasicSet gsize: Optional[Tuple[prim.Expression, ...]] = None lsize: Optional[Tuple[prim.Expression, ...]] = None @@ -490,30 +499,135 @@ def copy(self, *, implemented_domain=None, gsize=None, lsize=None): if lsize is None: lsize = self.lsize - return PredicateInsertionContext(implemented_domain, gsize, lsize) + return PolyhedronLoopifierContext(implemented_domain, gsize, lsize) -class PredicateInsertionMapper(IdentityMapper): +class PolyhedronLoopifier(IdentityMapper): def __init__(self, kernel): self.kernel = kernel def map_schedule(self, expr): impl_domain = self.kernel.assumptions - return super().map_schedule(expr, PredicateInsertionContext(impl_domain)) + return super().map_schedule(expr, + PolyhedronLoopifierContext(impl_domain)) def map_function(self, expr, context): # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids_as_exprs( InstructionGatherer()(expr)) - return super().map_function(expr, context.copy(gsize=gsize, lsize=lsize)) + # FIXME: Somehow we need to get rid of allowing the hardware inames to + # be slabbed. 
+ return super().map_function(expr, context.copy(gsize=gsize, + lsize=lsize)) + + def map_loop(self, expr, context): + implemented_domain = context.implemented_domain + assert implemented_domain.dim(dim_type.set) == 0 + + domain = self.kernel.get_inames_domain(expr.iname) + + implemented_domain = _implement_hw_axes_in_domains(implemented_domain, + domain, + self.kernel, + context.gsize, + context.lsize) + + # {{{ make already implemented loops as parameters; project out inner loops + + for set_dim in domain.get_var_names(dim_type.set): + dt, pos = domain.get_var_dict()[set_dim] + assert dt == dim_type.set + + if set_dim in implemented_domain.get_var_dict(): + # make outer loop's iname a param + domain = domain.move_dims(dim_type.param, + domain.dim(dim_type.param), + dt, pos, 1) + elif set_dim != expr.iname: + domain = domain.project_out(dt, pos, 1) + else: + pass + + # }}} + + assert domain.dim(dim_type.set) == 1 + + domain = _align_and_gist(domain, implemented_domain) + + downstream_domain = _align_and_intersect(domain + .move_dims(dim_type.param, + domain.dim(dim_type + .param), + dim_type.set, + 0, 1), + implemented_domain + ) + children = [self.rec(child, (context + .copy(implemented_domain=downstream_domain))) + for child in expr.children] + + return PolyhedralLoop(iname=expr.iname, + children=self.combine(children), + domain=domain) + + def map_polyhedral_loop(self, expr, context): + domain = _align_and_gist(expr.domain, context.implemented_domain) + downstream_domain = _align_and_intersect(domain, + context.implemented_domain) + children = [self.rec(child, (context + .copy(implemented_domain=downstream_domain))) + for child in expr.children] - def map_run_instruction(self, expr, context): - return expr + return PolyhedralLoop(iname=expr.iname, + children=self.combine(children), domain=domain) + + +class Unroller(PolyhedronLoopifier): + + def map_polyhedral_loop(self, expr, context): + from loopy.kernel.data import UnrollTag, UnrolledIlpTag + from 
loopy.isl_helpers import (make_slab, static_max_of_pw_aff, + static_min_of_pw_aff) + + if self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, + UnrollTag)): + domain = _align_and_gist(expr.domain, context.implemented_domain) + ubound = static_max_of_pw_aff(domain.dim_max(0), constants_only=False) + lbound = static_min_of_pw_aff(domain.dim_min(0), + constants_only=False) + # FIXME: Write a better error message o'er here that the loop + # cannot be unrolled. + size = static_max_of_pw_aff(ubound-lbound+1, constants_only=True) + assert size.is_cst() + + result = [] + for i in range(size.get_constant_val().to_python()): + unrll_dom = make_slab(domain.space, expr.iname, lbound+i, + lbound+i+1) & domain + if unrll_dom.is_empty(): + continue + + dwnstrm_dom = _align_and_intersect(unrll_dom, + context.implemented_domain) + children = [self.rec(child, (context + .copy(implemented_domain=dwnstrm_dom))) + for child in expr.children] + + result.append(PolyhedralLoop(iname=expr.iname, + children=self.combine(children), + domain=unrll_dom)) + + return GroupedChildren(contents=result) + else: + return super().map_polyhedral_loop(expr, context) + + def map_loop(self, expr, context): + raise RuntimeError("At this point, all loops should have resolved as" + " polyhedral loops.") - def map_barrier(self, expr, context): - return expr +class PredicateInsertionMapper(PolyhedronLoopifier): def map_instruction_block(self, expr, context): from loopy.symbolic import set_to_cond_expr @@ -538,10 +652,10 @@ def map_instruction_block(self, expr, context): .project_out_except(types=[dim_type.set], names=hw_inames)) impl_domain = _implement_hw_axes_in_domains(impl_domain, - domain, - self.kernel, - context.gsize, - context.lsize) + domain, + self.kernel, + context.gsize, + context.lsize) domain = (domain .move_dims(dim_type.param, domain.dim(dim_type.param), dim_type.set, 0, domain.dim(dim_type.set))) @@ -562,71 +676,35 @@ def map_instruction_block(self, expr, context): else: return 
new_insn_block - def map_loop(self, expr, context): + def map_polyhedral_loop(self, expr, context): from loopy.symbolic import pw_aff_to_expr - from loopy.isl_helpers import make_slab - - implemented_domain = context.implemented_domain - assert implemented_domain.dim(dim_type.set) == 0 - - domain = self.kernel.get_inames_domain(expr.iname) - - implemented_domain = _implement_hw_axes_in_domains(implemented_domain, - domain, - self.kernel, - context.gsize, - context.lsize) - - # {{{ make already implemented loops as parameters; project out inner loops - - for set_dim in domain.get_var_names(dim_type.set): - dt, pos = domain.get_var_dict()[set_dim] - assert dt == dim_type.set - - if set_dim in implemented_domain.get_var_dict(): - # make outer loop's iname a param - domain = domain.move_dims(dim_type.param, - domain.dim(dim_type.param), - dt, pos, 1) - elif set_dim != expr.iname: - domain = domain.project_out(dt, pos, 1) - else: - pass - - # }}} - - assert domain.dim(dim_type.set) == 1 - - domain = _align_and_gist(domain, implemented_domain) - - lower_bound = domain.dim_min(0) - upper_bound = domain.dim_max(0) + from loopy.isl_helpers import (static_min_of_pw_aff, + static_max_of_pw_aff, make_slab) + + lb = static_min_of_pw_aff(expr.domain.dim_min(0).gist(context + .implemented_domain), + constants_only=False) + ub = static_max_of_pw_aff(expr.domain.dim_max(0).gist(context + .implemented_domain), + constants_only=False) + set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, lb, ub+1) + + outer_condition = _align_and_gist(expr.domain.project_out(dim_type.set, + 0, 1), + set_implemented_in_loop) + inner_condition = _align_and_gist(expr.domain.affine_hull(), + set_implemented_in_loop) - downstream_domain = _align_and_intersect(domain - .move_dims(dim_type.param, - domain.dim(dim_type - .param), - dim_type.set, - 0, 1), - implemented_domain - ) - set_implemented_in_loop = make_slab(domain.space, expr.iname, - lower_bound, upper_bound+1) - outer_condition = 
domain.gist(set_implemented_in_loop) - - inner_condition = domain.affine_hull() step = 1 # TODO: from inner_condition try to guess the step - children = [self.rec(child, (context - .copy(implemented_domain=downstream_domain))) - for child in expr.children] - for_ = For(iname=expr.iname, - lower_bound=pw_aff_to_expr(lower_bound), - upper_bound=pw_aff_to_expr(upper_bound), + lower_bound=pw_aff_to_expr(lb), + upper_bound=pw_aff_to_expr(ub), step=step, children=_wrap_in_if(inner_condition, - children)) + (super() + .map_polyhedral_loop(expr, context) + .children))) if outer_condition.is_universe(): return for_ From 28de5d8ed862ac7acd25d3604745cc43c95c2f47 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 10:40:46 -0500 Subject: [PATCH 035/109] schedule tree: handle vectorized loops --- loopy/kernel/tools.py | 7 +- loopy/schedule/tree.py | 161 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 153 insertions(+), 15 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 365746a33..8382fd002 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2064,15 +2064,16 @@ def get_outer_params(domains): # }}} -def get_all_hw_inames(kernel): +@memoize_on_first_arg +def get_all_inames_tagged_with(kernel, tag_type): """ Returns :class:`frozenset` of all iname traversing across the target hardware's execution grid. 
""" - from loopy.kernel.data import (filter_iname_tags_by_type, HardwareConcurrentTag) + from loopy.kernel.data import filter_iname_tags_by_type return frozenset(iname.name for iname in kernel.inames.values() - if filter_iname_tags_by_type(iname.tags, HardwareConcurrentTag)) + if filter_iname_tags_by_type(iname.tags, tag_type)) @memoize_on_first_arg diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index fafc376ad..60f159028 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -7,7 +7,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from loopy.kernel import KernelState -from loopy.kernel.tools import get_all_hw_inames +from loopy.kernel.tools import get_all_inames_tagged_with # {{{ LoopKernel.schedule a tree @@ -441,10 +441,10 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, from *implemented_domain* and constraints arising from constraining hardware inames in *domain* to their corresponding """ - from loopy.kernel.data import HardwareConcurrentTag, GroupIndexTag, LocalIndexTag + from loopy.kernel.data import AxisTag, GroupIndexTag, LocalIndexTag from loopy.isl_helpers import make_slab, static_min_of_pw_aff - all_hw_inames = get_all_hw_inames(kernel) + all_hw_inames = get_all_inames_tagged_with(kernel, AxisTag) for dim_name in domain.get_var_dict(): if dim_name in all_hw_inames: @@ -452,7 +452,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, # this hardware dim is already implemented => ignore continue - tag, = kernel.iname_tags_of_type(dim_name, HardwareConcurrentTag) + tag, = kernel.iname_tags_of_type(dim_name, AxisTag) assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) lbound = static_min_of_pw_aff(kernel @@ -480,7 +480,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, & make_slab(implemented_domain.space, dim_name, lbound, lbound + size)) - return implemented_domain + return implemented_domain.params() @dataclass(frozen=True) @@ -562,7 +562,7 @@ def 
map_loop(self, expr, context): dim_type.set, 0, 1), implemented_domain - ) + ).params() children = [self.rec(child, (context .copy(implemented_domain=downstream_domain))) for child in expr.children] @@ -572,9 +572,16 @@ def map_loop(self, expr, context): domain=domain) def map_polyhedral_loop(self, expr, context): + assert expr.domain.dim(dim_type.set) == 1 + assert context.implemented_domain.dim(dim_type.set) == 0 + domain = _align_and_gist(expr.domain, context.implemented_domain) downstream_domain = _align_and_intersect(domain, context.implemented_domain) + downstream_domain = downstream_domain.move_dims(dim_type.param, + (downstream_domain + .dim(dim_type.param)), + dim_type.set, 0, 1).params() children = [self.rec(child, (context .copy(implemented_domain=downstream_domain))) for child in expr.children] @@ -583,19 +590,137 @@ def map_polyhedral_loop(self, expr, context): children=self.combine(children), domain=domain) +class UnvectorizableInamesCollector(CombineMapper): + """ + Mapper to gather all insn ids. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + assert all(isinstance(value, frozenset) for value in values) + return reduce(frozenset.union, values, frozenset()) + + def map_polyhedral_loop(self, expr): + from loopy.kernel.data import VectorizeTag + from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff + from loopy.diagnostic import warn + from loopy.symbolic import pw_aff_to_expr + from loopy.expression import VectorizabilityChecker + from loopy.codegen import Unvectorizable + from loopy.kernel.instruction import MultiAssignmentBase + + if self.kernel.iname_tags_of_type(expr.iname, VectorizeTag): + # FIXME: also assert that all children are just instruction blocks.. 
+ bounds = self.kernel.get_iname_bounds(expr.iname, constants_only=True) + + length_aff = static_max_of_pw_aff(bounds.size, constants_only=True) + + if not length_aff.is_cst(): + warn(self.kernel, "vec_upper_not_const", + f"upper bound for vectorized loop '{expr.iname}' is not" + " a constant, cannot vectorize.") + return frozenset([expr.iname]) + + length = int(pw_aff_to_expr(length_aff)) + + lower_bound_aff = static_value_of_pw_aff(bounds + .lower_bound_pw_aff + .coalesce(), + constants_only=False) + + if not lower_bound_aff.plain_is_zero(): + warn(self.kernel, "vec_lower_not_0", + f"lower bound for vectorized loop '{expr.iname}' is not zero," + "cannot vectorize.") + return frozenset([expr.iname]) + + # {{{ validate the vectorizability of instructions within the + + for child in expr.children: + if not isinstance(child, InstructionBlock): + warn(self.kernel, "vec_loop_complex_control_flow", + f"loop nest of vectorized loop '{expr.iname}' contains" + " other loops or barriers => unvectorizable.") + return frozenset([expr.iname]) + + assert isinstance(child, InstructionBlock) + for run_insn in child.children: + insn = self.kernel.id_to_insn[run_insn.insn_id] + + if not isinstance(insn, MultiAssignmentBase): + warn(self.kernel, "vec_loop_contains_non_assignment_insn", + f"loop nest of vectorized loop '{expr.iname}' contains" + f" instruction of type {type(insn)} that cannot be" + " vectorized.") + return frozenset([expr.iname]) + + if insn.predicates: + warn(self.kernel, "vec_loop_contains_predicates", + f"loop nest of vectorized loop '{expr.iname}' contains" + "predicates => masking instances of vectorized" + "loop not yet supported.") + return frozenset([expr.iname]) + + if insn.atomicity: + warn(self.kernel, "vec_loop_contains_atomic_insns", + f"loop nest of vectorized loop '{expr.iname}' contains" + "atomic instructions => unvectorizable.") + return frozenset([expr.iname]) + + vcheck = VectorizabilityChecker( + self.kernel, expr.iname, length) + + try: + 
lhs_is_vector = vcheck(insn.assignee) + rhs_is_vector = vcheck(insn.expression) + except Unvectorizable as e: + warn(self.kernel, "vectorize_failed", + f"Vectorization of '{expr.iname}' failed due to '{e}'" + f" in '{insn.id}'.") + return frozenset([expr.iname]) + else: + if not lhs_is_vector and rhs_is_vector: + warn(self.kernel, "vectorize_failed", + f"Vectorization of '{expr.iname}' failed in'" + f" '{insn.id}' as LHS is scalar, RHS is vector," + " cannot assign") + return frozenset([expr.iname]) + + # }}} + + return super().map_polyhedral_loop(expr) + + def map_run_instruction(self, expr): + return frozenset() + + def map_barrier(self, expr): + return frozenset() + + class Unroller(PolyhedronLoopifier): + """ + .. attribute extra_unroll_inames:: + + A :class:`frozenset` of inames that are to be unrolled other than the + usual suspects tagged with 'unr`. One use-case could be unrolling could + be a fallback implementation for other iname implementations. + """ + def __init__(self, kernel, extra_unroll_inames): + super().__init__(kernel) + self.extra_unroll_inames = extra_unroll_inames def map_polyhedral_loop(self, expr, context): from loopy.kernel.data import UnrollTag, UnrolledIlpTag from loopy.isl_helpers import (make_slab, static_max_of_pw_aff, static_min_of_pw_aff) - if self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, - UnrollTag)): + if (self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, + UnrollTag)) + or expr.iname in self.extra_unroll_inames): domain = _align_and_gist(expr.domain, context.implemented_domain) ubound = static_max_of_pw_aff(domain.dim_max(0), constants_only=False) - lbound = static_min_of_pw_aff(domain.dim_min(0), - constants_only=False) + lbound = static_min_of_pw_aff(domain.dim_min(0), constants_only=False) # FIXME: Write a better error message o'er here that the loop # cannot be unrolled. 
size = static_max_of_pw_aff(ubound-lbound+1, constants_only=True) @@ -610,6 +735,11 @@ def map_polyhedral_loop(self, expr, context): dwnstrm_dom = _align_and_intersect(unrll_dom, context.implemented_domain) + + dwnstrm_dom = dwnstrm_dom.move_dims(dim_type.param, + (dwnstrm_dom + .dim(dim_type.param)), + dim_type.set, 0, 1).params() children = [self.rec(child, (context .copy(implemented_domain=dwnstrm_dom))) for child in expr.children] @@ -643,7 +773,8 @@ def map_instruction_block(self, expr, context): # {{{ compute the predicates due to the hardware inames - hw_inames = inames & get_all_hw_inames(self.kernel) + from loopy.kernel.data import AxisTag + hw_inames = inames & get_all_inames_tagged_with(self.kernel, AxisTag) if hw_inames: @@ -681,6 +812,7 @@ def map_polyhedral_loop(self, expr, context): from loopy.isl_helpers import (static_min_of_pw_aff, static_max_of_pw_aff, make_slab) + assert expr.domain.dim(dim_type.set) == 1 lb = static_min_of_pw_aff(expr.domain.dim_min(0).gist(context .implemented_domain), constants_only=False) @@ -797,9 +929,14 @@ def insert_predicates_into_schedule(kernel): # }}} schedule = PolyhedronLoopifier(kernel)(kernel.schedule) - schedule = Unroller(kernel)(schedule) + unvectorizable_inames = UnvectorizableInamesCollector(kernel)(schedule) + # FIXME: (For now) unvectorizable inames always fallback to unrolling this + # should be selected based on the target. 
+ schedule = Unroller(kernel, unvectorizable_inames)(schedule) schedule = PredicateInsertionMapper(kernel)(schedule) + kernel = kernel.copy(schedule=schedule) + return kernel.copy(schedule=schedule) From 477b20269cf7cd6bfeaba88d49d52d73979e99fc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 10:50:44 -0500 Subject: [PATCH 036/109] pass vectorization to ast_builder's method --- loopy/codegen/instruction.py | 30 ++++-------------------------- loopy/codegen/result.py | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index d743bab68..06565f345 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -28,6 +28,7 @@ dim_type = isl.dim_type from loopy.codegen import UnvectorizableError from loopy.codegen.result import CodeGenerationResult +from loopy.diagnostic import LoopyError from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_on_first_arg @@ -107,29 +108,6 @@ def generate_instruction_code(codegen_state, insn): def generate_assignment_instruction_code(kernel, insn, ast_builder, hw_inames_expr, vinfo): ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr) - from loopy.expression import VectorizabilityChecker - - # {{{ vectorization handling - - if vinfo is not None: - if insn.atomicity: - raise UnvectorizableError("atomic operation") - - vcheck = VectorizabilityChecker( - kernel, vinfo.iname, vinfo.length) - lhs_is_vector = vcheck(insn.assignee) - rhs_is_vector = vcheck(insn.expression) - - if not lhs_is_vector and rhs_is_vector: - raise UnvectorizableError( - "LHS is scalar, RHS is vector, cannot assign") - - is_vector = lhs_is_vector - - del lhs_is_vector - del rhs_is_vector - - # }}} from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript @@ -155,15 +133,15 @@ def generate_assignment_instruction_code(kernel, insn, ast_builder, del lhs 
- result = ast_builder.emit_assignment(kernel, insn, hw_inames_expr) + result = ast_builder.emit_assignment(kernel, insn, hw_inames_expr, vinfo) # {{{ tracing lhs_dtype = kernel.get_var_descriptor(assignee_var_name).dtype if kernel.options.trace_assignments or kernel.options.trace_assignment_values: - if vinfo and is_vector: - raise UnvectorizableError("tracing does not support vectorization") + if vinfo: + raise LoopyError("tracing does not support vectorization") from pymbolic.mapper.stringifier import PREC_NONE lhs_code = ecm(insn.assignee, PREC_NONE) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index b4727c63f..5055040ef 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -324,8 +324,7 @@ def map_function(self, expr, context): # {{{ record the iname_exprs for downstream elements from functools import reduce - from loopy.kernel.data import (HardwareConcurrentTag, GroupIndexTag, - LocalIndexTag) + from loopy.kernel.data import GroupIndexTag, LocalIndexTag from loopy.isl_helpers import static_min_of_pw_aff from loopy.symbolic import (GroupHardwareAxisIndex, LocalHardwareAxisIndex, @@ -337,8 +336,8 @@ def map_function(self, expr, context): frozenset()) def _hw_iname_expr(iname): - tag, = self.kernel.iname_tags_of_type(iname, HardwareConcurrentTag) - assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) + tag, = self.kernel.iname_tags_of_type(iname, (GroupIndexTag, + LocalIndexTag)) lbound = static_min_of_pw_aff(self .kernel.get_iname_bounds(iname) .lower_bound_pw_aff, @@ -352,7 +351,8 @@ def _hw_iname_expr(iname): iname_exprs = {iname: _hw_iname_expr(iname) for iname in all_inames if self.kernel.iname_tags_of_type(iname, - HardwareConcurrentTag)} + (LocalIndexTag, + GroupIndexTag))} # }}} @@ -395,7 +395,15 @@ def map_for(self, expr, context): ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 if self.kernel.iname_tags_of_type(expr.iname, vec_tags): - raise NotImplementedError + 
assert isinstance(expr.lower_bound, int) + assert isinstance(expr.upper_bound, int) + assert expr.step == 1 + length = expr.upper_bound - expr.lower_bound + 1 + dwnstrm_ctx = context.copy( + vectorization_info=VectorizationInfo(iname=expr.iname, + length=length)) + return self.combine([self.rec(child, dwnstrm_ctx) + for child in expr.children]) else: assert (len(self.kernel.inames[expr.iname].tags) == 0 or self.kernel.iname_tags_of_type(expr.iname, From 556d6af7143de69987400cacce3ea14b8c15481c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 11:48:35 -0500 Subject: [PATCH 037/109] target: adapt to the new calling mechanism for passing vectorization_info --- loopy/codegen/__init__.py | 4 +--- loopy/codegen/instruction.py | 3 ++- loopy/codegen/result.py | 3 ++- loopy/target/__init__.py | 10 ++++++---- loopy/target/c/__init__.py | 29 +++++++++++++++++----------- loopy/target/c/codegen/expression.py | 9 ++------- loopy/target/cuda.py | 6 ++++-- loopy/target/ispc.py | 18 +++++++++-------- loopy/target/numba.py | 6 ++++-- loopy/target/opencl.py | 12 ++++++++---- loopy/target/pyopencl.py | 16 +++++++++------ loopy/target/python.py | 29 ++++++++++++++++++++-------- 12 files changed, 88 insertions(+), 57 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3488a2a5e..e70d06c96 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -140,13 +140,11 @@ class VectorizationInfo: """ .. attribute:: iname .. attribute:: length - .. 
attribute:: space """ - def __init__(self, iname, length, space): + def __init__(self, iname, length): self.iname = iname self.length = length - self.space = space class SeenFunction(ImmutableRecord): diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 06565f345..1f32492bf 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -107,7 +107,8 @@ def generate_instruction_code(codegen_state, insn): def generate_assignment_instruction_code(kernel, insn, ast_builder, hw_inames_expr, vinfo): - ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr) + ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr, + vinfo) from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 5055040ef..6ade4ccd3 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -457,7 +457,8 @@ def map_if(self, expr, context): else children_res.host_ast) if_ast = ast_builder.emit_if(self.kernel, expr.condition, if_body, - context.iname_exprs) + context.iname_exprs, + context.vectorization_info) if context.in_device: return CodeGenMapperAccumulator(host_ast=children_res.host_ast, diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 30715dba8..b650554d2 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -221,7 +221,8 @@ def ast_for_class(self): def ast_if_class(self): raise NotImplementedError() - def get_expression_to_code_mapper(self, kernel, var_subst_map): + def get_expression_to_code_mapper(self, kernel, var_subst_map, + vectorization_info): raise NotImplementedError() def add_vector_access(self, access_expr, index): @@ -243,7 +244,7 @@ def get_global_arg_decl(self, name, shape, dtype, is_written): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() - def emit_assignment(self, codegen_state, insn): + 
def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): raise NotImplementedError() def emit_multiple_assignment(self, codegen_state, insn): @@ -257,7 +258,7 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, def can_implement_conditionals(self): return False - def emit_if(self, condition_str, ast): + def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): raise NotImplementedError() def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): @@ -307,7 +308,8 @@ def get_function_declaration(self, kernel, name, implemented_data_info, def get_temporary_decls(self, codegen_state, schedule_index): return [] - def get_expression_to_code_mapper(self, kernel, var_subst_map): + def get_expression_to_code_mapper(self, kernel, var_subst_map, + vectorization_info): return _DummyExpressionToCodeMapper() def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5027cdb5a..1c7c0fdb8 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -938,17 +938,20 @@ def ast_module(self): import cgen return cgen - def get_expression_to_code_mapper(self, kernel, callables_table, var_subst_map): + def get_expression_to_code_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): return self.get_expression_to_c_expression_mapper(kernel, callables_table, - var_subst_map) + var_subst_map, + vectorization_info) def get_expression_to_c_expression_mapper(self, kernel, callables_table, - var_subst_map): + var_subst_map, + vectorization_info): from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper - return ExpressionToCExpressionMapper( - kernel, callables_table, self, var_subst_map, - fortran_abi=self.target.fortran_abi) + return ExpressionToCExpressionMapper(kernel, callables_table, self, + var_subst_map, vectorization_info, + fortran_abi=self.target.fortran_abi) def 
get_c_expression_to_code_mapper(self): from loopy.target.c.codegen.expression import CExpressionToCodeMapper @@ -1025,9 +1028,10 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): return arg_decl - def emit_assignment(self, kernel, insn, var_subst_map): + def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) assignee_var_name, = insn.assignee_var_names() @@ -1134,7 +1138,8 @@ def emit_multiple_assignment(self, codegen_state, insn): def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, inner, var_subst_map): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info=None) from pymbolic import var from pymbolic.primitives import Comparison @@ -1176,9 +1181,11 @@ def emit_comment(self, s): def can_implement_conditionals(self): return True - def emit_if(self, kernel, condition, ast, var_subst_map): + def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): + assert vectorization_info is None, "cannot be vectorizable if we see an if" from cgen import If - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info=None) return If(ecm(condition), ast) # }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index bd29e0560..81f3d2a96 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -63,23 +63,18 @@ class ExpressionToCExpressionMapper(IdentityMapper): with. A caller might set this to map iname to hardware iname expressions. 
""" def __init__(self, kernel, callables_table, ast_builder, var_subst_map, - vectorization_info=None, fortran_abi=False, - type_inf_mapper=None): + vectorization_info, fortran_abi=False, type_inf_mapper=None): self.kernel = kernel self.callables_table = callables_table self.ast_builder = ast_builder self.var_subst_map = var_subst_map + self.vectorization_info = vectorization_info if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, callables_table) self.type_inf_mapper = type_inf_mapper - - # TODO: rewire mapper methods so that we don't store vectorization_info - # as a state, but instead pass it as an argument to each mapper method - self.vectorization_info = vectorization_info - self.fortran_abi = fortran_abi # {{{ helpers diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 53df744b2..94564420d 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -364,8 +364,10 @@ def preamble_generators(self): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): - return ExpressionToCudaCExpressionMapper(kernel, self, var_subst_map) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): + return ExpressionToCudaCExpressionMapper(kernel, self, var_subst_map, + vectorization_info) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 83b0936b6..2dbd9f70b 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -285,8 +285,9 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): - return ExprToISPCExprMapper(kernel, self, var_subst_map) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): + return ExprToISPCExprMapper(kernel, self, var_subst_map, vectorization_info) def add_vector_access(self, access_expr, index): return access_expr[index] @@ -372,13 
+373,14 @@ def get_value_arg_decl(self, name, shape, dtype, is_written): from cgen.ispc import ISPCUniform return ISPCUniform(result) - def emit_assignment(self, codegen_state, insn): - kernel = codegen_state.kernel - ecm = codegen_state.expression_to_code_mapper + def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): + raise NotImplementedError + ecm = self.expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) assignee_var_name, = insn.assignee_var_names() - lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_var = kernel.get_var_descriptor(assignee_var_name) lhs_dtype = lhs_var.dtype if insn.atomicity: @@ -408,8 +410,8 @@ def emit_assignment(self, codegen_state, insn): simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) access_info = get_access_info(kernel.target, ary, index_tuple, - lambda expr: evaluate(expr, codegen_state.var_subst_map), - codegen_state.vectorization_info) + lambda expr: evaluate(expr, var_subst_map), + vectorization_info) from loopy.kernel.data import ArrayArg, TemporaryVariable diff --git a/loopy/target/numba.py b/loopy/target/numba.py index f32fad066..c18b07cbd 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -164,8 +164,10 @@ def preamble_generators(self): def get_python_function_decorators(self): return ("@_lpy_ncu.jit",) - def get_expression_to_code_mapper(self, kernel, var_subst_map): - return NumbaCudaExpressionToPythonMapper(kernel, self, var_subst_map) + def get_expression_to_code_mapper(self, kernel, var_subst_map, + vectorization_info): + return NumbaCudaExpressionToPythonMapper(kernel, self, var_subst_map, + vectorization_info) class NumbaCudaTarget(TargetBase): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d75d4ec11..7c7191f79 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -649,8 +649,10 @@ def generate_top_of_body(self, codegen_state): # {{{ code generation guts - def 
get_expression_to_c_expression_mapper(self, kernel, var_subst_map): - return ExpressionToOpenCLCExpressionMapper(kernel, self, var_subst_map) + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): + return ExpressionToOpenCLCExpressionMapper(kernel, self, var_subst_map, + vectorization_info) def add_vector_access(self, access_expr, index): # The 'int' avoids an 'L' suffix for long ints. @@ -893,9 +895,11 @@ def make_subscript(self, array, base_expr, subscript): class VolatileMemOpenCLCASTBuilder(OpenCLCASTBuilder): - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, - var_subst_map) + var_subst_map, + vectorization_info) class VolatileMemOpenCLTarget(OpenCLTarget): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 795f1af97..60efd4ff0 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -772,7 +772,8 @@ def alloc_nbytes(tv): return code_lines def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( @@ -863,20 +864,23 @@ def preamble_generators(self): # }}} - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map): - return ExpressionToPyOpenCLCExpressionMapper(kernel, self, var_subst_map) - + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): + return ExpressionToPyOpenCLCExpressionMapper(kernel, self, var_subst_map, + vectorization_info) # }}} # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): - def 
get_expression_to_c_expression_mapper(self, kernel, var_subst_map): + def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + vectorization_info): from loopy.target.opencl import \ VolatileMemExpressionToOpenCLCExpressionMapper return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, - var_subst_map) + var_subst_map, + vectorization_info) class VolatileMemPyOpenCLTarget(PyOpenCLTarget): diff --git a/loopy/target/python.py b/loopy/target/python.py index 1810aa426..37aab18ad 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -35,11 +35,17 @@ # {{{ expression to code class ExpressionToPythonMapper(StringifyMapper): - def __init__(self, kernel, ast_builder, var_subst_map, type_inf_mapper=None): + def __init__(self, kernel, ast_builder, var_subst_map, vectorization_info, + type_inf_mapper=None): self.kernel = kernel self.ast_builder = ast_builder self.var_subst_map = var_subst_map + if vectorization_info: + raise NotImplementedError("vectorization not implemented") + + self.vectorization_info = vectorization_info + if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, self.codegen_state.callables_table) @@ -201,8 +207,10 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result - def get_expression_to_code_mapper(self, kernel, var_subst_map): - return ExpressionToPythonMapper(kernel, self, var_subst_map) + def get_expression_to_code_mapper(self, kernel, var_subst_map, + vectorization_info): + return ExpressionToPythonMapper(kernel, self, var_subst_map, + vectorization_info) @property def ast_base_class(self): @@ -262,20 +270,25 @@ def emit_comment(self, s): def can_implement_conditionals(self): return True - def emit_if(self, kernel, condition, ast, var_subst_map): + def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): + assert vectorization_info is None from genpy import If - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = 
self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) return If(ecm(condition), ast) - def emit_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - + def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): if insn.atomicity: raise NotImplementedError("atomic ops in Python") + if vectorization_info: + raise NotImplementedError("vectorized assignments in Python") + from pymbolic.mapper.stringifier import PREC_NONE from genpy import Assign + ecm = self.expression_to_code_mapper(kernel, var_subst_map) + return Assign( ecm(insn.assignee, prec=PREC_NONE, type_context=None), ecm(insn.expression, prec=PREC_NONE, type_context=None)) From 7a2bfb48125223a94f88550801d89f9dab32474a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 14:22:28 -0500 Subject: [PATCH 038/109] schedule.tree.Barrier: also note down the mem_kind --- loopy/schedule/tree.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 60f159028..30bd24503 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -45,6 +45,7 @@ class Barrier(ScheduleNode): """ comment: str synchronization_kind: str + mem_kind: str originating_insn_id: Optional[str] mapper_method: str = field(default="map_barrier", repr=False, init=False) @@ -205,12 +206,13 @@ def add_run_instruction(self, insn_id): self.current_node.children.append(RunInstruction(insn_id)) - def add_barrier(self, comment, kind, insn_id): + def add_barrier(self, comment, sync_kind, mem_kind, insn_id): if isinstance(self.current_node, InstructionBlock): self._build_stack.pop() assert isinstance(self.current_node, (Schedule, Function, Loop)) - self.current_node.children.append(Barrier(comment, kind, insn_id)) + self.current_node.children.append(Barrier(comment, sync_kind, mem_kind, + insn_id)) def exit_function(self): if isinstance(self.current_node, InstructionBlock): @@ -254,6 
+256,7 @@ def make_schedule_tree(kernel): elif isinstance(sched_item, schedule.Barrier): bob.add_barrier(sched_item.comment, sched_item.synchronization_kind, + sched_item.mem_kind, sched_item.originating_insn_id) else: raise NotImplementedError(type(sched_item)) @@ -316,7 +319,7 @@ def map_loop(self, expr, *args, **kwargs): def map_barrier(self, expr, *args, **kwargs): return Barrier(expr.comment, expr.synchronization_kind, - expr.originating_insn_id) + expr.mem_kind, expr.originating_insn_id) def map_run_instruction(self, expr, *args, **kwargs): return RunInstruction(expr.insn_id) From fedc8e0004fdccd52a29af0a4189123eddb3c3b2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 14:22:44 -0500 Subject: [PATCH 039/109] CodeGenMapper: implements barriers --- loopy/codegen/result.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 6ade4ccd3..1a165f3b7 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -434,6 +434,7 @@ def map_for(self, expr, context): loop_body, context.iname_exprs)] else: + # special case: if ubound == lbound => just have the body loop_ast = body_ast if context.in_device: @@ -470,8 +471,20 @@ def map_if(self, expr, context): # }}} def map_barrier(self, expr, context): - # ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 - raise NotImplementedError + if context.in_device: + ast_builder = self.device_ast_builder + barrier_ast = ast_builder.emit_barrier(expr.synchronization_kind, + expr.mem_kind, expr.comment) + return CodeGenMapperAccumulator(device_ast=[barrier_ast], + host_ast=[]) + else: + if expr.synchronization_kind in ["global", "local"]: + return CodeGenMapperAccumulator(host_ast=[self + .host_ast_builder + .emit_blank_line()], + device_ast=[]) + else: + raise NotImplementedError(f"Host barrier for {expr}.") # {{{ instruction From 
c9408773dc1526e0e87f36651053daa52b9f6bae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 14:45:08 -0500 Subject: [PATCH 040/109] formatting: move CodeGenMapperAccumulator definition near to the CodeGenMapper --- loopy/codegen/result.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 1a165f3b7..7519f69a1 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -163,6 +163,14 @@ def get_idis_for_kernel(kernel): return implemented_data_info +# {{{ program generation top-level + +@dataclass(frozen=True) +class CodeGenMapperAccumulator: + host_ast: List[Union[Any]] + device_ast: List[Union[GeneratedProgram, Any]] + + @dataclass(frozen=True) class CodeGenerationContext: """ @@ -190,14 +198,6 @@ def copy(self, *, in_device=None, iname_exprs=None, vectorization_info=vectorization_info) -# {{{ program generation top-level - -@dataclass(frozen=True) -class CodeGenMapperAccumulator: - host_ast: List[Union[Any]] - device_ast: List[Union[GeneratedProgram, Any]] - - class CodeGenMapper(CombineMapper): def __init__(self, kernel): self.kernel = kernel From 30bb84cd19b7a5b9dd7b8bc95bc84ee7c7da8934 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 14:45:58 -0500 Subject: [PATCH 041/109] note that iname_slab_increments are not implemented for now --- loopy/schedule/tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 30bd24503..9cc739a3b 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -922,6 +922,9 @@ def homogenize_instruction_blocks(kernel): def insert_predicates_into_schedule(kernel): + if kernel.iname_slab_increments: + raise NotImplementedError + assert kernel.state >= KernelState.LINEARIZED assert isinstance(kernel.schedule, Schedule) From 4210bda35f7f15ed4ebe1475b1866ad2fd38f520 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021
14:46:23 -0500 Subject: [PATCH 042/109] remove earlier implementations --- loopy/codegen/control.py | 477 --------------------------------------- loopy/codegen/loop.py | 373 ------------------------------ 2 files changed, 850 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 8adf97fa7..2f2b3e221 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,12 +23,6 @@ THE SOFTWARE. """ -import islpy as isl -from loopy.schedule import ( - EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, - gather_schedule_block, generate_sub_sched_items) -from loopy.diagnostic import LoopyError - def synthesize_idis_for_extra_args(kernel, sched_item): """ @@ -63,475 +57,4 @@ def synthesize_idis_for_extra_args(kernel, sched_item): return idis - -def generate_code_for_sched_index(codegen_state, sched_index): - kernel = codegen_state.kernel - sched_item = kernel.linearization[sched_index] - - if isinstance(sched_item, CallKernel): - assert not codegen_state.is_generating_device_code - - from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at) - _, past_end_i = gather_schedule_block(kernel.linearization, sched_index) - assert past_end_i <= codegen_state.schedule_index_end - - extra_args = synthesize_idis_for_extra_args(kernel, sched_index) - - new_codegen_state = codegen_state.copy( - is_generating_device_code=True, - gen_program_name=sched_item.kernel_name, - schedule_index_end=past_end_i-1, - implemented_data_info=(codegen_state.implemented_data_info - + extra_args)) - - from loopy.codegen.result import generate_host_or_device_program - codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index) - - if codegen_state.is_entrypoint: - glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.linearization, sched_index), - codegen_state.callables_table) - return merge_codegen_results(codegen_state, [ - codegen_result, - - 
codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) - else: - # do not generate host code for non-entrypoint kernels - return codegen_result - - elif isinstance(sched_item, EnterLoop): - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, - ForceSequentialTag, LoopedIlpTag, VectorizeTag, - InameImplementationTag, - InOrderSequentialSequentialTag, filter_iname_tags_by_type) - - tags = kernel.iname_tags_of_type(sched_item.iname, InameImplementationTag) - tags = tuple(tag for tag in tags if tag) - - from loopy.codegen.loop import ( - generate_unroll_loop, - generate_vectorize_loop, - generate_sequential_loop_dim_code) - - if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): - func = generate_unroll_loop - elif filter_iname_tags_by_type(tags, VectorizeTag): - func = generate_vectorize_loop - elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, - ForceSequentialTag, InOrderSequentialSequentialTag)): - func = generate_sequential_loop_dim_code - else: - raise RuntimeError("encountered (invalid) EnterLoop " - "for '%s', tagged '%s'" - % (sched_item.iname, ", ".join(str(tag) for tag in tags))) - - return func(codegen_state, sched_index) - - elif isinstance(sched_item, Barrier): - # {{{ emit barrier code - - from loopy.codegen.result import CodeGenerationResult - - if codegen_state.is_generating_device_code: - barrier_ast = codegen_state.ast_builder.emit_barrier( - sched_item.synchronization_kind, sched_item.mem_kind, - sched_item.comment) - if sched_item.originating_insn_id: - return CodeGenerationResult.new( - codegen_state, - sched_item.originating_insn_id, - barrier_ast, - codegen_state.implemented_domain) - else: - return barrier_ast - else: - # host code - if sched_item.synchronization_kind in ["global", "local"]: - # host code is assumed globally and locally synchronous - return CodeGenerationResult( - host_program=None, - device_programs=[], - 
implemented_domains={}, - implemented_data_info=codegen_state.implemented_data_info) - - else: - raise LoopyError("do not know how to emit code for barrier " - "synchronization kind '%s'" "in host code" - % sched_item.synchronization_kind) - - # }}} - - elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] - - from loopy.codegen.instruction import generate_instruction_code - return codegen_state.try_vectorized( - "instruction %s" % insn.id, - lambda inner_cgs: generate_instruction_code(inner_cgs, insn)) - - else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) - - -def get_required_predicates(kernel, sched_index): - result = None - for _, sched_item in generate_sub_sched_items(kernel.linearization, sched_index): - if isinstance(sched_item, Barrier): - my_preds = frozenset() - elif isinstance(sched_item, RunInstruction): - my_preds = kernel.id_to_insn[sched_item.insn_id].predicates - else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) - - if result is None: - result = my_preds - else: - result = result & my_preds - - if result is None: - result = frozenset() - - return result - - -def group_by(entry, key, merge): - if not entry: - return entry - - result = [] - previous = entry[0] - - for item in entry[1:]: - if key(previous) == key(item): - previous = merge(previous, item) - - else: - result.append(previous) - previous = item - - result.append(previous) - return result - - -def build_loop_nest(codegen_state, schedule_index): - # Most of the complexity of this function goes towards finding groups of - # instructions that can be nested inside a shared conditional. - - kernel = codegen_state.kernel - - # If the AST builder does not implement conditionals, we can save us - # some work about hoisting conditionals and directly go into recursion. 
- if not codegen_state.ast_builder.can_implement_conditionals: - result = [] - inner = generate_code_for_sched_index(codegen_state, schedule_index) - if inner is not None: - result.append(inner) - return merge_codegen_results(codegen_state, result) - - # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices - - # i.e. go up to the next LeaveLoop, and skip over inner loops. - - my_sched_indices = [] - - i = schedule_index - while i < codegen_state.schedule_index_end: - sched_item = kernel.linearization[i] - - if isinstance(sched_item, LeaveLoop): - break - - my_sched_indices.append(i) - - if isinstance(sched_item, (EnterLoop, CallKernel)): - _, i = gather_schedule_block(kernel.linearization, i) - assert i <= codegen_state.schedule_index_end, \ - "schedule block extends beyond schedule_index_end" - - elif isinstance(sched_item, Barrier): - i += 1 - - elif isinstance(sched_item, RunInstruction): - i += 1 - else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) - - del i - - # }}} - - # {{{ pass 2: find admissible conditional inames for each sibling schedule item - - from pytools import ImmutableRecord - - class ScheduleIndexInfo(ImmutableRecord): - """ - .. attribute:: schedule_index - .. attribute:: admissible_cond_inames - .. attribute:: required_predicates - .. 
attribute:: used_inames_within - """ - - from loopy.schedule import find_used_inames_within - from loopy.codegen.bounds import get_usable_inames_for_conditional - - sched_index_info_entries = [ - ScheduleIndexInfo( - schedule_indices=[i], - admissible_cond_inames=( - get_usable_inames_for_conditional(kernel, i, - codegen_state.codegen_cachemanager)), - required_predicates=get_required_predicates(kernel, i), - used_inames_within=find_used_inames_within(kernel, i) - ) - for i in my_sched_indices - ] - - sched_index_info_entries = group_by( - sched_index_info_entries, - key=lambda sii: ( - sii.admissible_cond_inames, - sii.required_predicates, - sii.used_inames_within), - merge=lambda sii1, sii2: sii1.copy( - schedule_indices=( - sii1.schedule_indices - + - sii2.schedule_indices))) - - # }}} - - # {{{ pass 3: greedily group schedule items that share admissible inames - - from pytools import memoize_method - - class BoundsCheckCache: - def __init__(self, kernel, impl_domain): - self.kernel = kernel - self.impl_domain = impl_domain - - @memoize_method - def __call__(self, check_inames): - if not check_inames: - return [] - - domain = isl.align_spaces( - self.kernel.get_inames_domain(check_inames), - self.impl_domain, obj_bigger_ok=True) - from loopy.codegen.bounds import get_approximate_convex_bounds_checks - # Each instruction individually gets its bounds checks, - # so we can safely overapproximate here. - return get_approximate_convex_bounds_checks(domain, - check_inames, self.impl_domain, self.kernel.cache_manager) - - def build_insn_group(sched_index_info_entries, codegen_state, - done_group_lengths=frozenset()): - """ - :arg done_group_lengths: A set of group lengths (integers) that grows - from empty to include the longest found group and downwards with every - recursive call. It serves to prevent infinite recursion by preventing - recursive calls from doing anything about groups that are too small. 
- """ - - from loopy.symbolic import get_dependencies - - # The rough plan here is that build_insn_group starts out with the - # entirety of the current schedule item's downward siblings (i.e. all - # the ones up to the next LeaveLoop). It will then iterate upward to - # find the largest usable conditional hoist group. - # - # It will then call itself recursively, telling its recursive instances - # to ignore the hoist group it just found by adding that group length - # to done_group_length. (It'll also chop the set of schedule indices - # considered down so that a callee cannot find a *longer* hoist group.) - # - # Upon return the hoist is wrapped around the returned code and - # build_insn_group calls itself for the remainder of schedule indices - # that were not in the hoist group. - - if not sched_index_info_entries: - return [] - - origin_si_entry = sched_index_info_entries[0] - current_iname_set = origin_si_entry.admissible_cond_inames - current_pred_set = (origin_si_entry.required_predicates - - codegen_state.implemented_predicates) - - # {{{ grow schedule item group - - # Keep growing schedule item group as long as group fulfills minimum - # size requirement. - - bounds_check_cache = BoundsCheckCache( - kernel, codegen_state.implemented_domain) - - found_hoists = [] - - candidate_group_length = 1 - while candidate_group_length <= len(sched_index_info_entries): - if candidate_group_length in done_group_lengths: - candidate_group_length += 1 - continue - - current_iname_set = ( - current_iname_set - & sched_index_info_entries[candidate_group_length-1] - .admissible_cond_inames) - current_pred_set = ( - current_pred_set - & sched_index_info_entries[candidate_group_length-1] - .required_predicates) - - current_pred_set = frozenset( - pred for pred in current_pred_set - if get_dependencies(pred) & kernel.all_inames() - <= current_iname_set) - - # {{{ see which inames are actually used in group - - # And only generate conditionals for those. 
- used_inames = set() - for sched_index_info_entry in \ - sched_index_info_entries[0:candidate_group_length]: - used_inames |= sched_index_info_entry.used_inames_within - - # }}} - - only_unshared_inames = kernel._remove_inames_for_shared_hw_axes( - current_iname_set & used_inames) - - bounds_checks = bounds_check_cache(only_unshared_inames) - - if (bounds_checks # found a bounds check - or current_pred_set - or candidate_group_length == 1): - # length-1 must always be an option to reach the recursion base - # case below - found_hoists.append((candidate_group_length, - bounds_checks, current_pred_set)) - - if not bounds_checks and not current_pred_set: - # already no more checks possible, let's not waste time - # checking longer groups. - break - - candidate_group_length += 1 - - # }}} - - # pick largest such group - group_length, bounds_checks, pred_checks = max(found_hoists) - - check_set = None - for cns in bounds_checks: - cns_set = (isl.BasicSet.universe(cns.get_space()) - .add_constraint(cns)) - - if check_set is None: - check_set = cns_set - else: - check_set, cns_set = isl.align_two(check_set, cns_set) - check_set = check_set.intersect(cns_set) - - if check_set is None: - new_codegen_state = codegen_state - is_empty = False - else: - is_empty = check_set.is_empty() - new_codegen_state = codegen_state.intersect(check_set) - - if pred_checks: - new_codegen_state = new_codegen_state.copy( - implemented_predicates=new_codegen_state.implemented_predicates - | pred_checks) - - if is_empty: - result = [] - else: - if group_length == 1: - # group only contains starting schedule item - def gen_code(inner_codegen_state): - result = [] - for i in origin_si_entry.schedule_indices: - inner = generate_code_for_sched_index( - inner_codegen_state, i) - - if inner is not None: - result.append(inner) - - return result - - else: - # recurse with a bigger done_group_lengths - def gen_code(inner_codegen_state): - return build_insn_group( - 
sched_index_info_entries[0:group_length], - inner_codegen_state, - done_group_lengths=( - done_group_lengths | {group_length})) - - # gen_code returns a list - - if bounds_checks or pred_checks: - from loopy.symbolic import constraint_to_cond_expr - - prev_gen_code = gen_code - - def gen_code(inner_codegen_state): # noqa pylint:disable=function-redefined - condition_exprs = [ - constraint_to_cond_expr(cns) - for cns in bounds_checks] + [ - pred_chk for pred_chk in pred_checks] - - prev_result = prev_gen_code(inner_codegen_state) - - return [wrap_in_if( - inner_codegen_state, - condition_exprs, - merge_codegen_results(codegen_state, prev_result))] - - cannot_vectorize = False - if new_codegen_state.vectorization_info is not None: - from loopy.isl_helpers import obj_involves_variable - for cond in bounds_checks: - if obj_involves_variable( - cond, - new_codegen_state.vectorization_info.iname): - cannot_vectorize = True - break - - if cannot_vectorize: - def gen_code_wrapper(inner_codegen_state): - # gen_code returns a list, but this needs to return a - # GeneratedCode instance. 
- - return gen_code(inner_codegen_state) - - result = [new_codegen_state.unvectorize(gen_code_wrapper)] - else: - result = gen_code(new_codegen_state) - - else: - result = gen_code(new_codegen_state) - - return result + build_insn_group( - sched_index_info_entries[group_length:], codegen_state) - - # }}} - - insn_group = build_insn_group(sched_index_info_entries, codegen_state) - return merge_codegen_results( - codegen_state, - insn_group) - - # vim: foldmethod=marker diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a0d22330f..c2a303f18 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -21,12 +21,7 @@ """ -from loopy.diagnostic import warn, LoopyError -from loopy.codegen.result import merge_codegen_results import islpy as isl -from islpy import dim_type -from loopy.codegen.control import build_loop_nest -from pymbolic.mapper.stringifier import PREC_NONE # {{{ conditional-reducing slab decomposition @@ -116,372 +111,4 @@ def get_slab_decomposition(kernel, iname): # }}} -# {{{ unrolled loops - -def generate_unroll_loop(codegen_state, sched_index): - kernel = codegen_state.kernel - - iname = kernel.linearization[sched_index].iname - - bounds = kernel.get_iname_bounds(iname, constants_only=True) - - from loopy.isl_helpers import ( - static_max_of_pw_aff, static_value_of_pw_aff) - from loopy.symbolic import pw_aff_to_expr - - length_aff = static_max_of_pw_aff(bounds.size, constants_only=True) - - if not length_aff.is_cst(): - raise LoopyError( - "length of unrolled loop '%s' is not a constant, " - "cannot unroll") - - length = int(pw_aff_to_expr(length_aff)) - - try: - lower_bound_aff = static_value_of_pw_aff( - bounds.lower_bound_pw_aff.coalesce(), - constants_only=False) - except Exception as e: - raise type(e)("while finding lower bound of '%s': " % iname) - - result = [] - - for i in range(length): - idx_aff = lower_bound_aff + i - new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - 
build_loop_nest(new_codegen_state, sched_index+1)) - - return merge_codegen_results(codegen_state, result) - -# }}} - - -# {{{ vectorized loops - -def generate_vectorize_loop(codegen_state, sched_index): - kernel = codegen_state.kernel - - iname = kernel.linearization[sched_index].iname - - bounds = kernel.get_iname_bounds(iname, constants_only=True) - - from loopy.isl_helpers import ( - static_max_of_pw_aff, static_value_of_pw_aff) - from loopy.symbolic import pw_aff_to_expr - - length_aff = static_max_of_pw_aff(bounds.size, constants_only=True) - - if not length_aff.is_cst(): - warn(kernel, "vec_upper_not_const", - "upper bound for vectorized loop '%s' is not a constant, " - "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) - - length = int(pw_aff_to_expr(length_aff)) - - try: - lower_bound_aff = static_value_of_pw_aff( - bounds.lower_bound_pw_aff.coalesce(), - constants_only=False) - except Exception as e: - raise type(e)("while finding lower bound of '%s': " % iname) - - if not lower_bound_aff.plain_is_zero(): - warn(kernel, "vec_lower_not_0", - "lower bound for vectorized loop '%s' is not zero, " - "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) - - # {{{ 'implement' vectorization bounds - - domain = kernel.get_inames_domain(iname) - - from loopy.isl_helpers import make_slab - slab = make_slab(domain.get_space(), iname, - lower_bound_aff, lower_bound_aff+length) - codegen_state = codegen_state.intersect(slab) - - # }}} - - from loopy.codegen import VectorizationInfo - new_codegen_state = codegen_state.copy( - vectorization_info=VectorizationInfo( - iname=iname, - length=length, - space=length_aff.space)) - - return build_loop_nest(new_codegen_state, sched_index+1) - -# }}} - - -def intersect_kernel_with_slab(kernel, slab, iname): - from loopy.kernel.tools import DomainChanger - - domch = DomainChanger(kernel, (iname,)) - orig_domain = domch.get_original_domain() - 
orig_domain, slab = isl.align_two(slab, orig_domain) - return domch.get_kernel_with(orig_domain & slab) - - -# {{{ hw-parallel loop - -def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, - hw_inames_left=None): - kernel = codegen_state.kernel - - from loopy.kernel.data import (UniqueInameTag, HardwareConcurrentTag, - LocalInameTag, GroupInameTag, VectorizeTag, InameImplementationTag) - - from loopy.schedule import get_insn_ids_for_block_at - insn_ids_for_block = get_insn_ids_for_block_at(kernel.linearization, - schedule_index) - - if hw_inames_left is None: - all_inames_by_insns = set() - for insn_id in insn_ids_for_block: - all_inames_by_insns |= kernel.insn_inames(insn_id) - - hw_inames_left = [iname for iname in all_inames_by_insns - if kernel.iname_tags_of_type(iname, HardwareConcurrentTag) - and not kernel.iname_tags_of_type(iname, VectorizeTag)] - - if not hw_inames_left: - return next_func(codegen_state) - - global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block, codegen_state.callables_table, return_dict=True) - - hw_inames_left = hw_inames_left[:] - iname = hw_inames_left.pop() - - from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex - - tag, = kernel.iname_tags_of_type(iname, UniqueInameTag, max_num=1, min_num=1) - - if isinstance(tag, GroupInameTag): - hw_axis_expr = GroupHardwareAxisIndex(tag.axis) - elif isinstance(tag, LocalInameTag): - hw_axis_expr = LocalHardwareAxisIndex(tag.axis) - else: - raise RuntimeError("unexpected hw tag type") - - other_inames_with_same_tag = [ - other_iname for other_iname in kernel.all_inames() - if (kernel.iname_tags_of_type(other_iname, UniqueInameTag) - and other_iname != iname - and any(_tag.key == tag.key - for _tag in kernel.iname_tags_of_type( - other_iname, InameImplementationTag)))] - - # {{{ 'implement' hardware axis boundaries - - if isinstance(tag, LocalInameTag): - hw_axis_size = local_size[tag.axis] - elif isinstance(tag, GroupInameTag): - 
hw_axis_size = global_size[tag.axis] - else: - raise RuntimeError("unknown hardware parallel tag") - - result = [] - - bounds = kernel.get_iname_bounds(iname) - domain = kernel.get_inames_domain(iname) - - # It's ok to find a bound that's too "loose". The conditional - # generators will mop up after us. - from loopy.isl_helpers import static_min_of_pw_aff - lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff, - constants_only=False) - - # These bounds are 'implemented' by the hardware. Make sure - # that the downstream conditional generators realize that. - if not isinstance(hw_axis_size, int): - hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound) - - from loopy.isl_helpers import make_slab - slab = make_slab(domain.get_space(), iname, - lower_bound, lower_bound+hw_axis_size) - codegen_state = codegen_state.intersect(slab) - - from loopy.symbolic import pw_aff_to_expr - hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound) - - # }}} - - slabs = get_slab_decomposition(kernel, iname) - - if other_inames_with_same_tag and len(slabs) > 1: - raise RuntimeError("cannot do slab decomposition on inames that share " - "a tag with other inames") - - result = [] - - for slab_name, slab in slabs: - if len(slabs) > 1: - result.append( - codegen_state.ast_builder.emit_comment( - f"{slab_name} slab for '{iname}'")) - - # Have the conditional infrastructure generate the - # slabbing conditionals. 
- slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname) - new_codegen_state = (codegen_state - .copy_and_assign(iname, hw_axis_expr) - .copy(kernel=slabbed_kernel)) - - inner = set_up_hw_parallel_loops( - new_codegen_state, schedule_index, next_func, - hw_inames_left) - - result.append(inner) - - return merge_codegen_results(codegen_state, result) - -# }}} - - -# {{{ sequential loop - -def generate_sequential_loop_dim_code(codegen_state, sched_index): - kernel = codegen_state.kernel - - ecm = codegen_state.expression_to_code_mapper - loop_iname = kernel.linearization[sched_index].iname - - slabs = get_slab_decomposition(kernel, loop_iname) - - from loopy.codegen.bounds import get_usable_inames_for_conditional - - # Note: this does not include loop_iname itself! - usable_inames = get_usable_inames_for_conditional(kernel, sched_index, - codegen_state.codegen_cachemanager) - domain = kernel.get_inames_domain(loop_iname) - - result = [] - - for slab_name, slab in slabs: - cmt = f"{slab_name} slab for '{loop_iname}'" - if len(slabs) == 1: - cmt = None - - # {{{ find bounds - - aligned_domain = isl.align_spaces(domain, slab, obj_bigger_ok=True) - - dom_and_slab = aligned_domain & slab - - assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions) - dom_and_slab, assumptions_non_param = isl.align_two( - dom_and_slab, assumptions_non_param) - dom_and_slab = dom_and_slab & assumptions_non_param - - # move inames that are usable into parameters - moved_inames = [] - for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)): - if das_iname in usable_inames: - moved_inames.append(das_iname) - dt, idx = dom_and_slab.get_var_dict()[das_iname] - dom_and_slab = dom_and_slab.move_dims( - dim_type.param, dom_and_slab.dim(dim_type.param), - dt, idx, 1) - - _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - - impl_domain = isl.align_spaces( - codegen_state.implemented_domain, - dom_and_slab, - obj_bigger_ok=True - ).params() - - lbound = ( - 
kernel.cache_manager.dim_min( - dom_and_slab, loop_iname_idx) - .gist(kernel.assumptions) - .gist(impl_domain) - .coalesce()) - ubound = ( - kernel.cache_manager.dim_max( - dom_and_slab, loop_iname_idx) - .gist(kernel.assumptions) - .gist(impl_domain) - .coalesce()) - - # }}} - - # {{{ find implemented loop, build inner code - - from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr - impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound) - impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound) - - # impl_loop may be overapproximated - from loopy.isl_helpers import make_loop_bounds_from_pwaffs - impl_loop = make_loop_bounds_from_pwaffs( - dom_and_slab.space, - loop_iname, - impl_lbound, - impl_ubound) - - for moved_iname in moved_inames: - # move moved_iname to 'set' dim_type in impl_loop - dt, idx = impl_loop.get_var_dict()[moved_iname] - impl_loop = impl_loop.move_dims( - dim_type.set, impl_loop.dim(dim_type.set), - dt, idx, 1) - - new_codegen_state = ( - codegen_state - .intersect(impl_loop) - .copy(kernel=intersect_kernel_with_slab( - kernel, slab, loop_iname))) - - inner = build_loop_nest(new_codegen_state, sched_index+1) - - # }}} - - if cmt is not None: - result.append(codegen_state.ast_builder.emit_comment(cmt)) - - astb = codegen_state.ast_builder - - from loopy.symbolic import pw_aff_to_expr - - if impl_ubound.is_equal(impl_lbound): - # single-trip, generate just a variable assignment, not a loop - inner = merge_codegen_results(codegen_state, [ - astb.emit_initializer( - codegen_state, - kernel.index_dtype, loop_iname, - ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"), - is_const=True), - astb.emit_blank_line(), - inner, - ]) - result.append( - inner.with_new_ast( - codegen_state, - astb.ast_block_scope_class( - inner.current_ast(codegen_state)))) - - else: - inner_ast = inner.current_ast(codegen_state) - - from loopy.isl_helpers import simplify_pw_aff - - result.append( - inner.with_new_ast( - codegen_state, - astb.emit_sequential_loop( - 
codegen_state, loop_iname, kernel.index_dtype, - pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), - pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), - inner_ast))) - - return merge_codegen_results(codegen_state, result) - -# }}} - # vim: foldmethod=marker From 48e824c1983a304d31568ffc36f31ffbfd75cc98 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 15:15:19 -0500 Subject: [PATCH 043/109] generate C instruction --- loopy/codegen/instruction.py | 15 ++++++++------- loopy/codegen/result.py | 11 ++++++++--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 1f32492bf..435b2bfbe 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -227,10 +227,12 @@ def generate_call_code(codegen_state, insn): return result -def generate_c_instruction_code(codegen_state, insn): - kernel = codegen_state.kernel +def generate_c_instruction_code(kernel, insn, ast_builder, + hw_inames_expr, vinfo): + ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr, + vinfo) - if codegen_state.vectorization_info is not None: + if vinfo is not None: raise UnvectorizableError("C instructions cannot be vectorized") body = [] @@ -241,15 +243,14 @@ def generate_c_instruction_code(codegen_state, insn): from pymbolic.primitives import Variable for name, iname_expr in insn.iname_exprs: if (isinstance(iname_expr, Variable) - and name not in codegen_state.var_subst_map): + and name not in hw_inames_expr): # No need, the bare symbol will work continue body.append( Initializer( - POD(codegen_state.ast_builder, kernel.index_dtype, name), - codegen_state.expression_to_code_mapper( - iname_expr, prec=PREC_NONE, type_context="i"))) + POD(ast_builder, kernel.index_dtype, name), + ecm(iname_expr, prec=PREC_NONE, type_context="i"))) if body: body.append(Line()) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7519f69a1..0474fab58 100644 
--- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -245,7 +245,7 @@ def map_schedule(self, expr): CodeGenerationContext(False, {})) for child in expr.children]) - for tv in self.kernel.temporary_variables.items(): + for tv in self.kernel.temporary_variables.values(): if tv.address_space == AddressSpace.GLOBAL and ( tv.initializer is not None): # prepend the initializer atop the code. @@ -491,7 +491,8 @@ def map_barrier(self, expr, context): def map_run_instruction(self, expr, context): from loopy.kernel.instruction import (CallInstruction, Assignment, CInstruction, NoOpInstruction) - from loopy.codegen.instruction import generate_assignment_instruction_code + from loopy.codegen.instruction import (generate_assignment_instruction_code, + generate_c_instruction_code) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 @@ -507,7 +508,11 @@ def map_run_instruction(self, expr, context): (context .vectorization_info)) elif isinstance(insn, CInstruction): - raise NotImplementedError + insn_ast = generate_c_instruction_code(self.kernel, insn, + ast_builder, + context.iname_exprs, + (context + .vectorization_info)) elif isinstance(insn, NoOpInstruction): raise NotImplementedError else: From 432e901be16f6d725321b4813baec0db4eb92139 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 15:15:45 -0500 Subject: [PATCH 044/109] minor fixes: be precise about which slab increments we can't handle --- loopy/schedule/tree.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 9cc739a3b..26ef7b0c1 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -517,7 +517,7 @@ def map_schedule(self, expr): def map_function(self, expr, context): # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. 
- gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids_as_exprs( + gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( InstructionGatherer()(expr)) # FIXME: Somehow we need to get rid of allowing the hardware inames to # be slabbed. @@ -922,7 +922,8 @@ def homogenize_instruction_blocks(kernel): def insert_predicates_into_schedule(kernel): - if kernel.iname_slab_increments: + if (kernel.iname_slab_increments + and (set(kernel.iname_slab_increments.values()) != {(0, 0)})): raise NotImplementedError assert kernel.state >= KernelState.LINEARIZED From 8317b54e7a050e63eb2ad834a76c13c6e16cbb20 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 16:21:52 -0500 Subject: [PATCH 045/109] codegen: prepend function body with temporary declarations --- loopy/codegen/instruction.py | 12 +++++------- loopy/codegen/result.py | 17 ++++++++++++----- loopy/schedule/tools.py | 18 ++++++++++++++++-- loopy/target/__init__.py | 4 ++-- loopy/target/c/__init__.py | 30 +++++++++++++----------------- loopy/target/ispc.py | 6 +++--- loopy/target/pyopencl.py | 6 +++--- loopy/target/python.py | 5 ++--- 8 files changed, 56 insertions(+), 42 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 435b2bfbe..d7ddf3f0a 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -232,8 +232,7 @@ def generate_c_instruction_code(kernel, insn, ast_builder, ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr, vinfo) - if vinfo is not None: - raise UnvectorizableError("C instructions cannot be vectorized") + assert vinfo is None body = [] @@ -260,10 +259,9 @@ def generate_c_instruction_code(kernel, insn, ast_builder, return Block(body) -def generate_nop_instruction_code(codegen_state, insn): - if codegen_state.vectorization_info is not None: - raise UnvectorizableError("C instructions cannot be vectorized") - return codegen_state.ast_builder.emit_comment( - "no-op (insn=%s)" % (insn.id)) +def 
generate_nop_instruction_code(kernel, insn, ast_builder, + hw_inames_expr, vinfo): + assert vinfo is None + return ast_builder.emit_comment("no-op (insn=%s)" % (insn.id)) # vim: foldmethod=marker diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 0474fab58..eda56ec52 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -362,10 +362,12 @@ def _hw_iname_expr(iname): .device_ast_builder .get_function_declaration(self.kernel, expr.name, idis, is_generating_device_code=True)) + temp_decls_asts = self.device_ast_builder.get_temporary_decls(self.kernel, expr.name) children_res = self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) - dev_fn_body_ast = self.device_ast_builder.ast_block_class(children_res - .device_ast) + dev_fn_body_ast = self.device_ast_builder.ast_block_class(temp_decls_asts + + (children_res + .device_ast)) assert children_res.host_ast == [] dev_fn_ast = (self @@ -492,7 +494,8 @@ def map_run_instruction(self, expr, context): from loopy.kernel.instruction import (CallInstruction, Assignment, CInstruction, NoOpInstruction) from loopy.codegen.instruction import (generate_assignment_instruction_code, - generate_c_instruction_code) + generate_c_instruction_code, + generate_nop_instruction_code) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 @@ -514,9 +517,13 @@ def map_run_instruction(self, expr, context): (context .vectorization_info)) elif isinstance(insn, NoOpInstruction): - raise NotImplementedError + insn_ast = generate_nop_instruction_code(self.kernel, insn, + ast_builder, + context.iname_exprs, + (context + .vectorization_info)) else: - raise NotImplementedError + raise NotImplementedError(type(insn)) if context.in_device: return CodeGenMapperAccumulator(host_ast=[], diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 70120dee9..719d9337b 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -55,7 +55,14 @@ def 
get_block_boundaries(schedule): def temporaries_read_in_subkernel(kernel, subkernel): from loopy.kernel.tools import get_subkernel_to_insn_id_map - insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + from loopy.schedule.tree import Schedule, get_insns_in_function + + if isinstance(kernel.schedule, Schedule): + insn_ids = get_insns_in_function(kernel, subkernel) + else: + assert isinstance(kernel.schedule, list) + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv for insn_id in insn_ids for tv in kernel.id_to_insn[insn_id].read_dependency_names() @@ -64,7 +71,14 @@ def temporaries_read_in_subkernel(kernel, subkernel): def temporaries_written_in_subkernel(kernel, subkernel): from loopy.kernel.tools import get_subkernel_to_insn_id_map - insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + from loopy.schedule.tree import Schedule, get_insns_in_function + + if isinstance(kernel.schedule, Schedule): + insn_ids = get_insns_in_function(kernel, subkernel) + else: + assert isinstance(kernel.schedule, list) + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv for insn_id in insn_ids for tv in kernel.id_to_insn[insn_id].write_dependency_names() diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index b650554d2..ad7122d8a 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -199,7 +199,7 @@ def get_function_declaration(self, kernel, name, implemented_data_info, def generate_top_of_body(self, codegen_state): return [] - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, kernel, subkernel_name): raise NotImplementedError def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): @@ -305,7 +305,7 @@ def get_function_declaration(self, kernel, name, implemented_data_info, is_generating_device_code): return None - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, kernel, 
subkernel_name): return [] def get_expression_to_code_mapper(self, kernel, var_subst_map, diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 1c7c0fdb8..c275d6b48 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -333,17 +333,15 @@ def generate_linearized_array(array, value): return data -def generate_array_literal(codegen_state, array, value): +def generate_array_literal(kernel, ecm, ast_builder, array, value): data = generate_linearized_array(array, value) - ecm = codegen_state.expression_to_code_mapper - from loopy.expression import dtype_to_type_context from loopy.symbolic import ArrayLiteral - type_context = dtype_to_type_context(codegen_state.kernel.target, array.dtype) + type_context = dtype_to_type_context(kernel.target, array.dtype) return CExpression( - codegen_state.ast_builder.get_c_expression_to_code_mapper(), + ast_builder.get_c_expression_to_code_mapper(), ArrayLiteral( tuple( ecm.map_constant(d_i, type_context) @@ -770,11 +768,11 @@ def get_function_declaration(self, kernel, callables_table, name, def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): return None - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, kernel, subkernel_name): from loopy.kernel.data import AddressSpace - kernel = codegen_state.kernel - + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) base_storage_decls = [] temp_decls = [] @@ -790,10 +788,9 @@ def get_temporary_decls(self, codegen_state, schedule_index): from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - subkernel = kernel.linearization[schedule_index].kernel_name sub_knl_temps = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(kernel, subkernel_name) + | temporaries_written_in_subkernel(kernel, subkernel_name)) for tv in 
sorted( kernel.temporary_variables.values(), @@ -807,13 +804,13 @@ def get_temporary_decls(self, codegen_state, schedule_index): tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( - codegen_state, schedule_index, tv, idi), + kernel, tv, idi), tv.address_space) if tv.initializer is not None: assert tv.read_only decl = Initializer(decl, generate_array_literal( - codegen_state, tv, tv.initializer)) + kernel, ecm, self, tv, tv.initializer)) temp_decls.append(decl) @@ -875,8 +872,6 @@ def get_temporary_decls(self, codegen_state, schedule_index): idi.dtype.itemsize * product(si for si in idi.shape)) - ecm = self.get_expression_to_code_mapper(codegen_state) - for bs_name, bs_sizes in sorted(base_storage_sizes.items()): bs_var_decl = Value("char", bs_name) from pytools import single_valued @@ -957,7 +952,7 @@ def get_c_expression_to_code_mapper(self): from loopy.target.c.codegen.expression import CExpressionToCodeMapper return CExpressionToCodeMapper() - def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): + def get_temporary_decl(self, kernel, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if temp_var.read_only: @@ -966,7 +961,8 @@ def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info) if decl_info.shape: from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(codegen_state) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) temp_var_decl = ArrayOf(temp_var_decl, ecm(p.flattened_product(decl_info.shape), prec=PREC_NONE, type_context="i")) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 2dbd9f70b..99ad2ba2f 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -306,7 +306,7 @@ def emit_barrier(self, synchronization_kind, mem_kind, comment): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): + def 
get_temporary_decl(self, kernel, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -316,12 +316,12 @@ def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) - _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() + _, lsize = kernel.get_grid_size_upper_bounds_as_exprs() shape = lsize + shape if shape: from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(codegen_state) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) temp_var_decl = ArrayOf( temp_var_decl, ecm(p.flattened_product(shape), diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 60efd4ff0..c73f10b54 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -708,7 +708,7 @@ def _get_global_temporaries(self, kernel): if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, kernel, subkernel_name): from genpy import Assign, Comment, Line from collections import defaultdict from numbers import Number @@ -720,9 +720,9 @@ def alloc_nbytes(tv): return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(codegen_state) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) - global_temporaries = self._get_global_temporaries(codegen_state) + global_temporaries = self._get_global_temporaries(kernel) if not global_temporaries: return [] diff --git a/loopy/target/python.py b/loopy/target/python.py index 37aab18ad..9cf3a614a 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -180,9 +180,8 @@ def get_function_definition(self, kernel, name, 
implemented_data_info, [idi.name for idi in implemented_data_info], function_body) - def get_temporary_decls(self, codegen_state, schedule_index): - kernel = codegen_state.kernel - ecm = codegen_state.expression_to_code_mapper + def get_temporary_decls(self, kernel, subkernel_name): + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) result = [] From 6e3b114f9244afac3afb61253f3d01d0c4e0735c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 16:53:54 -0500 Subject: [PATCH 046/109] provides implemented_data_info in CodeGenerationResult --- loopy/codegen/result.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index eda56ec52..83d8ca629 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -89,10 +89,11 @@ class CodeGenerationResult(ImmutableRecord): a list of :class:`loopy.codegen.ImplementedDataInfo` objects. Only added at the very end of code generation. """ - def __init__(self, host_program, device_programs, host_preambles=[], - device_preambles=[]): + def __init__(self, host_program, device_programs, implemented_data_info, + host_preambles=[], device_preambles=[]): super().__init__(host_program=host_program, device_programs=device_programs, + implemented_data_info=implemented_data_info, host_preambles=host_preambles, device_preambles=device_preambles) @@ -295,7 +296,7 @@ def map_schedule(self, expr): host_prog = GeneratedProgram(name=host_fn_name, is_device_program=False, ast=host_fn_ast) - return CodeGenerationResult(host_prog, children_res.device_ast) + return CodeGenerationResult(host_prog, children_res.device_ast, idis) def map_function(self, expr, context): from loopy.codegen.control import synthesize_idis_for_extra_args From 7abcae1dd69ec9da6dc3c4004ee7f4186c79a289 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 17:10:41 -0500 Subject: [PATCH 047/109] codegen: handle call instruction --- 
loopy/codegen/instruction.py | 11 +++++------ loopy/codegen/result.py | 9 +++++++-- loopy/target/__init__.py | 3 ++- loopy/target/c/__init__.py | 22 ++++++++++++++-------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index d7ddf3f0a..8921059a5 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -203,20 +203,19 @@ def generate_assignment_instruction_code(kernel, insn, ast_builder, return result -def generate_call_code(codegen_state, insn): - kernel = codegen_state.kernel +def generate_call_code(kernel, insn, ast_builder, + hw_inames_expr, vinfo): + result = ast_builder.emit_multiple_assignment(kernel, insn, hw_inames_expr, + vinfo) # {{{ vectorization handling - if codegen_state.vectorization_info: + if vinfo: if insn.atomicity: raise UnvectorizableError("atomic operation") # }}} - result = codegen_state.ast_builder.emit_multiple_assignment( - codegen_state, insn) - # {{{ tracing if kernel.options.trace_assignments or kernel.options.trace_assignment_values: diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 83d8ca629..69245d150 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -496,14 +496,19 @@ def map_run_instruction(self, expr, context): CInstruction, NoOpInstruction) from loopy.codegen.instruction import (generate_assignment_instruction_code, generate_c_instruction_code, - generate_nop_instruction_code) + generate_nop_instruction_code, + generate_call_code) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 insn = self.kernel.id_to_insn[expr.insn_id] if isinstance(insn, CallInstruction): - raise NotImplementedError + insn_ast = generate_call_code(self.kernel, insn, + ast_builder, + context.iname_exprs, + (context + .vectorization_info)) elif isinstance(insn, Assignment): insn_ast = generate_assignment_instruction_code(self.kernel, insn, ast_builder, diff --git 
a/loopy/target/__init__.py b/loopy/target/__init__.py index ad7122d8a..fdb7b214f 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -247,7 +247,8 @@ def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): raise NotImplementedError() - def emit_multiple_assignment(self, codegen_state, insn): + def emit_multiple_assignment(self, kernel, insn, var_subst_map, + vectorization_info): raise NotImplementedError() def emit_sequential_loop(self, kernel, iname, iname_dtype, diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c275d6b48..acdb962fa 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1080,8 +1080,11 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) - def emit_tuple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper + def emit_tuple_assignment(self, kernel, callables_table, insn, + var_subst_map, vectorization_info): + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, + vectorization_info) from cgen import Assign, block_if_necessary assignments = [] @@ -1090,12 +1093,12 @@ def emit_tuple_assignment(self, codegen_state, insn): zip(insn.assignees, insn.expression.parameters)): lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None) assignee_var_name = insn.assignee_var_names()[i] - lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_var = kernel.get_var_descriptor(assignee_var_name) lhs_dtype = lhs_var.dtype from loopy.expression import dtype_to_type_context rhs_type_context = dtype_to_type_context( - codegen_state.kernel.target, lhs_dtype) + kernel.target, lhs_dtype) rhs_code = ecm(parameter, prec=PREC_NONE, type_context=rhs_type_context, needed_dtype=lhs_dtype) @@ -1103,15 +1106,18 
@@ def emit_tuple_assignment(self, codegen_state, insn): return block_if_necessary(assignments) - def emit_multiple_assignment(self, codegen_state, insn): + def emit_multiple_assignment(self, kernel, callables_table, insn, + var_subst_map, vectorization_info): + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) - ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.callables_table[func_id] + in_knl_callable = callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( in_knl_callable.name_in_target == "loopy_make_tuple"): - return self.emit_tuple_assignment(codegen_state, insn) + return self.emit_tuple_assignment(kernel, callables_table, insn, + var_subst_map, vectorization_info) # takes "is_returned" to infer whether insn.assignees[0] is a part of # LHS. From 5cd3b77a02311474da1857cea62d9bfb43200810 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 17:10:53 -0500 Subject: [PATCH 048/109] gets rid of unused code --- loopy/codegen/instruction.py | 76 ------------------------------------ 1 file changed, 76 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 8921059a5..7edaf008b 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -24,85 +24,9 @@ """ -import islpy as isl -dim_type = isl.dim_type from loopy.codegen import UnvectorizableError -from loopy.codegen.result import CodeGenerationResult from loopy.diagnostic import LoopyError from pymbolic.mapper.stringifier import PREC_NONE -from pytools import memoize_on_first_arg - - -@memoize_on_first_arg -def _get_new_implemented_domain(kernel, chk_domain, implemented_domain): - - chk_domain, implemented_domain = isl.align_two( - chk_domain, implemented_domain) - chk_domain = chk_domain.gist(implemented_domain) - - new_implemented_domain = implemented_domain & chk_domain - return chk_domain, new_implemented_domain - 
- -def to_codegen_result( - codegen_state, insn_id, domain, check_inames, required_preds, ast): - chk_domain = isl.Set.from_basic_set(domain) - chk_domain = chk_domain.remove_redundancies() - chk_domain = codegen_state.kernel.cache_manager.eliminate_except(chk_domain, - check_inames, (dim_type.set,)) - - chk_domain, new_implemented_domain = _get_new_implemented_domain( - codegen_state.kernel, chk_domain, codegen_state.implemented_domain) - - if chk_domain.is_empty(): - return None - - condition_exprs = [] - if not chk_domain.plain_is_universe(): - from loopy.symbolic import set_to_cond_expr - condition_exprs.append(set_to_cond_expr(chk_domain)) - - condition_exprs.extend( - required_preds - codegen_state.implemented_predicates) - - if condition_exprs: - from pymbolic.primitives import LogicalAnd - from pymbolic.mapper.stringifier import PREC_NONE - ast = codegen_state.ast_builder.emit_if( - codegen_state.expression_to_code_mapper( - LogicalAnd(tuple(condition_exprs)), PREC_NONE), - ast) - - return CodeGenerationResult.new( - codegen_state, insn_id, ast, new_implemented_domain) - - -def generate_instruction_code(codegen_state, insn): - kernel = codegen_state.kernel - - from loopy.kernel.instruction import ( - Assignment, CallInstruction, CInstruction, NoOpInstruction - ) - - if isinstance(insn, Assignment): - ast = generate_assignment_instruction_code(codegen_state, insn) - elif isinstance(insn, CallInstruction): - ast = generate_call_code(codegen_state, insn) - elif isinstance(insn, CInstruction): - ast = generate_c_instruction_code(codegen_state, insn) - elif isinstance(insn, NoOpInstruction): - ast = generate_nop_instruction_code(codegen_state, insn) - else: - raise RuntimeError("unexpected instruction type") - - insn_inames = insn.within_inames - - return to_codegen_result( - codegen_state, - insn.id, - kernel.get_inames_domain(insn_inames), insn_inames, - insn.predicates, - ast) def generate_assignment_instruction_code(kernel, insn, ast_builder, From 
c415319c2c222b36972eedee731ccde242480af9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 18:52:16 -0500 Subject: [PATCH 049/109] predicate insertion mapper: always account for hw inames domain before gisting --- loopy/schedule/tree.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 26ef7b0c1..5b3fc589e 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -578,9 +578,15 @@ def map_polyhedral_loop(self, expr, context): assert expr.domain.dim(dim_type.set) == 1 assert context.implemented_domain.dim(dim_type.set) == 0 - domain = _align_and_gist(expr.domain, context.implemented_domain) - downstream_domain = _align_and_intersect(domain, - context.implemented_domain) + implemented_domain = _implement_hw_axes_in_domains(context + .implemented_domain, + expr.domain, + self.kernel, + context.gsize, + context.lsize) + + domain = _align_and_gist(expr.domain, implemented_domain) + downstream_domain = _align_and_intersect(domain, implemented_domain) downstream_domain = downstream_domain.move_dims(dim_type.param, (downstream_domain .dim(dim_type.param)), From 597da9176a536ffb8be886230707d0ed6eabeea6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 20:39:20 -0500 Subject: [PATCH 050/109] use genpy.Collection --- loopy/target/python.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/loopy/target/python.py b/loopy/target/python.py index 9cf3a614a..5dd37d1aa 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -232,9 +232,7 @@ def ast_if_class(self): @property def ast_block_scope_class(self): - # Once a new version of genpy is released, switch to this: - # from genpy import Collection - # and delete the implementation above. 
+ from genpy import Collection return Collection def emit_sequential_loop(self, kernel, iname, iname_dtype, From 2ce0a356ef1553a4e8227f1f2c9ac089ca9326b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 20:40:02 -0500 Subject: [PATCH 051/109] introduce ASTBuilderBase.emit_collection --- loopy/target/__init__.py | 6 ++++++ loopy/target/c/__init__.py | 7 +++++++ loopy/target/python.py | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index fdb7b214f..e56b9f3f3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -274,6 +274,12 @@ def emit_blank_line(self): def emit_comment(self, s): raise NotImplementedError() + def emit_collection(self, asts): + """ + :arg asts: A sequence of AST objects. + """ + raise NotImplementedError + # }}} def process_ast(self, node): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index acdb962fa..2c0102611 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1179,6 +1179,13 @@ def emit_comment(self, s): from cgen import Comment return Comment(s) + def emit_collection(self, asts): + """ + :arg asts: A sequence of AST objects. + """ + from cgen import Collection + return Collection(asts) + @property def can_implement_conditionals(self): return True diff --git a/loopy/target/python.py b/loopy/target/python.py index 5dd37d1aa..8b5c350b6 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -263,6 +263,13 @@ def emit_comment(self, s): from genpy import Comment return Comment(s) + def emit_collection(self, asts): + """ + :arg asts: A sequence of AST objects. 
+ """ + from genpy import Collection + return Collection(asts) + @property def can_implement_conditionals(self): return True From 348bc25ed0b8a2fcddee226b05bcd4e67799045c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 20:40:37 -0500 Subject: [PATCH 052/109] Introduce ASTBuilder.emit_array_literal --- loopy/target/__init__.py | 8 +++++++- loopy/target/c/__init__.py | 35 ++++++++++++++++++++++++----------- loopy/target/python.py | 4 ++-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e56b9f3f3..8ab635ad8 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -244,6 +244,12 @@ def get_global_arg_decl(self, name, shape, dtype, is_written): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() + def emit_array_literal(self, kernel, array, value): + """ + :arg ary: An instance of :class:`loopy.kernel.array.ArrayBase`. + """ + raise NotImplementedError + def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): raise NotImplementedError() @@ -262,7 +268,7 @@ def can_implement_conditionals(self): def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): raise NotImplementedError() - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, decl, val): raise NotImplementedError() def emit_declaration_scope(self, codegen_state, inner): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 2c0102611..fc8c7375a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -765,6 +765,25 @@ def get_function_declaration(self, kernel, callables_table, name, [self.idi_to_cgen_declarator(kernel, idi) for idi in implemented_data_info])) + def emit_array_literal(self, kernel, array, value): + """ + :arg ary: An instance of :class:`loopy.kernel.array.ArrayBase`. 
+ """ + data = generate_linearized_array(array, value) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) + + from loopy.expression import dtype_to_type_context + from loopy.symbolic import ArrayLiteral + + type_context = dtype_to_type_context(kernel.target, array.dtype) + return CExpression( + self.get_c_expression_to_code_mapper(), + ArrayLiteral( + tuple( + ecm.map_constant(d_i, type_context) + for d_i in data))) + def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): return None @@ -809,8 +828,8 @@ def get_temporary_decls(self, kernel, subkernel_name): if tv.initializer is not None: assert tv.read_only - decl = Initializer(decl, generate_array_literal( - kernel, ecm, self, tv, tv.initializer)) + decl = Initializer(decl, self.emit_array_literal( + kernel, ecm, tv, tv.initializer)) temp_decls.append(decl) @@ -1161,15 +1180,9 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, "++%s" % iname, inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): - decl = POD(self, dtype, name) - - from cgen import Initializer, Const - - if is_const: - decl = Const(decl) - - return Initializer(decl, val_str) + def emit_initializer(self, decl, val): + from cgen import Initializer + return Initializer(decl, val) def emit_blank_line(self): from cgen import Line diff --git a/loopy/target/python.py b/loopy/target/python.py index 8b5c350b6..6ffa42aec 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -251,9 +251,9 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, ), inner) - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): + def emit_initializer(self, decl, val): from genpy import Assign - return Assign(name, val_str) + return Assign(decl, val) def emit_blank_line(self): from genpy import Line From 733ab4852d3f3280ef3be54598dcce75290a9f10 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 24 May 2021 
20:41:07 -0500 Subject: [PATCH 053/109] emit temporary declarations/initializers --- loopy/codegen/result.py | 93 +++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 69245d150..63ca77666 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -69,6 +69,17 @@ class GeneratedProgram(ImmutableRecord): """ +def prepend_code(prog, code, astb): + """ + Prepends *code* to :attr:`GeneratedProgram.ast`. + + :arg astb: An instance of :class:`loopy.target.ASTBuilderBase`. + :arg ast: AST within *astb*. + """ + new_ast = astb.emit_collection(code + [prog.ast]) + return prog.copy(ast=new_ast) + + class CodeGenerationResult(ImmutableRecord): """ .. attribute:: host_program @@ -242,43 +253,50 @@ def _is_a_list_of_ast_nodes(astb, ast): def map_schedule(self, expr): from loopy.kernel.data import AddressSpace - children_res = self.combine([self.rec(child, - CodeGenerationContext(False, {})) - for child in expr.children]) - - for tv in self.kernel.temporary_variables.values(): - if tv.address_space == AddressSpace.GLOBAL and ( - tv.initializer is not None): - # prepend the initializer atop the code. 
- raise NotImplementedError - - """ - for tv in sorted( - kernel.temporary_variables.values(), - key=lambda tv: tv.name): - - if tv.address_space == AddressSpace.GLOBAL and ( - tv.initializer is not None): - assert tv.read_only - - decl_info, = tv.decl_info(self.target, - index_dtype=kernel.index_dtype) - decl = self.wrap_global_constant( - self.get_temporary_decl( - codegen_state, schedule_index, tv, - decl_info)) - - if tv.initializer is not None: - decl = Initializer(decl, generate_array_literal( - codegen_state, tv, tv.initializer)) - - result.append(decl) - """ + downstream_asts = self.combine([self.rec(child, + CodeGenerationContext(False, {})) + for child in expr.children]) assert all(isinstance(el, GeneratedProgram) - for el in children_res.device_ast) + for el in downstream_asts.device_ast) + device_programs = downstream_asts.device_ast + host_ast = downstream_asts.host_ast + + # {{{ emit global temporary declarations/initializations + + tv_init_host_asts = self.host_ast_builder.get_temporary_decls( + self.kernel, subkernel_name=None) + + tv_init_device_asts = [] + for tv in sorted((tv for tv in self.kernel.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL), + key=lambda tv: tv.name): + if tv.initializer is not None: + assert tv.read_only + + decl_info, = tv.decl_info(self.kernel.target, + index_dtype=self.kernel.index_dtype) + decl = self.device_ast_builder.wrap_global_constant( + self.device_ast_builder.get_temporary_decl(self.kernel, tv, + decl_info)) + rhs = self.device_ast_builder.emit_array_literal(self.kernel, + tv, + tv.initializer) + init_ast = self.device_ast_builder.emit_initializer(decl, rhs) + + tv_init_device_asts.append(init_ast) + + if tv_init_device_asts: + tv_init_device_asts.append(self.device_ast_builder.emit_blank_line()) + + # prepend the tv inits to the first device program + device_programs = ([prepend_code(device_programs[0], tv_init_device_asts, + self.device_ast_builder)] + + device_programs[1:]) + + # }}} 
- host_fn_body_ast = self.host_ast_builder.ast_block_class(children_res - .host_ast) + host_fn_body_ast = self.host_ast_builder.ast_block_class(host_ast + + tv_init_host_asts) idis = get_idis_for_kernel(self.kernel) host_fn_name = (self.kernel.target.host_program_name_prefix @@ -296,7 +314,7 @@ def map_schedule(self, expr): host_prog = GeneratedProgram(name=host_fn_name, is_device_program=False, ast=host_fn_ast) - return CodeGenerationResult(host_prog, children_res.device_ast, idis) + return CodeGenerationResult(host_prog, device_programs, idis) def map_function(self, expr, context): from loopy.codegen.control import synthesize_idis_for_extra_args @@ -363,7 +381,8 @@ def _hw_iname_expr(iname): .device_ast_builder .get_function_declaration(self.kernel, expr.name, idis, is_generating_device_code=True)) - temp_decls_asts = self.device_ast_builder.get_temporary_decls(self.kernel, expr.name) + temp_decls_asts = self.device_ast_builder.get_temporary_decls(self.kernel, + expr.name) children_res = self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) dev_fn_body_ast = self.device_ast_builder.ast_block_class(temp_decls_asts From f9835b14f877f25f86ef266efbf24e01b8ff2b53 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 05:32:16 -0500 Subject: [PATCH 054/109] prints barrier correctly --- loopy/schedule/tree.py | 2 +- loopy/target/pyopencl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 5b3fc589e..1e056a041 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -389,7 +389,7 @@ def map_run_instruction(self, expr, level=0): f"{format_insn(self.kernel, expr.insn_id)}") def map_barrier(self, expr, level=0): - return (f"{self._indent(level)}... {expr.kind[0]}barrier") + return (f"{self._indent(level)}... 
{expr.synchronization_kind[0]}barrier") def map_loop(self, expr, level=0): return self.combine([f"{self._indent(level)}for {expr.iname}", diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c73f10b54..b9a433373 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -720,7 +720,8 @@ def alloc_nbytes(tv): return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) global_temporaries = self._get_global_temporaries(kernel) if not global_temporaries: From 8a3de5f75bd17b4bfddb2c908ceddc1b641aefbd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 05:56:27 -0500 Subject: [PATCH 055/109] remove divs before finding the projection --- loopy/schedule/tree.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 1e056a041..1391b61dc 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -830,8 +830,12 @@ def map_polyhedral_loop(self, expr, context): constants_only=False) set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, lb, ub+1) - outer_condition = _align_and_gist(expr.domain.project_out(dim_type.set, - 0, 1), + # Removing divs because all we want is a projection, with no remaining + # constraints from the eliminated variables. 
+ outer_condition = _align_and_gist(expr.domain + .project_out(dim_type.set, + 0, 1) + .remove_divs(), set_implemented_in_loop) inner_condition = _align_and_gist(expr.domain.affine_hull(), set_implemented_in_loop) From 86eef2998b1bfbe47df2d53cdcee4ee5f9478747 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 05:57:07 -0500 Subject: [PATCH 056/109] evaluate via var_subst_map before finding the access_info --- loopy/target/c/codegen/expression.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 81f3d2a96..adc9243ba 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -218,13 +218,16 @@ def make_var(name): ary = self.find_array(expr) from loopy.kernel.array import get_access_info + from pymbolic import evaluate from loopy.symbolic import simplify_using_aff index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) access_info = get_access_info(self.kernel.target, ary, index_tuple, - expr, self.vectorization_info) + lambda expr: evaluate(expr, + self.var_subst_map), + self.vectorization_info) from loopy.kernel.data import ( ImageArg, ArrayArg, TemporaryVariable, ConstantArg) From 86e072d131a36c2386780da006c8c92fee9f32ef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 10:07:19 -0500 Subject: [PATCH 057/109] codegen: handle emit_atomic_update, emit_top_of_body --- loopy/codegen/result.py | 11 ++++++++--- loopy/target/__init__.py | 5 +---- loopy/target/c/__init__.py | 8 ++++---- loopy/target/ispc.py | 4 ++-- loopy/target/opencl.py | 28 ++++++++++++++++++---------- 5 files changed, 33 insertions(+), 23 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 63ca77666..38199fd53 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -260,6 +260,7 @@ def map_schedule(self, expr): for el in downstream_asts.device_ast) 
device_programs = downstream_asts.device_ast host_ast = downstream_asts.host_ast + host_tgt_prelude = self.host_ast_builder.generate_top_of_body(self.kernel) # {{{ emit global temporary declarations/initializations @@ -295,8 +296,9 @@ def map_schedule(self, expr): # }}} - host_fn_body_ast = self.host_ast_builder.ast_block_class(host_ast - + tv_init_host_asts) + host_fn_body_ast = self.host_ast_builder.ast_block_class(host_tgt_prelude + + tv_init_host_asts + + host_ast) idis = get_idis_for_kernel(self.kernel) host_fn_name = (self.kernel.target.host_program_name_prefix @@ -381,11 +383,14 @@ def _hw_iname_expr(iname): .device_ast_builder .get_function_declaration(self.kernel, expr.name, idis, is_generating_device_code=True)) + + tgt_prelude = self.device_ast_builder.generate_top_of_body(self.kernel) temp_decls_asts = self.device_ast_builder.get_temporary_decls(self.kernel, expr.name) children_res = self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) - dev_fn_body_ast = self.device_ast_builder.ast_block_class(temp_decls_asts + dev_fn_body_ast = self.device_ast_builder.ast_block_class(tgt_prelude + + temp_decls_asts + (children_res .device_ast)) assert children_res.host_ast == [] diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 8ab635ad8..3b55f134f 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -196,7 +196,7 @@ def get_function_declaration(self, kernel, name, implemented_data_info, is_generating_device_code): raise NotImplementedError - def generate_top_of_body(self, codegen_state): + def generate_top_of_body(self, kernel): return [] def get_temporary_decls(self, kernel, subkernel_name): @@ -271,9 +271,6 @@ def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): def emit_initializer(self, decl, val): raise NotImplementedError() - def emit_declaration_scope(self, codegen_state, inner): - raise NotImplementedError() - def emit_blank_line(self): raise NotImplementedError() diff --git 
a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index fc8c7375a..fd475c9c0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1080,14 +1080,14 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): elif isinstance(lhs_atomicity, AtomicInit): self.seen_atomic_dtypes.add(lhs_dtype) return self.emit_atomic_init( - codegen_state, lhs_atomicity, lhs_var, + kernel, var_subst_map, lhs_atomicity, lhs_var, insn.assignee, insn.expression, lhs_dtype, rhs_type_context) elif isinstance(lhs_atomicity, AtomicUpdate): self.seen_atomic_dtypes.add(lhs_dtype) return self.emit_atomic_update( - codegen_state, lhs_atomicity, lhs_var, + kernel, var_subst_map, lhs_atomicity, lhs_var, insn.assignee, insn.expression, lhs_dtype, rhs_type_context) @@ -1095,8 +1095,8 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): raise ValueError("unexpected lhs atomicity type: %s" % type(lhs_atomicity).__name__) - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype): + def emit_atomic_update(self, kernel, var_subst_map, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) def emit_tuple_assignment(self, kernel, callables_table, insn, diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 99ad2ba2f..e569f26f4 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -111,8 +111,8 @@ def map_subscript(self, expr, type_context): from pymbolic import evaluate access_info = get_access_info(self.kernel.target, ary, expr.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) + lambda expr: evaluate(expr, self.var_subst_map), + self.vectorization_info) subscript, = access_info.subscripts result = var(access_info.array_name)[ diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 7c7191f79..1f6d67294 100644 --- 
a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -27,7 +27,7 @@ from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from pytools import memoize_method +from pytools import memoize_method, UniqueNameGenerator from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper @@ -575,6 +575,10 @@ def vector_dtype(self, base, count): # {{{ ast builder class OpenCLCASTBuilder(CFamilyASTBuilder): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.var_name_generator = UniqueNameGenerator() + # {{{ library @property @@ -633,9 +637,9 @@ def get_function_declaration(self, kernel, callables_table, name, return FunctionDeclarationWrapper(fdecl) - def generate_top_of_body(self, codegen_state): + def generate_top_of_body(self, kernel): from loopy.kernel.data import ImageArg - if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args): + if any(isinstance(arg, ImageArg) for arg in kernel.args): from cgen import Value, Const, Initializer return [ Initializer(Const(Value("sampler_t", "loopy_sampler")), @@ -739,22 +743,26 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): # {{{ - def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var, + def emit_atomic_init(self, kernel, var_subst_map, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): # for the CL1 flavor, this is as simple as a regular update with whatever # the RHS value is... 
- return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, + return self.emit_atomic_update(kernel, var_subst_map, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) # }}} # {{{ code generation for atomic update - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + def emit_atomic_update(self, kernel, var_subst_map, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): from pymbolic.mapper.stringifier import PREC_NONE + ecm = self.get_expression_to_code_mapper(kernel, + var_subst_map=var_subst_map, + vectorization_info=None) + # FIXME: Could detect operations, generate atomic_{add,...} when # appropriate. @@ -762,11 +770,11 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, np.int32, np.int64, np.float32, np.float64]: from cgen import Block, DoWhile, Assign from loopy.target.c import POD - old_val_var = codegen_state.var_name_generator("loopy_old_val") - new_val_var = codegen_state.var_name_generator("loopy_new_val") + old_val_var = self.var_name_generator("_lp_old_val") + new_val_var = self.var_name_generator("_lp_new_val") from loopy.kernel.data import TemporaryVariable, AddressSpace - ecm = codegen_state.expression_to_code_mapper.with_assignments( + ecm = ecm.with_assignments( { old_val_var: TemporaryVariable(old_val_var, lhs_dtype, shape=()), @@ -872,7 +880,7 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, class VolatileMemExpressionToOpenCLCExpressionMapper( ExpressionToOpenCLCExpressionMapper): def make_subscript(self, array, base_expr, subscript): - registry = self.codegen_state.ast_builder.target.get_dtype_registry() + registry = self.ast_builder.target.get_dtype_registry() from loopy.kernel.data import AddressSpace if array.address_space == AddressSpace.GLOBAL: From a2086ab63cea24053f34bdfd3c38b2d02df51de1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 10:09:54 -0500 Subject: [PATCH 058/109] be careful about 
intersection / projection --- loopy/schedule/tree.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 1391b61dc..d4ee10af9 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -556,7 +556,7 @@ def map_loop(self, expr, context): assert domain.dim(dim_type.set) == 1 - domain = _align_and_gist(domain, implemented_domain) + domain = _align_and_intersect(domain, implemented_domain) downstream_domain = _align_and_intersect(domain .move_dims(dim_type.param, @@ -585,7 +585,7 @@ def map_polyhedral_loop(self, expr, context): context.gsize, context.lsize) - domain = _align_and_gist(expr.domain, implemented_domain) + domain = _align_and_intersect(expr.domain, implemented_domain) downstream_domain = _align_and_intersect(domain, implemented_domain) downstream_domain = downstream_domain.move_dims(dim_type.param, (downstream_domain @@ -727,7 +727,7 @@ def map_polyhedral_loop(self, expr, context): if (self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, UnrollTag)) or expr.iname in self.extra_unroll_inames): - domain = _align_and_gist(expr.domain, context.implemented_domain) + domain = _align_and_intersect(expr.domain, context.implemented_domain) ubound = static_max_of_pw_aff(domain.dim_max(0), constants_only=False) lbound = static_min_of_pw_aff(domain.dim_min(0), constants_only=False) # FIXME: Write a better error message o'er here that the loop @@ -786,11 +786,10 @@ def map_instruction_block(self, expr, context): hw_inames = inames & get_all_inames_tagged_with(self.kernel, AxisTag) if hw_inames: - impl_domain = context.implemented_domain domain = (self.kernel.get_inames_domain(hw_inames) - .project_out_except(types=[dim_type.set], - names=hw_inames)) + .project_out_except(types=[dim_type.set], + names=hw_inames)) impl_domain = _implement_hw_axes_in_domains(impl_domain, domain, self.kernel, @@ -822,22 +821,23 @@ def map_polyhedral_loop(self, expr, 
context): static_max_of_pw_aff, make_slab) assert expr.domain.dim(dim_type.set) == 1 - lb = static_min_of_pw_aff(expr.domain.dim_min(0).gist(context - .implemented_domain), - constants_only=False) - ub = static_max_of_pw_aff(expr.domain.dim_max(0).gist(context - .implemented_domain), - constants_only=False) - set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, lb, ub+1) + domain = _align_and_intersect(expr.domain, context.implemented_domain) + lb = domain.dim_min(0) + ub = domain.dim_max(0) + set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, + static_min_of_pw_aff(lb, False), + static_max_of_pw_aff(ub, False)+1) # Removing divs because all we want is a projection, with no remaining # constraints from the eliminated variables. - outer_condition = _align_and_gist(expr.domain + outer_condition = _align_and_gist(domain .project_out(dim_type.set, 0, 1) .remove_divs(), - set_implemented_in_loop) - inner_condition = _align_and_gist(expr.domain.affine_hull(), + _align_and_intersect( + set_implemented_in_loop, + context.implemented_domain)) + inner_condition = _align_and_gist(domain.affine_hull(), set_implemented_in_loop) step = 1 # TODO: from inner_condition try to guess the step @@ -946,14 +946,12 @@ def insert_predicates_into_schedule(kernel): # }}} schedule = PolyhedronLoopifier(kernel)(kernel.schedule) + unvectorizable_inames = UnvectorizableInamesCollector(kernel)(schedule) # FIXME: (For now) unvectorizable inames always fallback to unrolling this # should be selected based on the target. 
schedule = Unroller(kernel, unvectorizable_inames)(schedule) schedule = PredicateInsertionMapper(kernel)(schedule) - - kernel = kernel.copy(schedule=schedule) - return kernel.copy(schedule=schedule) From d9ded6bf37204e0c86f6a2724deef4be48992af0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 25 May 2021 13:31:33 -0500 Subject: [PATCH 059/109] lp.Target.c.POD : do not store ast_builder --- loopy/target/c/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index fd475c9c0..73288688b 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -257,7 +257,8 @@ def __init__(self, ast_builder, dtype, name): from loopy.types import LoopyType assert isinstance(dtype, LoopyType) - self.ast_builder = ast_builder + # TODO: just pass 'ctype' instead of ast_builder. + self.ctype = ast_builder.target.dtype_to_typename(dtype) self.dtype = dtype self.name = name @@ -271,9 +272,6 @@ def struct_maker_code(self, name): def struct_format(self): return self.dtype.char - def alignment_requirement(self): - return self.ast_builder.target.alignment_requirement(self) - def default_value(self): return 0 From cc5def7bbfcd5550a30ccee282923295a42a5db7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 17:49:08 -0500 Subject: [PATCH 060/109] unrolling inames is easier than you think --- loopy/schedule/tree.py | 86 +++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index d4ee10af9..ab469d1a4 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -707,6 +707,16 @@ def map_barrier(self, expr): return frozenset() +def _max_val_on_pwaff_for_unrolling(bset, pwaff, iname_to_unr): + max_vals = [(bset & set_i).max_val(aff_i) + for set_i, aff_i in pwaff.get_pieces()] + + if any(max_val.is_infty() for max_val in max_vals): + raise LoopyError(f"{iname_to_unr} 
doesn't have an integral loop length" + " => cannot unroll.") + return max(max_val.to_python() for max_val in max_vals) + + class Unroller(PolyhedronLoopifier): """ .. attribute extra_unroll_inames:: @@ -721,46 +731,52 @@ def __init__(self, kernel, extra_unroll_inames): def map_polyhedral_loop(self, expr, context): from loopy.kernel.data import UnrollTag, UnrolledIlpTag - from loopy.isl_helpers import (make_slab, static_max_of_pw_aff, - static_min_of_pw_aff) + from loopy.isl_helpers import make_slab - if (self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, + if not (self.kernel.iname_tags_of_type(expr.iname, (UnrolledIlpTag, UnrollTag)) or expr.iname in self.extra_unroll_inames): - domain = _align_and_intersect(expr.domain, context.implemented_domain) - ubound = static_max_of_pw_aff(domain.dim_max(0), constants_only=False) - lbound = static_min_of_pw_aff(domain.dim_min(0), constants_only=False) - # FIXME: Write a better error message o'er here that the loop - # cannot be unrolled. 
- size = static_max_of_pw_aff(ubound-lbound+1, constants_only=True) - assert size.is_cst() - - result = [] - for i in range(size.get_constant_val().to_python()): - unrll_dom = make_slab(domain.space, expr.iname, lbound+i, - lbound+i+1) & domain - if unrll_dom.is_empty(): - continue - - dwnstrm_dom = _align_and_intersect(unrll_dom, - context.implemented_domain) - - dwnstrm_dom = dwnstrm_dom.move_dims(dim_type.param, - (dwnstrm_dom - .dim(dim_type.param)), - dim_type.set, 0, 1).params() - children = [self.rec(child, (context - .copy(implemented_domain=dwnstrm_dom))) - for child in expr.children] - - result.append(PolyhedralLoop(iname=expr.iname, - children=self.combine(children), - domain=unrll_dom)) - - return GroupedChildren(contents=result) - else: return super().map_polyhedral_loop(expr, context) + implemented_domain = _implement_hw_axes_in_domains(context + .implemented_domain, + expr.domain, + self.kernel, + context.gsize, + context.lsize) + + domain = _align_and_intersect(expr.domain, implemented_domain) + + lbound = domain.dim_min(0) + loop_length_pw_aff = domain.dim_max(0) - lbound + 1 + loop_length = _max_val_on_pwaff_for_unrolling(implemented_domain, + loop_length_pw_aff, + expr.iname) + + result = [] + for i in range(loop_length): + unrll_dom = make_slab(domain.space, expr.iname, lbound+i, + lbound+i+1) & domain + if unrll_dom.is_empty(): + continue + + dwnstrm_dom = _align_and_intersect(unrll_dom, + implemented_domain) + + dwnstrm_dom = dwnstrm_dom.move_dims(dim_type.param, + (dwnstrm_dom + .dim(dim_type.param)), + dim_type.set, 0, 1).params() + children = [self.rec(child, (context + .copy(implemented_domain=dwnstrm_dom))) + for child in expr.children] + + result.append(PolyhedralLoop(iname=expr.iname, + children=self.combine(children), + domain=unrll_dom)) + + return GroupedChildren(contents=result) + def map_loop(self, expr, context): raise RuntimeError("At this point, all loops should have resolved as" " polyhedral loops.") From 
9463b5cef5fa1bd81aa8ff4e055c709dcd8b3d6e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 18:47:56 -0500 Subject: [PATCH 061/109] require passing ctype to POD, instead of an ASTBuilder --- loopy/target/__init__.py | 12 ++++++++++-- loopy/target/c/__init__.py | 33 +++++++++++++++++++-------------- loopy/target/cuda.py | 6 ++++-- loopy/target/ispc.py | 9 ++++++--- loopy/target/opencl.py | 13 ++++++++----- 5 files changed, 47 insertions(+), 26 deletions(-) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 3b55f134f..496a97698 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -298,7 +298,11 @@ def rec(self, expr, prec, type_context=None, needed_dtype=None): __call__ = rec -class _DummyASTBlock: +class _DummyAST: + pass + + +class _DummyASTBlock(_DummyAST): def __init__(self, arg): self.contents = [] @@ -323,7 +327,11 @@ def get_expression_to_code_mapper(self, kernel, var_subst_map, return _DummyExpressionToCodeMapper() def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - return None + return _DummyASTBlock([]) + + @property + def ast_base_class(self): + return _DummyAST @property def ast_block_class(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 73288688b..c08f70cce 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -253,13 +253,11 @@ class POD(Declarator): and the *name* is given as a string. """ - def __init__(self, ast_builder, dtype, name): + def __init__(self, ctype, dtype, name): from loopy.types import LoopyType assert isinstance(dtype, LoopyType) - # TODO: just pass 'ctype' instead of ast_builder. 
- - self.ctype = ast_builder.target.dtype_to_typename(dtype) + self.ctype = ctype self.dtype = dtype self.name = name @@ -352,7 +350,7 @@ def generate_array_literal(kernel, ecm, ast_builder, array, value): class CASTIdentityMapper(CASTIdentityMapperBase): def map_loopy_pod(self, node, *args, **kwargs): - return type(node)(node.ast_builder, node.dtype, node.name) + return type(node)(node.ctype, node.dtype, node.name) def map_function_decl_wrapper(self, node, *args, **kwargs): return FunctionDeclarationWrapper( @@ -730,7 +728,8 @@ def idi_to_cgen_declarator(self, kernel, idi): or idi.stride_for_name_and_axis is not None): assert not idi.is_written from cgen import Const - return Const(POD(self, idi.dtype, idi.name)) + return Const(POD(self.target.dtype_to_typename(idi.dtype), + idi.dtype, idi.name)) elif issubclass(idi.arg_class, InameArg): return InameArg(idi.name, idi.dtype).get_arg_decl(self) else: @@ -827,7 +826,7 @@ def get_temporary_decls(self, kernel, subkernel_name): if tv.initializer is not None: assert tv.read_only decl = Initializer(decl, self.emit_array_literal( - kernel, ecm, tv, tv.initializer)) + kernel, tv, tv.initializer)) temp_decls.append(decl) @@ -855,8 +854,10 @@ def get_temporary_decls(self, kernel, subkernel_name): align_size) for idi in decl_info: - cast_decl = POD(self, idi.dtype, "") - temp_var_decl = POD(self, idi.dtype, idi.name) + cast_decl = POD(self.target.dtype_to_typename(idi.dtype), + idi.dtype, "") + temp_var_decl = POD(self.target.dtype_to_typename(idi.dtype), + idi.dtype, idi.name) cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) temp_var_decl = self.wrap_temporary_decl( @@ -970,7 +971,8 @@ def get_c_expression_to_code_mapper(self): return CExpressionToCodeMapper() def get_temporary_decl(self, kernel, temp_var, decl_info): - temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + temp_var_decl = POD(self.target.dtype_to_typename(decl_info.dtype), + decl_info.dtype, decl_info.name) if temp_var.read_only: from 
cgen import Const @@ -1000,7 +1002,7 @@ def wrap_global_constant(self, decl): def get_value_arg_decl(self, name, shape, dtype, is_written): assert shape == () - result = POD(self, dtype, name) + result = POD(self.target.dtype_to_typename(dtype), dtype, name) if not is_written: from cgen import Const @@ -1015,7 +1017,8 @@ def get_value_arg_decl(self, name, shape, dtype, is_written): def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from cgen import RestrictPointer, Const - arg_decl = RestrictPointer(POD(self, dtype, name)) + arg_decl = RestrictPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) @@ -1034,7 +1037,8 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import RestrictPointer, Const - arg_decl = RestrictPointer(POD(self, dtype, name)) + arg_decl = RestrictPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) @@ -1167,7 +1171,8 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, return For( InlineInitializer( - POD(self, iname_dtype, iname), + POD(self.target.dtype_to_typename(iname_dtype), + iname_dtype, iname), ecm(lbound, PREC_NONE, "i")), ecm( Comparison( diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 94564420d..60974b362 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -410,7 +410,8 @@ def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from cgen import Const from cgen.cuda import CudaRestrictPointer - arg_decl = CudaRestrictPointer(POD(self, dtype, name)) + arg_decl = CudaRestrictPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) @@ -432,7 +433,8 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): from cgen import RestrictPointer, Const from cgen.cuda 
import CudaConstant - arg_decl = RestrictPointer(POD(self, dtype, name)) + arg_decl = RestrictPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index e569f26f4..810031217 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -308,7 +308,8 @@ def emit_barrier(self, synchronization_kind, mem_kind, comment): def get_temporary_decl(self, kernel, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type - temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + temp_var_decl = POD(self.target.dtype_to_typename(decl_info.dtype), + decl_info.dtype, decl_info.name) shape = decl_info.shape @@ -338,7 +339,8 @@ def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from cgen import Const from cgen.ispc import ISPCUniformPointer, ISPCUniform - arg_decl = ISPCUniformPointer(POD(self, dtype, name)) + arg_decl = ISPCUniformPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) @@ -501,7 +503,8 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, return For( InlineInitializer( - ISPCUniform(POD(self, iname_dtype, iname)), + ISPCUniform(POD(self.target.dtype_to_typename(iname_dtype), + iname_dtype, iname)), ecm(lbound, PREC_NONE, "i")), ecm( p.Comparison(var(iname), "<=", ubound), diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 1f6d67294..f9a613401 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -734,7 +734,8 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): from cgen import RestrictPointer, Const from cgen.opencl import CLConstant - arg_decl = RestrictPointer(POD(self, dtype, name)) + arg_decl = RestrictPointer(POD(self.target.dtype_to_typename(dtype), + dtype, name)) if not is_written: arg_decl = Const(arg_decl) @@ -841,11 +842,13 @@ def emit_atomic_update(self, kernel, 
var_subst_map, lhs_atomicity, lhs_var, new_val = "*(%s *) &" % ctype + new_val cast_str = f"({var_kind} {ctype} *) " + lhs_dtype = NumpyType(lhs_dtype.dtype) + return Block([ - POD(self, NumpyType(lhs_dtype.dtype), - old_val_var), - POD(self, NumpyType(lhs_dtype.dtype), - new_val_var), + POD(self.target.dtype_to_typename(lhs_dtype), + lhs_dtype, old_val_var), + POD(self.target.dtype_to_typename(lhs_dtype), + lhs_dtype, new_val_var), DoWhile( "%(func_name)s(" "%(cast_str)s&(%(lhs_expr)s), " From 2b89b25306b8134b38dbc1f1adaa14c10467f4fe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 27 May 2021 17:40:51 -0500 Subject: [PATCH 062/109] cleans implementation; adds docs --- loopy/schedule/tree.py | 73 ++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index ab469d1a4..f8beda8a2 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -287,6 +287,12 @@ def __call__(self, expr, *args, **kwargs): class IdentityMapper(Mapper): def combine(self, values): + """ + Mapper methods might want to split a node into multiple nodes, in which + case they return an instance of :class:`GroupedChildren`. This method + "flattens" the contents of any such grouped of schedule nodes with + their corresponding siblings. 
+ """ result = [] for val in values: if isinstance(val, GroupedChildren): @@ -483,6 +489,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, & make_slab(implemented_domain.space, dim_name, lbound, lbound + size)) + assert implemented_domain.dim(dim_type.set) == 0 return implemented_domain.params() @@ -586,11 +593,9 @@ def map_polyhedral_loop(self, expr, context): context.lsize) domain = _align_and_intersect(expr.domain, implemented_domain) - downstream_domain = _align_and_intersect(domain, implemented_domain) - downstream_domain = downstream_domain.move_dims(dim_type.param, - (downstream_domain - .dim(dim_type.param)), - dim_type.set, 0, 1).params() + downstream_domain = domain.move_dims(dim_type.param, + domain.dim(dim_type.param), + dim_type.set, 0, 1).params() children = [self.rec(child, (context .copy(implemented_domain=downstream_domain))) for child in expr.children] @@ -711,10 +716,12 @@ def _max_val_on_pwaff_for_unrolling(bset, pwaff, iname_to_unr): max_vals = [(bset & set_i).max_val(aff_i) for set_i, aff_i in pwaff.get_pieces()] - if any(max_val.is_infty() for max_val in max_vals): + if any((max_val.is_infty() or max_val.is_neginfty() or max_val.is_nan()) + for max_val in max_vals): raise LoopyError(f"{iname_to_unr} doesn't have an integral loop length" " => cannot unroll.") - return max(max_val.to_python() for max_val in max_vals) + from math import ceil + return ceil(max(max_val.to_python() for max_val in max_vals)) class Unroller(PolyhedronLoopifier): @@ -739,19 +746,19 @@ def map_polyhedral_loop(self, expr, context): return super().map_polyhedral_loop(expr, context) implemented_domain = _implement_hw_axes_in_domains(context - .implemented_domain, - expr.domain, - self.kernel, - context.gsize, - context.lsize) + .implemented_domain, + expr.domain, + self.kernel, + context.gsize, + context.lsize) domain = _align_and_intersect(expr.domain, implemented_domain) lbound = domain.dim_min(0) loop_length_pw_aff = domain.dim_max(0) - lbound + 1 
loop_length = _max_val_on_pwaff_for_unrolling(implemented_domain, - loop_length_pw_aff, - expr.iname) + loop_length_pw_aff, + expr.iname) result = [] for i in range(loop_length): @@ -761,14 +768,14 @@ def map_polyhedral_loop(self, expr, context): continue dwnstrm_dom = _align_and_intersect(unrll_dom, - implemented_domain) + implemented_domain) dwnstrm_dom = dwnstrm_dom.move_dims(dim_type.param, (dwnstrm_dom .dim(dim_type.param)), dim_type.set, 0, 1).params() children = [self.rec(child, (context - .copy(implemented_domain=dwnstrm_dom))) + .copy(implemented_domain=dwnstrm_dom))) for child in expr.children] result.append(PolyhedralLoop(iname=expr.iname, @@ -792,9 +799,9 @@ def map_instruction_block(self, expr, context): for child in expr.children}) == 1 inames, = {self.kernel.id_to_insn[child.insn_id].within_inames - for child in expr.children} + for child in expr.children} predicates, = {self.kernel.id_to_insn[child.insn_id].predicates - for child in expr.children} + for child in expr.children} # {{{ compute the predicates due to the hardware inames @@ -812,10 +819,9 @@ def map_instruction_block(self, expr, context): context.gsize, context.lsize) domain = (domain - .move_dims(dim_type.param, domain.dim(dim_type.param), - dim_type.set, 0, domain.dim(dim_type.set))) - unimplemented_domain = (isl.align_spaces(domain, impl_domain) - .gist(impl_domain)) + .move_dims(dim_type.param, domain.dim(dim_type.param), + dim_type.set, 0, domain.dim(dim_type.set))) + unimplemented_domain = _align_and_gist(domain, impl_domain) if not unimplemented_domain.is_universe(): predicates |= {set_to_cond_expr(unimplemented_domain)} @@ -833,16 +839,23 @@ def map_instruction_block(self, expr, context): def map_polyhedral_loop(self, expr, context): from loopy.symbolic import pw_aff_to_expr - from loopy.isl_helpers import (static_min_of_pw_aff, - static_max_of_pw_aff, make_slab) + from loopy.isl_helpers import (make_slab, static_min_of_pw_aff, + static_max_of_pw_aff) + base_poly_loop = 
super().map_polyhedral_loop(expr, context) assert expr.domain.dim(dim_type.set) == 1 - domain = _align_and_intersect(expr.domain, context.implemented_domain) + domain = base_poly_loop.domain + impl_domain = _implement_hw_axes_in_domains(context.implemented_domain, + domain, + self.kernel, + context.gsize, + context.lsize) + lb = domain.dim_min(0) ub = domain.dim_max(0) set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, static_min_of_pw_aff(lb, False), - static_max_of_pw_aff(ub, False)+1) + static_max_of_pw_aff(ub, False) + 1) # Removing divs because all we want is a projection, with no remaining # constraints from the eliminated variables. @@ -852,7 +865,7 @@ def map_polyhedral_loop(self, expr, context): .remove_divs(), _align_and_intersect( set_implemented_in_loop, - context.implemented_domain)) + impl_domain)) inner_condition = _align_and_gist(domain.affine_hull(), set_implemented_in_loop) @@ -863,9 +876,7 @@ def map_polyhedral_loop(self, expr, context): upper_bound=pw_aff_to_expr(ub), step=step, children=_wrap_in_if(inner_condition, - (super() - .map_polyhedral_loop(expr, context) - .children))) + base_poly_loop.children)) if outer_condition.is_universe(): return for_ @@ -964,7 +975,7 @@ def insert_predicates_into_schedule(kernel): schedule = PolyhedronLoopifier(kernel)(kernel.schedule) unvectorizable_inames = UnvectorizableInamesCollector(kernel)(schedule) - # FIXME: (For now) unvectorizable inames always fallback to unrolling this + # TODO: (For now) unvectorizable inames always fallback to unrolling this # should be selected based on the target. 
schedule = Unroller(kernel, unvectorizable_inames)(schedule) schedule = PredicateInsertionMapper(kernel)(schedule) From f54e4eafab8bedd0e76fae73256ebdb07c02ca43 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 27 May 2021 17:41:29 -0500 Subject: [PATCH 063/109] accepts equivalent alternative string for bounds comparison --- test/test_loopy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index f17376bf9..c97cda9ea 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2040,12 +2040,18 @@ def test_tight_loop_bounds_codegen(): cgr = lp.generate_code_v2(knl) #print(cgr.device_code()) - for_loop = \ + for_loop1 = \ "for (int j = " \ "(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \ "j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)" - assert for_loop in cgr.device_code() + for_loop2 = \ + "for (int j = " \ + "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ + "j <= (gid(0) == 0 && lid(0) >= 0 && 4 + -1 * lid(0) >= 0 ?" \ + " 10 * gid(0) + 2 * lid(0) : 9); ++j)" + + assert (for_loop1 in cgr.device_code() or for_loop2 in cgr.device_code()) def test_unscheduled_insn_detection(): From 93de0c531cc53480a52b0c704098ccd311c78ff8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 27 May 2021 17:59:13 -0500 Subject: [PATCH 064/109] add a mapper to remove empty loops --- loopy/schedule/tree.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index f8beda8a2..2a32aae70 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -604,6 +604,17 @@ def map_polyhedral_loop(self, expr, context): children=self.combine(children), domain=domain) +class EmptyLoopRemover(PolyhedronLoopifier): + """ + Mapper to remove any loops with empty domain. 
+ """ + def map_polyhedral_loop(self, expr, context): + if expr.domain.is_empty(): + return GroupedChildren([]) + + return super().map_polyhedral_loop(expr, context) + + class UnvectorizableInamesCollector(CombineMapper): """ Mapper to gather all insn ids. @@ -973,6 +984,7 @@ def insert_predicates_into_schedule(kernel): # }}} schedule = PolyhedronLoopifier(kernel)(kernel.schedule) + schedule = EmptyLoopRemover(kernel)(schedule) unvectorizable_inames = UnvectorizableInamesCollector(kernel)(schedule) # TODO: (For now) unvectorizable inames always fallback to unrolling this From 804c0a335275c067077445dacfb1cb9fdfc4bbdb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 27 May 2021 18:02:13 -0500 Subject: [PATCH 065/109] account for the fact that 'idis' argument already includes the idis corresponding to extra_args --- loopy/codegen/result.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 38199fd53..bc6f84757 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -336,7 +336,9 @@ def map_function(self, expr, context): is_generating_device_code=True)) host_ast = self.host_ast_builder.get_kernel_call(self.kernel, expr.name, idis, - expr.extra_args) + # 'idis' include the + # "extra_args" + extra_args=[]) # }}} From abe14bfeab570fd7fe36f4a1475474dc3ef8bbf2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 27 May 2021 18:30:38 -0500 Subject: [PATCH 066/109] pass unvectorizable_inames to CodeGenMapper --- loopy/codegen/__init__.py | 4 ++-- loopy/codegen/result.py | 8 +++++--- loopy/schedule/tree.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e70d06c96..68f933ef3 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -247,12 +247,12 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, from loopy.schedule.tree import (make_schedule_tree, 
insert_predicates_into_schedule) kernel = make_schedule_tree(kernel) - kernel = insert_predicates_into_schedule(kernel) + kernel, unvectorizable_inames = insert_predicates_into_schedule(kernel) # }}} from loopy.codegen.result import get_idis_for_kernel, CodeGenMapper - codegen_mapper = CodeGenMapper(kernel) + codegen_mapper = CodeGenMapper(kernel, unvectorizable_inames) codegen_result = codegen_mapper(kernel.schedule) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index bc6f84757..ce5389d55 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -211,8 +211,9 @@ def copy(self, *, in_device=None, iname_exprs=None, class CodeGenMapper(CombineMapper): - def __init__(self, kernel): + def __init__(self, kernel, unvectorizable_inames): self.kernel = kernel + self.unvectorizable_inames = unvectorizable_inames self.host_ast_builder = kernel.target.get_host_ast_builder() self.device_ast_builder = kernel.target.get_device_ast_builder() @@ -423,7 +424,8 @@ def map_for(self, expr, context): InOrderSequentialSequentialTag) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 - if self.kernel.iname_tags_of_type(expr.iname, vec_tags): + if (self.kernel.iname_tags_of_type(expr.iname, vec_tags) + and expr.iname not in self.unvectorizable_inames): assert isinstance(expr.lower_bound, int) assert isinstance(expr.upper_bound, int) assert expr.step == 1 @@ -436,7 +438,7 @@ def map_for(self, expr, context): else: assert (len(self.kernel.inames[expr.iname].tags) == 0 or self.kernel.iname_tags_of_type(expr.iname, - seq_tags+unr_tags)) + seq_tags+unr_tags+vec_tags)) assert expr.step == 1 if expr.upper_bound != expr.lower_bound: diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 2a32aae70..6f8b39f0e 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -991,7 +991,7 @@ def insert_predicates_into_schedule(kernel): # should be selected based on the target. 
schedule = Unroller(kernel, unvectorizable_inames)(schedule) schedule = PredicateInsertionMapper(kernel)(schedule) - return kernel.copy(schedule=schedule) + return kernel.copy(schedule=schedule), unvectorizable_inames def get_insns_in_function(kernel, name): From 0bca616ca6effeaa3d75584f72183e6fa21807dc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 28 May 2021 16:53:39 -0500 Subject: [PATCH 067/109] take care of loop bound computations for loops with barriers --- loopy/kernel/tools.py | 12 --- loopy/schedule/tree.py | 172 ++++++++++++++++++++++++++++++----------- loopy/target/python.py | 7 +- 3 files changed, 134 insertions(+), 57 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8382fd002..18c0e9d96 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2064,18 +2064,6 @@ def get_outer_params(domains): # }}} -@memoize_on_first_arg -def get_all_inames_tagged_with(kernel, tag_type): - """ - Returns :class:`frozenset` of all iname traversing across the target - hardware's execution grid. 
- """ - from loopy.kernel.data import filter_iname_tags_by_type - return frozenset(iname.name - for iname in kernel.inames.values() - if filter_iname_tags_by_type(iname.tags, tag_type)) - - @memoize_on_first_arg def has_complex_dtyped_var(kernel): """ diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 6f8b39f0e..65469f593 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -1,13 +1,13 @@ import pymbolic.primitives as prim import loopy.schedule as schedule import islpy as isl -from typing import List, Union, Any, Optional, Tuple +from typing import List, Union, Any, Optional, Tuple, FrozenSet from dataclasses import dataclass, field from functools import reduce from islpy import dim_type +from pytools import memoize_on_first_arg from loopy.diagnostic import LoopyError from loopy.kernel import KernelState -from loopy.kernel.tools import get_all_inames_tagged_with # {{{ LoopKernel.schedule a tree @@ -179,7 +179,7 @@ def make_and_enter_function(self, name, extra_args, extra_inames): # end of instruction block self._build_stack.pop() - assert isinstance(self.current_node, Schedule) + assert isinstance(self.current_node, (Schedule, Loop)) new_function = Function(name, extra_args, extra_inames, []) self.current_node.children.append(new_function) self.make_current_node(new_function) @@ -439,7 +439,7 @@ def _wrap_in_if(cond, nodes): def _implement_hw_axes_in_domains(implemented_domain, domain, - kernel, gsize, lsize): + kernel, hw_inames, gsize, lsize): """ If *domain* contains any inames going along hardware inames account for those in *implemented_domain*. 
@@ -453,10 +453,8 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, from loopy.kernel.data import AxisTag, GroupIndexTag, LocalIndexTag from loopy.isl_helpers import make_slab, static_min_of_pw_aff - all_hw_inames = get_all_inames_tagged_with(kernel, AxisTag) - for dim_name in domain.get_var_dict(): - if dim_name in all_hw_inames: + if dim_name in hw_inames: if dim_name in implemented_domain.get_var_dict(): # this hardware dim is already implemented => ignore continue @@ -496,20 +494,26 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, @dataclass(frozen=True) class PolyhedronLoopifierContext: implemented_domain: isl.BasicSet + hw_inames: FrozenSet[str] gsize: Optional[Tuple[prim.Expression, ...]] = None lsize: Optional[Tuple[prim.Expression, ...]] = None - def copy(self, *, implemented_domain=None, gsize=None, lsize=None): + def copy(self, *, implemented_domain=None, hw_inames=None, gsize=None, + lsize=None): if implemented_domain is None: implemented_domain = self.implemented_domain + if hw_inames is None: + hw_inames = self.hw_inames + if gsize is None: gsize = self.gsize if lsize is None: lsize = self.lsize - return PolyhedronLoopifierContext(implemented_domain, gsize, lsize) + return PolyhedronLoopifierContext(implemented_domain, + hw_inames, gsize, lsize) class PolyhedronLoopifier(IdentityMapper): @@ -519,17 +523,22 @@ def __init__(self, kernel): def map_schedule(self, expr): impl_domain = self.kernel.assumptions return super().map_schedule(expr, - PolyhedronLoopifierContext(impl_domain)) + PolyhedronLoopifierContext(impl_domain, + hw_inames=frozenset() + )) def map_function(self, expr, context): + from loopy.kernel.data import AxisTag # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( InstructionGatherer()(expr)) - # FIXME: Somehow we need to get rid of allowing the hardware inames to - # be slabbed. 
+ + hw_inames = get_all_inames_tagged_with(self.kernel, expr.name, AxisTag) + return super().map_function(expr, context.copy(gsize=gsize, - lsize=lsize)) + lsize=lsize, + hw_inames=hw_inames)) def map_loop(self, expr, context): implemented_domain = context.implemented_domain @@ -540,6 +549,7 @@ def map_loop(self, expr, context): implemented_domain = _implement_hw_axes_in_domains(implemented_domain, domain, self.kernel, + context.hw_inames, context.gsize, context.lsize) @@ -589,6 +599,7 @@ def map_polyhedral_loop(self, expr, context): .implemented_domain, expr.domain, self.kernel, + context.hw_inames, context.gsize, context.lsize) @@ -760,6 +771,7 @@ def map_polyhedral_loop(self, expr, context): .implemented_domain, expr.domain, self.kernel, + context.hw_inames, context.gsize, context.lsize) @@ -800,7 +812,31 @@ def map_loop(self, expr, context): " polyhedral loops.") +class BarrieredLoopsCollector(CombineMapper): + + def __call__(self, expr): + return super().__call__(expr, frozenset()) + + def combine(self, values): + assert all(isinstance(value, frozenset) for value in values) + return reduce(frozenset.union, values, frozenset()) + + def map_polyhedral_loop(self, expr, loop_nesting): + return self.combine([self.rec(child, loop_nesting | {expr.iname}) + for child in expr.children]) + + def map_instruction_block(self, expr, loop_nesting): + return frozenset() + + def map_barrier(self, expr, loop_nesting): + return loop_nesting + + class PredicateInsertionMapper(PolyhedronLoopifier): + def __init__(self, kernel, loops_containing_barrier): + super().__init__(kernel) + self.loops_containing_barriers = loops_containing_barrier + def map_instruction_block(self, expr, context): from loopy.symbolic import set_to_cond_expr @@ -814,30 +850,22 @@ def map_instruction_block(self, expr, context): predicates, = {self.kernel.id_to_insn[child.insn_id].predicates for child in expr.children} - # {{{ compute the predicates due to the hardware inames + impl_domain = 
context.implemented_domain + domain = self.kernel.get_inames_domain(inames) + impl_domain = _implement_hw_axes_in_domains(impl_domain, + domain, + self.kernel, + context.hw_inames, + context.gsize, + context.lsize) + domain = domain.project_out_except(names=inames, types=[dim_type.set]) + domain = domain.move_dims(dim_type.param, domain.dim(dim_type.param), + dim_type.set, 0, domain.dim(dim_type.set)) - from loopy.kernel.data import AxisTag - hw_inames = inames & get_all_inames_tagged_with(self.kernel, AxisTag) - - if hw_inames: - impl_domain = context.implemented_domain - domain = (self.kernel.get_inames_domain(hw_inames) - .project_out_except(types=[dim_type.set], - names=hw_inames)) - impl_domain = _implement_hw_axes_in_domains(impl_domain, - domain, - self.kernel, - context.gsize, - context.lsize) - domain = (domain - .move_dims(dim_type.param, domain.dim(dim_type.param), - dim_type.set, 0, domain.dim(dim_type.set))) - unimplemented_domain = _align_and_gist(domain, impl_domain) - - if not unimplemented_domain.is_universe(): - predicates |= {set_to_cond_expr(unimplemented_domain)} + unimplemented_domain = _align_and_gist(domain, impl_domain) - # }}} + if not unimplemented_domain.is_universe(): + predicates |= {set_to_cond_expr(unimplemented_domain)} new_insn_block = InstructionBlock([self.rec(child, context) for child in expr.children]) @@ -852,16 +880,34 @@ def map_polyhedral_loop(self, expr, context): from loopy.symbolic import pw_aff_to_expr from loopy.isl_helpers import (make_slab, static_min_of_pw_aff, static_max_of_pw_aff) - - base_poly_loop = super().map_polyhedral_loop(expr, context) assert expr.domain.dim(dim_type.set) == 1 - domain = base_poly_loop.domain - impl_domain = _implement_hw_axes_in_domains(context.implemented_domain, + assert context.implemented_domain.dim(dim_type.set) == 0 + domain = expr.domain + + impl_domain = _implement_hw_axes_in_domains(context + .implemented_domain, domain, self.kernel, + context.hw_inames, context.gsize, 
context.lsize) + if expr.iname in self.loops_containing_barriers: + hw_inames = (set(domain.get_var_dict()) + & context.hw_inames) + while hw_inames: + hw_iname = hw_inames.pop() + dt, pos = domain.get_var_dict()[hw_iname] + domain = domain.project_out(dt, pos, 1) + + domain = _align_and_intersect(domain, impl_domain) + downstream_domain = domain.move_dims(dim_type.param, + domain.dim(dim_type.param), + dim_type.set, 0, 1).params() + children = [self.rec(child, (context + .copy(implemented_domain=downstream_domain))) + for child in expr.children] + lb = domain.dim_min(0) ub = domain.dim_max(0) set_implemented_in_loop = make_slab(expr.domain.space, expr.iname, @@ -887,7 +933,7 @@ def map_polyhedral_loop(self, expr, context): upper_bound=pw_aff_to_expr(ub), step=step, children=_wrap_in_if(inner_condition, - base_poly_loop.children)) + children)) if outer_condition.is_universe(): return for_ @@ -896,6 +942,24 @@ def map_polyhedral_loop(self, expr, context): return If(set_to_cond_expr(outer_condition), [for_]) +class FunctionCollector(CombineMapper): + """ + Mapper to gather all functions in a :class:`ScheduleNode`. + """ + def combine(self, values): + assert all(isinstance(value, list) for value in values) + return sum(values, start=[]) + + def map_function(self, expr): + return [expr] + + def map_run_instruction(self, expr): + return [] + + def map_barrier(self, expr): + return [] + + class InstructionGatherer(CombineMapper): """ Mapper to gather all insn ids. @@ -990,11 +1054,33 @@ def insert_predicates_into_schedule(kernel): # TODO: (For now) unvectorizable inames always fallback to unrolling this # should be selected based on the target. 
schedule = Unroller(kernel, unvectorizable_inames)(schedule) - schedule = PredicateInsertionMapper(kernel)(schedule) + loops_containing_barrier = BarrieredLoopsCollector()(schedule) + schedule = PredicateInsertionMapper(kernel, loops_containing_barrier)(schedule) return kernel.copy(schedule=schedule), unvectorizable_inames +@memoize_on_first_arg def get_insns_in_function(kernel, name): - function, = [child for child in kernel.schedule.children - if isinstance(child, Function) and child.name == name] + function, = [fn + for fn in FunctionCollector()(kernel.schedule) + if fn.name == name] return InstructionGatherer()(function) + + +@memoize_on_first_arg +def get_all_inames_tagged_with(kernel, func_name, tag_type): + """ + Returns :class:`frozenset` of all iname traversing across the target + hardware's execution grid. + + :arg func_name: name of the function within which the inames are to be considered + """ + from loopy.schedule.tree import get_insns_in_function + + insn_ids = get_insns_in_function(kernel, func_name) + all_inames = reduce(frozenset.union, (kernel.id_to_insn[id_].within_inames + for id_ in insn_ids), + frozenset()) + return frozenset(iname + for iname in all_inames + if kernel.iname_tags_of_type(iname, tag_type)) diff --git a/loopy/target/python.py b/loopy/target/python.py index 6ffa42aec..52b911540 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -237,7 +237,8 @@ def ast_block_scope_class(self): def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, inner, var_subst_map): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info=None) from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For @@ -277,9 +278,11 @@ def can_implement_conditionals(self): def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): assert vectorization_info is None from genpy import If + from 
pymbolic.mapper.stringifier import PREC_NONE + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, vectorization_info) - return If(ecm(condition), ast) + return If(ecm(condition, prec=PREC_NONE), ast) def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): if insn.atomicity: From 1dc54e2a449f46d92b6f32b61609fb232af62a46 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 28 May 2021 18:47:34 -0500 Subject: [PATCH 068/109] minor bug fixes - pieces in incomplete part of ISPCASTBuilder, NumbaTarget --- loopy/schedule/device_mapping.py | 5 +++-- loopy/target/c/__init__.py | 6 +++++- loopy/target/ispc.py | 23 ++++++++++++++--------- loopy/target/python.py | 6 ++++-- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 35c73b775..3cb81db11 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -39,12 +39,13 @@ def map_schedule_onto_host_or_device(kernel): + kernel.target.device_program_name_suffix) if not kernel.target.split_kernel_at_global_barriers(): + new_name = device_prog_name_gen() new_schedule = ( - [CallKernel(kernel_name=device_prog_name_gen(), + [CallKernel(kernel_name=new_name, extra_args=[], extra_inames=[])] + list(kernel.linearization) + - [ReturnFromKernel(kernel_name=kernel.name)]) + [ReturnFromKernel(kernel_name=new_name)]) kernel = kernel.copy(linearization=new_schedule) else: kernel = map_schedule_onto_host_or_device_impl( diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c08f70cce..517bf694a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -782,9 +782,13 @@ def emit_array_literal(self, kernel, array, value): for d_i in data))) def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - return None + return self.emit_blank_line() def get_temporary_decls(self, kernel, subkernel_name): + if subkernel_name is None: + # => host 
program => no temp dels + return [] + from loopy.kernel.data import AddressSpace ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 810031217..10e8b1ba9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -230,12 +230,13 @@ def _arg_names_and_decls(self, kernel, implemented_data_info): # {{{ top-level codegen - def get_function_declaration(self, name, kernel, implemented_data_info, + def get_function_declaration(self, kernel, name, implemented_data_info, is_generating_device_code): from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask - arg_names, arg_decls = self._arg_names_and_decls(kernel) + arg_names, arg_decls = self._arg_names_and_decls(kernel, + implemented_data_info) if is_generating_device_code: result = ISPCTask( @@ -254,7 +255,8 @@ def get_function_declaration(self, name, kernel, implemented_data_info, # }}} def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) from loopy.schedule.tree import get_insns_in_function from pymbolic.mapper.stringifier import PREC_NONE @@ -270,7 +272,8 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): "assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE))) - arg_names, arg_decls = self._arg_names_and_decls(kernel) + arg_names, arg_decls = self._arg_names_and_decls(kernel, + implemented_data_info) from cgen.ispc import ISPCLaunch result.append( @@ -322,7 +325,8 @@ def get_temporary_decl(self, kernel, temp_var, decl_info): if shape: from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) temp_var_decl = ArrayOf( temp_var_decl, ecm(p.flattened_product(shape), 
@@ -376,9 +380,9 @@ def get_value_arg_decl(self, name, shape, dtype, is_written): return ISPCUniform(result) def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): - raise NotImplementedError - ecm = self.expression_to_code_mapper(kernel, var_subst_map, - vectorization_info) + + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) assignee_var_name, = insn.assignee_var_names() @@ -499,7 +503,8 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, from cgen import For, InlineInitializer from cgen.ispc import ISPCUniform - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info=None) return For( InlineInitializer( diff --git a/loopy/target/python.py b/loopy/target/python.py index 52b911540..05168de35 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -181,7 +181,8 @@ def get_function_definition(self, kernel, name, implemented_data_info, function_body) def get_temporary_decls(self, kernel, subkernel_name): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) result = [] @@ -294,7 +295,8 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): from pymbolic.mapper.stringifier import PREC_NONE from genpy import Assign - ecm = self.expression_to_code_mapper(kernel, var_subst_map) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + vectorization_info) return Assign( ecm(insn.assignee, prec=PREC_NONE, type_context=None), From 97b38228b03c0e97a902d825704589775548f84d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 28 May 2021 19:33:23 -0500 Subject: [PATCH 069/109] while finding inner condition take into account the implemented_domain --- loopy/schedule/tree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 65469f593..6c810d519 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -924,7 +924,9 @@ def map_polyhedral_loop(self, expr, context): set_implemented_in_loop, impl_domain)) inner_condition = _align_and_gist(domain.affine_hull(), - set_implemented_in_loop) + _align_and_intersect( + set_implemented_in_loop, + impl_domain)) step = 1 # TODO: from inner_condition try to guess the step From 389a513f93ec547ac3b35814fcd333cc25d13f96 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 16:41:01 -0500 Subject: [PATCH 070/109] remove redundant constraints before emitting expressions --- loopy/schedule/tree.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 6c810d519..12005cb28 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -451,7 +451,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, hardware inames in *domain* to their corresponding """ from loopy.kernel.data import AxisTag, GroupIndexTag, LocalIndexTag - from loopy.isl_helpers import make_slab, static_min_of_pw_aff + from loopy.isl_helpers import make_slab for dim_name in domain.get_var_dict(): if dim_name in hw_inames: @@ -462,9 +462,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, tag, = kernel.iname_tags_of_type(dim_name, AxisTag) assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) - lbound = static_min_of_pw_aff(kernel - .get_iname_bounds(dim_name) - .lower_bound_pw_aff, constants_only=False) + lbound = kernel.get_iname_bounds(dim_name).lower_bound_pw_aff size = (gsize[tag.axis] if isinstance(tag, GroupIndexTag) @@ -855,7 +853,8 @@ def map_instruction_block(self, expr, context): impl_domain = _implement_hw_axes_in_domains(impl_domain, domain, self.kernel, - context.hw_inames, + (context.hw_inames + & inames), context.gsize, context.lsize) domain = 
domain.project_out_except(names=inames, types=[dim_type.set]) @@ -923,6 +922,12 @@ def map_polyhedral_loop(self, expr, context): _align_and_intersect( set_implemented_in_loop, impl_domain)) + + lb = _align_and_gist(lb, _align_and_intersect(outer_condition, + impl_domain).params()) + ub = _align_and_gist(ub, _align_and_intersect(outer_condition, + impl_domain).params()) + inner_condition = _align_and_gist(domain.affine_hull(), _align_and_intersect( set_implemented_in_loop, From ab4ea24f9070f998d3a0c5291579c0d904b12488 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 17:05:37 -0500 Subject: [PATCH 071/109] [formatting] minor bug fixes to placate linters --- loopy/codegen/__init__.py | 2 -- loopy/target/c/__init__.py | 9 ++++++--- loopy/target/numba.py | 3 ++- loopy/target/python.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 68f933ef3..e78f6a7b4 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -50,8 +50,6 @@ .. autoclass:: SeenFunction -.. autoclass:: CodeGenerationState - .. autoclass:: TranslationUnitCodeGenerationResult .. automodule:: loopy.codegen.result diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 517bf694a..d9737fdc4 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -277,8 +277,7 @@ def default_value(self): class ScopingBlock(Block): - """A block that is mandatory for scoping and may not be simplified away - by :func:`loopy.codegen.result.merge_codegen_results`. + """A block that is mandatory for scoping. 
""" @@ -1101,8 +1100,12 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): raise ValueError("unexpected lhs atomicity type: %s" % type(lhs_atomicity).__name__) + def emit_atomic_init(self, kernel, var_subst_map, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + raise NotImplementedError("atomic updates in %s" % type(self).__name__) + def emit_atomic_update(self, kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype): + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): raise NotImplementedError("atomic updates in %s" % type(self).__name__) def emit_tuple_assignment(self, kernel, callables_table, insn, diff --git a/loopy/target/numba.py b/loopy/target/numba.py index c18b07cbd..9ed781769 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -67,7 +67,8 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): from genpy import Statement implemented_data_info = implemented_data_info - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}) + ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + vectorization_info=None) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( diff --git a/loopy/target/python.py b/loopy/target/python.py index 05168de35..518f7cdc7 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -228,7 +228,7 @@ def ast_for_class(self): @property def ast_if_class(self): - from genpyt import If + from genpy import If return If @property From a206f51c9af32354043914dcfc27c9d879288c53 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 17:30:07 -0500 Subject: [PATCH 072/109] guard some intersects with align --- loopy/schedule/tree.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 12005cb28..93c44066e 100644 --- a/loopy/schedule/tree.py +++ 
b/loopy/schedule/tree.py @@ -471,6 +471,8 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, lsize[tag.axis]) + implemented_domain, lbound = isl.align_two(implemented_domain, + lbound) if not isinstance(size, int): lbound, size = isl.align_two(lbound, size) @@ -783,8 +785,11 @@ def map_polyhedral_loop(self, expr, context): result = [] for i in range(loop_length): - unrll_dom = make_slab(domain.space, expr.iname, lbound+i, - lbound+i+1) & domain + unrll_dom = _align_and_intersect(make_slab(domain.space, + expr.iname, + lbound+i, + lbound+i+1), + domain) if unrll_dom.is_empty(): continue From eede3e58cc20d0e38f9cba2776a37938e485fe03 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 17:34:35 -0500 Subject: [PATCH 073/109] corrects the string for checking bound pwaffs --- test/test_loopy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index c97cda9ea..ee85b3683 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2048,8 +2048,7 @@ def test_tight_loop_bounds_codegen(): for_loop2 = \ "for (int j = " \ "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ - "j <= (gid(0) == 0 && lid(0) >= 0 && 4 + -1 * lid(0) >= 0 ?" \ - " 10 * gid(0) + 2 * lid(0) : 9); ++j)" + "j <= (gid(0) == 0 ? 
2 * lid(0) : 9); ++j)" assert (for_loop1 in cgr.device_code() or for_loop2 in cgr.device_code()) From 156c7eb89051f2792808811d83897e40e75852a8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 17:41:55 -0500 Subject: [PATCH 074/109] don't align too soon --- loopy/schedule/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 93c44066e..88bdbad24 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -471,11 +471,11 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, lsize[tag.axis]) - implemented_domain, lbound = isl.align_two(implemented_domain, - lbound) if not isinstance(size, int): lbound, size = isl.align_two(lbound, size) + implemented_domain, lbound = isl.align_two(implemented_domain, + lbound) implemented_domain = (implemented_domain .add_dims(dim_type.param, 1) .set_dim_name(dim_type.param, From bc92f123b5cb17d5bd63ccd4730e75ed73343bb8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 17:47:01 -0500 Subject: [PATCH 075/109] pwaffs addition only when spaces match --- loopy/schedule/tree.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 88bdbad24..dee9635c3 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -474,6 +474,8 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, if not isinstance(size, int): lbound, size = isl.align_two(lbound, size) + ubound = lbound + size + implemented_domain, lbound = isl.align_two(implemented_domain, lbound) implemented_domain = (implemented_domain @@ -485,7 +487,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, implemented_domain = (implemented_domain & make_slab(implemented_domain.space, dim_name, - lbound, lbound + size)) + lbound, ubound)) assert implemented_domain.dim(dim_type.set) == 0 return implemented_domain.params() From 
2901046af5aef4862f0fee28e854bb6d7921468b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 30 May 2021 18:14:58 -0500 Subject: [PATCH 076/109] gets rid of unnecessary affine hull computation while figuring out the inner condition --- loopy/schedule/tree.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index dee9635c3..3c830bd37 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -935,10 +935,12 @@ def map_polyhedral_loop(self, expr, context): ub = _align_and_gist(ub, _align_and_intersect(outer_condition, impl_domain).params()) - inner_condition = _align_and_gist(domain.affine_hull(), - _align_and_intersect( - set_implemented_in_loop, - impl_domain)) + inner_condition = _align_and_gist( + domain, + _align_and_intersect( + _align_and_intersect(set_implemented_in_loop, + impl_domain), + outer_condition)) step = 1 # TODO: from inner_condition try to guess the step From acd691e56e92d845f77a683964d52627b0cb8f3b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 10:06:53 -0500 Subject: [PATCH 077/109] bugfix: make implemented_domain a params-only domain --- loopy/schedule/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 3c830bd37..252c4d2e0 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -483,7 +483,7 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, .set_dim_name(dim_type.param, implemented_domain .dim(dim_type.param), - dim_name)) + dim_name)).params() implemented_domain = (implemented_domain & make_slab(implemented_domain.space, dim_name, From 65f504a96e8ee543e8297dda55a8f156175f55e3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 15:25:09 -0500 Subject: [PATCH 078/109] adds license, copyright header --- loopy/schedule/tree.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git 
a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 252c4d2e0..d164d51de 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -1,3 +1,25 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + import pymbolic.primitives as prim import loopy.schedule as schedule import islpy as isl From ffa1d07cdeb1989541f6bd6b91648096e40070bd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 20:12:11 -0500 Subject: [PATCH 079/109] cast domains to be an instance of :class:`LoopKernelDomains` --- loopy/transform/precompute.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 6b3722628..8ffa362e2 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -712,7 +712,10 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, for idom, dom in enumerate(kernel.domains) if idom not in domain_indices] domains_after_combining.insert(domain_indices[-1], combined_domain) - kernel = kernel.copy(domains=domains_after_combining) + + from loopy.kernel import make_loop_kernel_domains + kernel = (kernel + .copy(domains=make_loop_kernel_domains(domains_after_combining))) # }}} From bf60372d6ffd24f1e76cf0258c587b354ae4178d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 22:39:38 -0500 Subject: [PATCH 080/109] corrects test_loopy::test_check_variable_access_ordered_with_aliasing - adds '...' to the kernel_data to account for extra kernel arguments --- test/test_loopy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index ee85b3683..0ff16b7b5 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2589,6 +2589,7 @@ def test_check_for_variable_access_ordering_with_aliasing(): [ lp.TemporaryVariable("a", shape="n+1", base_storage="tmp"), lp.TemporaryVariable("b", shape="n+1", base_storage="tmp"), + ... 
]) from loopy.diagnostic import VariableAccessNotOrdered From 6464138d9e9f8c83e6df125fda2e782069f21501 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 10:09:59 -0500 Subject: [PATCH 081/109] corrects the domain combining invocation --- loopy/transform/precompute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 8ffa362e2..fe5a6385c 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -705,13 +705,13 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, # manipulated to be a single domain. domain_indices = tuple(sorted({kernel.get_home_domain_index(i) - for i in change_inames}, reverse=True)) + for i in change_inames})) combined_domain = kernel.combine_domains(domain_indices) domains_after_combining = [dom for idom, dom in enumerate(kernel.domains) if idom not in domain_indices] - domains_after_combining.insert(domain_indices[-1], combined_domain) + domains_after_combining.insert(domain_indices[0], combined_domain) from loopy.kernel import make_loop_kernel_domains kernel = (kernel From 90229d0718c92cc3f365ffc05a7e063bc6092298 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 11:28:21 -0500 Subject: [PATCH 082/109] allow multiple parents per domain; all_ppd = transitive_closure(ppd) --- loopy/kernel/__init__.py | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 98215d57f..6dd2358ff 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -853,7 +853,7 @@ def id_to_insn(self): @memoize_method def parents_per_domain(self): """Return a list corresponding to self.domains (by index) - containing domain indices which are nested around this + containing a list of domain indices which are nested around this domain. 
Each domains nest list walks from the leaves of the nesting @@ -863,7 +863,7 @@ def parents_per_domain(self): # {{{ exit early strategy: all domains are roots if self.domains.param_dims <= self.get_unwritten_value_args(): - return [None, ] * len(self.domains) + return [frozenset(), ] * len(self.domains) # }}} @@ -873,12 +873,6 @@ def parents_per_domain(self): for idom, dom in enumerate(self.domains): idom_param_vars = (frozenset(dom.get_var_names(dim_type.param)) - self.get_unwritten_value_args()) - if len(idom_param_vars) == 0: - # idom doesn't depend on any inames/variables - # => doesn't impose any nesting criteria - result.append(None) - continue - # outer_inames: inames that must be nested outside the 'set dims' # of 'dom' outer_inames = set() @@ -894,16 +888,9 @@ def parents_per_domain(self): writer_insn, = writer_insns outer_inames.update(self.insn_inames(writer_insn)) - parent_idoms = {hdm[iname] for iname in outer_inames} + parent_idoms = frozenset({hdm[iname] for iname in outer_inames}) - if len(parent_idoms) == 0: - result.append(None) - elif len(parent_idoms) > 1: - raise NotImplementedError("Only one parent per domain supported" - " for now.") - else: - parent_idom, = parent_idoms - result.append(parent_idom) + result.append(parent_idoms) return result @@ -916,27 +903,19 @@ def all_parents_per_domain(self): Each domains nest list walks from the leaves of the nesting tree to the root. 
""" - - result = [] ppd = self.parents_per_domain() # {{{ exit early strategy: all domains are roots - if set(ppd) == {None}: - return [[], ] * len(self.domains) + if set(ppd) == {frozenset()}: + return [frozenset(), ] * len(self.domains) # }}} - for dom, parent in zip(self.domains, ppd): - # keep walking up tree to find *all* parents - dom_result = [] - while parent is not None: - dom_result.insert(0, parent) - parent = ppd[parent] + from pytools.graph import compute_transitive_closure - result.append(dom_result) - - return result + all_ppd = compute_transitive_closure(dict(enumerate(ppd))) + return [all_ppd[i] for i in range(len(all_ppd))] @memoize_method def _get_home_domain_map(self): From a904b49be427c09ba10bf859a55fc7b1d3a9e90b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 11:30:09 -0500 Subject: [PATCH 083/109] cleans up test_apps::test_domain_tree_nesting - "depth" is not correctly defined anywhere in loopy and shouldn't be tested - avoid calling cl_exp on 'int's => ambiguous --- test/test_apps.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/test/test_apps.py b/test/test_apps.py index 6e49e73fa..2f3a383fc 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -619,7 +619,7 @@ def add_types(knl): parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) -def test_domain_tree_nesting(): +def test_domain_tree_nesting(ctx_factory): # From https://github.com/inducer/loopy/issues/78 AS = lp.AddressSpace # noqa @@ -632,7 +632,7 @@ def test_domain_tree_nesting(): TV = lp.TemporaryVariable # noqa - knl = lp.make_kernel(["{[i]: 0 <= i < 12}", + knl = lp.make_kernel(["{[i]: 0 <= i < 2}", "{[j]: 0 <= j < 100}", "{[a_count]: 0 <= a_count < a_end}", "{[b_count]: 0 <= b_count < b_end}"], @@ -647,7 +647,7 @@ def test_domain_tree_nesting(): for b_count <>val = vals[offset + b_count] {dep=offset} end - b_sum = exp(b_sum) {id=b_final} + b_sum = b_sum**2 {id=b_final} out[j,i] = b_sum {dep=b_final} end @@ -661,20 
+661,9 @@ def test_domain_tree_nesting(): address_space=AS.PRIVATE), TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), - lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), - lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)], - name="nested_domain") + ...], seq_dependencies=True, name="nested_domain") - parents_per_domain = knl["nested_domain"].parents_per_domain() - - def depth(i): - if parents_per_domain[i] is None: - return 0 - else: - return 1 + depth(parents_per_domain[i]) - - for i in range(len(parents_per_domain)): - assert depth(i) < 2 + lp.auto_test_vs_ref(knl, ctx_factory(), knl) def test_prefetch_through_indirect_access(): From ea5532117228ffabf2bb99c4b8955fcfdb2f4e61 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 11:40:53 -0500 Subject: [PATCH 084/109] fixup! allow multiple parents per domain; all_ppd = transitive_closure(ppd) - pytools.graph.compute_transitive_closure expects a MutableSet, earlier implementation had passed in a frozenset --- loopy/kernel/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6dd2358ff..870d6fb7e 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -914,8 +914,9 @@ def all_parents_per_domain(self): from pytools.graph import compute_transitive_closure - all_ppd = compute_transitive_closure(dict(enumerate(ppd))) - return [all_ppd[i] for i in range(len(all_ppd))] + all_ppd = compute_transitive_closure({i: set(p) + for i, p in enumerate(ppd)}) + return [frozenset(all_ppd[i]) for i in range(len(all_ppd))] @memoize_method def _get_home_domain_map(self): From 6aaa86741e3d72de95be701cbb1611684e231979 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 11:41:36 -0500 Subject: [PATCH 085/109] fix tests now that domains can have multiple parents --- test/test_domain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/test/test_domain.py b/test/test_domain.py index 03f1bbc2f..52a9d21c9 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -196,7 +196,7 @@ def test_dependent_loop_bounds_3(ctx_factory): target=lp.PyOpenCLTarget(ctx.devices[0]), name="loopy_kernel") - assert knl["loopy_kernel"].parents_per_domain()[1] == 0 + assert knl["loopy_kernel"].parents_per_domain()[1] == frozenset({0}) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -270,7 +270,7 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl["loopy_kernel"].parents_per_domain() == 2*[None] + assert knl["loopy_kernel"].parents_per_domain() == 2*[frozenset()] n = 50 evt, (a, b) = knl(queue, n=n, out_host=True) From 94128385fca75f970079ce2915613ab6859fa405 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 12:04:20 -0500 Subject: [PATCH 086/109] strongVolumeKernels: make 'Nq' a kernel argument --- test/strongVolumeKernels.f90 | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/strongVolumeKernels.f90 b/test/strongVolumeKernels.f90 index b88f88e50..cfca584be 100644 --- a/test/strongVolumeKernels.f90 +++ b/test/strongVolumeKernels.f90 @@ -1,10 +1,11 @@ ! 
straight from gNUMA, do not modify in loopy subroutine strongVolumeKernelR(elements, & - volumeGeometricFactors, D, Q, gradQ, rhsQ) + volumeGeometricFactors, D, Q, gradQ, rhsQ, Nq) implicit none integer*4 elements + integer*4 Nq integer*4 e,i,j,k,n datafloat volumeGeometricFactors(Nq, Nq, Nq, 11, elements) datafloat D(Nq,Nq) @@ -75,10 +76,11 @@ subroutine strongVolumeKernelR(elements, & end subroutine strongVolumeKernelR subroutine strongVolumeKernelS(elements, & - volumeGeometricFactors, D, Q, gradQ, rhsQ) + volumeGeometricFactors, D, Q, gradQ, rhsQ, Nq) implicit none integer*4 elements + integer*4 Nq integer*4 e,i,j,k,n datafloat volumeGeometricFactors(Nq, Nq, Nq, 11, elements) datafloat D(Nq,Nq) @@ -149,10 +151,11 @@ subroutine strongVolumeKernelS(elements, & end subroutine strongVolumeKernelS subroutine strongVolumeKernelT(elements, & - volumeGeometricFactors, D, Q, gradQ, rhsQ) + volumeGeometricFactors, D, Q, gradQ, rhsQ, Nq) implicit none integer*4 elements + integer*4 Nq datafloat volumeGeometricFactors(Nq, Nq, Nq, 11, elements) datafloat D(Nq,Nq) datafloat Q(Nq, Nq, Nq, 8, elements) @@ -216,4 +219,3 @@ subroutine strongVolumeKernelT(elements, & end do end do end subroutine strongVolumeKernelT - From 277a2e784ab159a7faaeacfe17891af07b906279 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 12:27:26 -0500 Subject: [PATCH 087/109] sharpens test_reduction_with_conditional 'if' could dominate the 'for' loop iff the domain was constrained such that the loop couldn't be entered if 'l > 1'. This sort of hoisting is ill-defined and the users need to be more explicit about their requirement. 
--- test/test_reduction.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/test_reduction.py b/test/test_reduction.py index 065d3de46..9936fdfff 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -419,7 +419,7 @@ def test_parallel_multi_output_reduction(ctx_factory): assert max_index == np.argmax(np.abs(a)) -def test_reduction_with_conditional(): +def test_reduction_with_conditional(ctx_factory): # The purpose of the 'l' iname is to force the entire kernel (including the # predicate) into device code. @@ -429,16 +429,14 @@ def test_reduction_with_conditional(): if l > 0 b[l] = sum(i, l*a[i]) end - """, - [lp.ValueArg("n", dtype=np.int32), "..."]) - - knl = lp.tag_inames(knl, "l:g.0") + """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + ref_knl = knl + knl = lp.tag_inames(knl, "l:g.0") code = lp.generate_code_v2(knl).device_code() print(code) - # Check that the if appears before the loop that realizes the reduction. - assert code.index("if") < code.index("for") + lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl) def test_any_all(ctx_factory): From fdeadff797ab8cd944615805f0a5a878510bdc39 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Jun 2021 16:25:55 -0500 Subject: [PATCH 088/109] newly created precompute domain must project out all other inames except the outer/sweep inames --- loopy/transform/precompute.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index fe5a6385c..b174994a8 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -890,6 +890,8 @@ def add_assumptions(d): new_kernel_domains = new_kernel_domains + [new_inames_domain .drop_unused_params()] + new_domain_added_idx = len(new_kernel_domains) - 1 + # }}} else: @@ -898,6 +900,7 @@ def add_assumptions(d): non1_storage_axis_names = [] abm = NoOpArrayToBufferMap() + new_domain_added_idx = None kernel = 
kernel.copy(domains=new_kernel_domains) @@ -1044,7 +1047,22 @@ def add_assumptions(d): precompute_outer_inames = precompute_outer_inames \ | frozenset(non1_storage_axis_names) + if new_domain_added_idx: + parent_domains = kernel.all_parents_per_domain()[new_domain_added_idx] + old_domain = kernel.combine_domains(tuple(sorted(parent_domains)) + + (new_domain_added_idx, )) + + updated_added_domain = (old_domain + .project_out_except( + types=[isl.dim_type.param, isl.dim_type.set], + names=(precompute_outer_inames + | kernel.get_unwritten_value_args()))) + new_domains = kernel.domains.swap(new_domain_added_idx, updated_added_domain) + else: + new_domains = kernel.domains + kernel = kernel.copy( + domains=new_domains, instructions=[ insn.copy(within_inames=precompute_outer_inames) if insn.id == compute_insn_id From cedf7dc8db3b011bc89cc75dd7bff1ca9c29c4c5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Jun 2021 09:33:32 -0500 Subject: [PATCH 089/109] corrects the order of combining domains. 
there should be exactly one home domain associated with each iname --- loopy/transform/precompute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index b174994a8..04defdb20 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -1049,8 +1049,9 @@ def add_assumptions(d): if new_domain_added_idx: parent_domains = kernel.all_parents_per_domain()[new_domain_added_idx] - old_domain = kernel.combine_domains(tuple(sorted(parent_domains)) - + (new_domain_added_idx, )) + old_domain = kernel.combine_domains((new_domain_added_idx, ) + + tuple(sorted(parent_domains, + reverse=True))) updated_added_domain = (old_domain .project_out_except( From ad579f2afb222015287e6cd197b0c883def330ef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 21 Jun 2021 21:42:17 -0500 Subject: [PATCH 090/109] misc minor fixes post rebase --- loopy/codegen/__init__.py | 10 +- loopy/codegen/instruction.py | 25 ++--- loopy/codegen/result.py | 147 ++++++++++++++++----------- loopy/kernel/__init__.py | 14 +-- loopy/library/function.py | 4 +- loopy/schedule/tools.py | 8 +- loopy/schedule/tree.py | 52 +++++----- loopy/target/__init__.py | 30 +++--- loopy/target/c/__init__.py | 61 ++++++----- loopy/target/c/codegen/expression.py | 8 +- loopy/target/cuda.py | 32 ++++-- loopy/target/ispc.py | 24 +++-- loopy/target/numba.py | 8 +- loopy/target/opencl.py | 29 ++++-- loopy/target/pyopencl.py | 32 ++++-- loopy/target/python.py | 48 +++++---- loopy/transform/callable.py | 4 +- 17 files changed, 316 insertions(+), 220 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e78f6a7b4..4a98a876a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -245,14 +245,16 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, from loopy.schedule.tree import (make_schedule_tree, insert_predicates_into_schedule) kernel = 
make_schedule_tree(kernel) - kernel, unvectorizable_inames = insert_predicates_into_schedule(kernel) + kernel, unvectorizable_inames = insert_predicates_into_schedule(kernel, + callables_table) # }}} from loopy.codegen.result import get_idis_for_kernel, CodeGenMapper - codegen_mapper = CodeGenMapper(kernel, unvectorizable_inames) + codegen_mapper = CodeGenMapper(kernel, callables_table, is_entrypoint, + unvectorizable_inames) - codegen_result = codegen_mapper(kernel.schedule) + codegen_result = codegen_mapper(kernel.linearization) seen_dtypes = (codegen_mapper.device_ast_builder.seen_dtypes | codegen_mapper.host_ast_builder.seen_dtypes) @@ -471,7 +473,7 @@ def generate_code_v2(program): implemented_data_infos[func_id] = cgr.implemented_data_info else: assert len(cgr.device_programs) == 1 - callee_fdecls.append(cgr.device_programs[0].ast.fdecl) + callee_fdecls.append(cgr.device_programs[0].ast.contents[0].fdecl) device_programs.extend(cgr.device_programs) device_preambles.extend(cgr.device_preambles) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 7edaf008b..1da5ef6b6 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -29,10 +29,10 @@ from pymbolic.mapper.stringifier import PREC_NONE -def generate_assignment_instruction_code(kernel, insn, ast_builder, - hw_inames_expr, vinfo): - ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr, - vinfo) +def generate_assignment_instruction_code(kernel, callables_table, insn, + ast_builder, hw_inames_expr, vinfo): + ecm = ast_builder.get_expression_to_code_mapper(kernel, callables_table, + hw_inames_expr, vinfo) from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript @@ -58,7 +58,8 @@ def generate_assignment_instruction_code(kernel, insn, ast_builder, del lhs - result = ast_builder.emit_assignment(kernel, insn, hw_inames_expr, vinfo) + result = ast_builder.emit_assignment(kernel, callables_table, insn, + 
hw_inames_expr, vinfo) # {{{ tracing @@ -127,10 +128,10 @@ def generate_assignment_instruction_code(kernel, insn, ast_builder, return result -def generate_call_code(kernel, insn, ast_builder, +def generate_call_code(kernel, callables_table, insn, ast_builder, hw_inames_expr, vinfo): - result = ast_builder.emit_multiple_assignment(kernel, insn, hw_inames_expr, - vinfo) + result = ast_builder.emit_multiple_assignment(kernel, callables_table, + insn, hw_inames_expr, vinfo) # {{{ vectorization handling @@ -150,10 +151,10 @@ def generate_call_code(kernel, insn, ast_builder, return result -def generate_c_instruction_code(kernel, insn, ast_builder, +def generate_c_instruction_code(kernel, callables_table, insn, ast_builder, hw_inames_expr, vinfo): - ecm = ast_builder.get_expression_to_code_mapper(kernel, hw_inames_expr, - vinfo) + ecm = ast_builder.get_expression_to_code_mapper(kernel, callables_table, + hw_inames_expr, vinfo) assert vinfo is None @@ -182,7 +183,7 @@ def generate_c_instruction_code(kernel, insn, ast_builder, return Block(body) -def generate_nop_instruction_code(kernel, insn, ast_builder, +def generate_nop_instruction_code(kernel, callables_table, insn, ast_builder, hw_inames_expr, vinfo): assert vinfo is None return ast_builder.emit_comment("no-op (insn=%s)" % (insn.id)) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index ce5389d55..ca41f1aaf 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -211,8 +211,11 @@ def copy(self, *, in_device=None, iname_exprs=None, class CodeGenMapper(CombineMapper): - def __init__(self, kernel, unvectorizable_inames): + def __init__(self, kernel, callables_table, is_entrypoint, + unvectorizable_inames): self.kernel = kernel + self.callables_table = callables_table + self.is_entrypoint = is_entrypoint self.unvectorizable_inames = unvectorizable_inames self.host_ast_builder = kernel.target.get_host_ast_builder() self.device_ast_builder = kernel.target.get_device_ast_builder() @@ -266,7 
+269,7 @@ def map_schedule(self, expr): # {{{ emit global temporary declarations/initializations tv_init_host_asts = self.host_ast_builder.get_temporary_decls( - self.kernel, subkernel_name=None) + self.kernel, self.callables_table, subkernel_name=None) tv_init_device_asts = [] for tv in sorted((tv for tv in self.kernel.temporary_variables.values() @@ -278,11 +281,12 @@ def map_schedule(self, expr): decl_info, = tv.decl_info(self.kernel.target, index_dtype=self.kernel.index_dtype) decl = self.device_ast_builder.wrap_global_constant( - self.device_ast_builder.get_temporary_decl(self.kernel, tv, + self.device_ast_builder.get_temporary_decl(self.kernel, + self.callables_table, + tv, decl_info)) - rhs = self.device_ast_builder.emit_array_literal(self.kernel, - tv, - tv.initializer) + rhs = self.device_ast_builder.emit_array_literal( + self.kernel, self.callables_table, tv, tv.initializer) init_ast = self.device_ast_builder.emit_initializer(decl, rhs) tv_init_device_asts.append(init_ast) @@ -297,27 +301,36 @@ def map_schedule(self, expr): # }}} - host_fn_body_ast = self.host_ast_builder.ast_block_class(host_tgt_prelude - + tv_init_host_asts - + host_ast) - idis = get_idis_for_kernel(self.kernel) - host_fn_name = (self.kernel.target.host_program_name_prefix - + self.kernel.name - + self.kernel.target.host_program_name_suffix) - host_fn_decl = (self - .host_ast_builder - .get_function_declaration(self.kernel, host_fn_name, idis, - is_generating_device_code=True)) - host_fn_ast = (self - .host_ast_builder - .get_function_definition(self.kernel, host_fn_name, idis, - host_fn_decl, host_fn_body_ast)) - - host_prog = GeneratedProgram(name=host_fn_name, is_device_program=False, - ast=host_fn_ast) - - return CodeGenerationResult(host_prog, device_programs, idis) + + if self.is_entrypoint: + host_fn_body_ast = self.host_ast_builder.ast_block_class( + host_tgt_prelude + tv_init_host_asts + host_ast) + + host_fn_name = (self.kernel.target.host_program_name_prefix + + 
self.kernel.name + + self.kernel.target.host_program_name_suffix) + host_fn_decl = (self + .host_ast_builder + .get_function_declaration(self.kernel, + self.callables_table, + host_fn_name, idis, + is_generating_device_code=True, + is_entrypoint=True) + ) + host_fn_ast = (self + .host_ast_builder + .get_function_definition(self.kernel, host_fn_name, + idis, host_fn_decl, + host_fn_body_ast)) + + host_prog = GeneratedProgram(name=host_fn_name, is_device_program=False, + ast=host_fn_ast) + return CodeGenerationResult(host_prog, device_programs, idis) + else: + return CodeGenerationResult(host_program=None, + device_programs=device_programs, + implemented_data_info=idis) def map_function(self, expr, context): from loopy.codegen.control import synthesize_idis_for_extra_args @@ -326,20 +339,26 @@ def map_function(self, expr, context): # {{{ Host-side: call the kernel from loopy.schedule.tree import InstructionGatherer - gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids_as_exprs( - InstructionGatherer()(expr)) idis = (get_idis_for_kernel(self.kernel) + synthesize_idis_for_extra_args(self.kernel, expr)) dev_fn_decl = (self .device_ast_builder - .get_function_declaration(self.kernel, expr.name, idis, - is_generating_device_code=True)) - host_ast = self.host_ast_builder.get_kernel_call(self.kernel, - expr.name, idis, - # 'idis' include the - # "extra_args" - extra_args=[]) + .get_function_declaration(self.kernel, + self.callables_table, + expr.name, idis, + is_generating_device_code=True, + is_entrypoint=self.is_entrypoint)) + + if self.is_entrypoint: + host_ast = [self.host_ast_builder.get_kernel_call(self.kernel, + self.callables_table, + expr.name, idis, + # 'idis' include the + # "extra_args" + extra_args=[])] + else: + host_ast = [] # }}} @@ -348,7 +367,7 @@ def map_function(self, expr, context): # {{{ record the iname_exprs for downstream elements from functools import reduce - from loopy.kernel.data import GroupIndexTag, LocalIndexTag + from loopy.kernel.data 
import GroupInameTag, LocalInameTag from loopy.isl_helpers import static_min_of_pw_aff from loopy.symbolic import (GroupHardwareAxisIndex, LocalHardwareAxisIndex, @@ -360,23 +379,23 @@ def map_function(self, expr, context): frozenset()) def _hw_iname_expr(iname): - tag, = self.kernel.iname_tags_of_type(iname, (GroupIndexTag, - LocalIndexTag)) + tag, = self.kernel.iname_tags_of_type(iname, (GroupInameTag, + LocalInameTag)) lbound = static_min_of_pw_aff(self .kernel.get_iname_bounds(iname) .lower_bound_pw_aff, constants_only=False) return pw_aff_to_expr(lbound) + (GroupHardwareAxisIndex(tag.axis) - if isinstance(tag, GroupIndexTag) + if isinstance(tag, GroupInameTag) else LocalHardwareAxisIndex(tag.axis)) iname_exprs = {iname: _hw_iname_expr(iname) for iname in all_inames if self.kernel.iname_tags_of_type(iname, - (LocalIndexTag, - GroupIndexTag))} + (LocalInameTag, + GroupInameTag))} # }}} @@ -384,12 +403,15 @@ def _hw_iname_expr(iname): dev_fn_decl = (self .device_ast_builder - .get_function_declaration(self.kernel, expr.name, idis, - is_generating_device_code=True)) + .get_function_declaration(self.kernel, + self.callables_table, + expr.name, idis, + is_generating_device_code=True, + is_entrypoint=self.is_entrypoint)) tgt_prelude = self.device_ast_builder.generate_top_of_body(self.kernel) - temp_decls_asts = self.device_ast_builder.get_temporary_decls(self.kernel, - expr.name) + temp_decls_asts = self.device_ast_builder.get_temporary_decls( + self.kernel, self.callables_table, expr.name) children_res = self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) dev_fn_body_ast = self.device_ast_builder.ast_block_class(tgt_prelude @@ -408,7 +430,7 @@ def _hw_iname_expr(iname): # }}} - return CodeGenMapperAccumulator([host_ast], [dev_prog]) + return CodeGenMapperAccumulator(host_ast, [dev_prog]) # {{{ for loop @@ -458,12 +480,14 @@ def map_for(self, expr, context): if expr.upper_bound != expr.lower_bound: loop_body = ast_builder.ast_block_class(body_ast) 
- loop_ast = [ast_builder.emit_sequential_loop(self.kernel, expr.iname, - self.kernel.index_dtype, - expr.lower_bound, - expr.upper_bound, - loop_body, - context.iname_exprs)] + loop_ast = [ast_builder.emit_sequential_loop(self.kernel, + self.callables_table, + expr.iname, + self.kernel.index_dtype, + expr.lower_bound, + expr.upper_bound, + loop_body, + context.iname_exprs)] else: # special case: if ubound == lbound => just have the body loop_ast = body_ast @@ -488,7 +512,8 @@ def map_if(self, expr, context): if context.in_device else children_res.host_ast) - if_ast = ast_builder.emit_if(self.kernel, expr.condition, if_body, + if_ast = ast_builder.emit_if(self.kernel, self.callables_table, + expr.condition, if_body, context.iname_exprs, context.vectorization_info) @@ -532,26 +557,34 @@ def map_run_instruction(self, expr, context): insn = self.kernel.id_to_insn[expr.insn_id] if isinstance(insn, CallInstruction): - insn_ast = generate_call_code(self.kernel, insn, + insn_ast = generate_call_code(self.kernel, + self.callables_table, + insn, ast_builder, context.iname_exprs, (context .vectorization_info)) elif isinstance(insn, Assignment): - insn_ast = generate_assignment_instruction_code(self.kernel, insn, + insn_ast = generate_assignment_instruction_code(self.kernel, + self.callables_table, + insn, ast_builder, (context .iname_exprs), (context .vectorization_info)) elif isinstance(insn, CInstruction): - insn_ast = generate_c_instruction_code(self.kernel, insn, + insn_ast = generate_c_instruction_code(self.kernel, + self.callables_table, + insn, ast_builder, context.iname_exprs, (context .vectorization_info)) elif isinstance(insn, NoOpInstruction): - insn_ast = generate_nop_instruction_code(self.kernel, insn, + insn_ast = generate_nop_instruction_code(self.kernel, + self.callables_table, + insn, ast_builder, context.iname_exprs, (context diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 870d6fb7e..55954fdd1 100644 --- a/loopy/kernel/__init__.py +++ 
b/loopy/kernel/__init__.py @@ -708,11 +708,11 @@ def __init__(self, domains, instructions, args=None, # these should not both be present raise ValueError( "received both `schedule` and `linearization` args, " - "'LoopKernel.linearization' is deprecated. " + "'LoopKernel.schedule' is deprecated. " "Use 'LoopKernel.linearization'.") elif schedule is not None: warn( - "'LoopKernel.linearization' is deprecated. " + "'LoopKernel.schedule' is deprecated. " "Use 'LoopKernel.linearization'.", DeprecationWarning, stacklevel=2) linearization = schedule @@ -994,7 +994,7 @@ def _get_inames_domain_backend(self, inames): @property def schedule(self): warn( - "LoopKernel.linearization is deprecated. " + "LoopKernel.schedule is deprecated. " "Call LoopKernel.linearization instead, " "will be unsupported in 2022.", DeprecationWarning, stacklevel=2) @@ -1911,14 +1911,16 @@ def get_copy_kwargs(self, **kwargs): domains = kwargs.get("domains", self.domains) kwargs["inames"] = make_iname_dict({k: Iname(k, v) for k, v in iname_to_tags.items()}, - self.domains.set_dims) + domains.set_dims) del kwargs["iname_to_tags"] if "domains" in kwargs: inames = kwargs.get("inames", self.inames) domains = kwargs["domains"] - kwargs["inames"] = {name: inames.get(name, Iname(name, frozenset())) - for name in _get_inames_from_domains(domains)} + kwargs["inames"] = make_iname_dict({k: Iname(k, v.tags) + for k, v in inames.items() + if v.tags}, + domains.set_dims) assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) diff --git a/loopy/library/function.py b/loopy/library/function.py index d7558960a..0e37f6631 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -72,8 +72,8 @@ def emit_call(self, expression_to_code_mapper, expression, target): from pymbolic import evaluate access_info = get_access_info(expression_to_code_mapper.kernel.target, ary, arg.index, lambda expr: evaluate(expr, - expression_to_code_mapper.codegen_state.var_subst_map), - 
expression_to_code_mapper.codegen_state.vectorization_info) + expression_to_code_mapper.var_subst_map), + expression_to_code_mapper.vectorization_info) from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 719d9337b..cf5638020 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -57,10 +57,10 @@ def temporaries_read_in_subkernel(kernel, subkernel): from loopy.kernel.tools import get_subkernel_to_insn_id_map from loopy.schedule.tree import Schedule, get_insns_in_function - if isinstance(kernel.schedule, Schedule): + if isinstance(kernel.linearization, Schedule): insn_ids = get_insns_in_function(kernel, subkernel) else: - assert isinstance(kernel.schedule, list) + assert isinstance(kernel.linearization, list) insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] return frozenset(tv @@ -73,10 +73,10 @@ def temporaries_written_in_subkernel(kernel, subkernel): from loopy.kernel.tools import get_subkernel_to_insn_id_map from loopy.schedule.tree import Schedule, get_insns_in_function - if isinstance(kernel.schedule, Schedule): + if isinstance(kernel.linearization, Schedule): insn_ids = get_insns_in_function(kernel, subkernel) else: - assert isinstance(kernel.schedule, list) + assert isinstance(kernel.linearization, list) insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] return frozenset(tv diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index d164d51de..6b3bc506f 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -23,7 +23,7 @@ import pymbolic.primitives as prim import loopy.schedule as schedule import islpy as isl -from typing import List, Union, Any, Optional, Tuple, FrozenSet +from typing import List, Union, Any, Optional, Mapping, FrozenSet from dataclasses import dataclass, field from functools import reduce from islpy import dim_type @@ -260,7 +260,7 @@ def make_schedule_tree(kernel): # bob: the schedule builder bob = 
ScheduleTreeBuilder.new() - for sched_item in kernel.schedule: + for sched_item in kernel.linearization: if isinstance(sched_item, schedule.CallKernel): bob.make_and_enter_function(sched_item.kernel_name, sched_item.extra_args, @@ -283,7 +283,7 @@ def make_schedule_tree(kernel): else: raise NotImplementedError(type(sched_item)) - kernel = kernel.copy(schedule=bob.exit()) + kernel = kernel.copy(linearization=bob.exit()) return kernel # }}} @@ -466,13 +466,11 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, If *domain* contains any inames going along hardware inames account for those in *implemented_domain*. - :arg gsize: A tuple of :class:`isl.PwAff` denoting the size of the - :returns: An instance of :class:`isl.BasicSet` that includes constraints from *implemented_domain* and constraints arising from constraining hardware inames in *domain* to their corresponding """ - from loopy.kernel.data import AxisTag, GroupIndexTag, LocalIndexTag + from loopy.kernel.data import AxisTag, GroupInameTag, LocalInameTag from loopy.isl_helpers import make_slab for dim_name in domain.get_var_dict(): @@ -482,12 +480,12 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, continue tag, = kernel.iname_tags_of_type(dim_name, AxisTag) - assert isinstance(tag, (GroupIndexTag, LocalIndexTag)) + assert isinstance(tag, (GroupInameTag, LocalInameTag)) lbound = kernel.get_iname_bounds(dim_name).lower_bound_pw_aff size = (gsize[tag.axis] - if isinstance(tag, GroupIndexTag) + if isinstance(tag, GroupInameTag) else @@ -519,8 +517,8 @@ def _implement_hw_axes_in_domains(implemented_domain, domain, class PolyhedronLoopifierContext: implemented_domain: isl.BasicSet hw_inames: FrozenSet[str] - gsize: Optional[Tuple[prim.Expression, ...]] = None - lsize: Optional[Tuple[prim.Expression, ...]] = None + gsize: Optional[Mapping[int, prim.Expression]] = None + lsize: Optional[Mapping[int, prim.Expression]] = None def copy(self, *, implemented_domain=None, hw_inames=None, 
gsize=None, lsize=None): @@ -541,8 +539,9 @@ def copy(self, *, implemented_domain=None, hw_inames=None, gsize=None, class PolyhedronLoopifier(IdentityMapper): - def __init__(self, kernel): + def __init__(self, kernel, callables_table): self.kernel = kernel + self.callables_table = callables_table def map_schedule(self, expr): impl_domain = self.kernel.assumptions @@ -556,7 +555,7 @@ def map_function(self, expr, context): # get the implemented domain for the insn ids in this kernel # Shouldn't be difficult to write a combine mapper for it. gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( - InstructionGatherer()(expr)) + InstructionGatherer()(expr), self.callables_table, return_dict=True) hw_inames = get_all_inames_tagged_with(self.kernel, expr.name, AxisTag) @@ -778,8 +777,8 @@ class Unroller(PolyhedronLoopifier): usual suspects tagged with 'unr`. One use-case could be unrolling could be a fallback implementation for other iname implementations. """ - def __init__(self, kernel, extra_unroll_inames): - super().__init__(kernel) + def __init__(self, kernel, callables_table, extra_unroll_inames): + super().__init__(kernel, callables_table) self.extra_unroll_inames = extra_unroll_inames def map_polyhedral_loop(self, expr, context): @@ -860,8 +859,8 @@ def map_barrier(self, expr, loop_nesting): class PredicateInsertionMapper(PolyhedronLoopifier): - def __init__(self, kernel, loops_containing_barrier): - super().__init__(kernel) + def __init__(self, kernel, callables_table, loops_containing_barrier): + super().__init__(kernel, callables_table) self.loops_containing_barriers = loops_containing_barrier def map_instruction_block(self, expr, context): @@ -1067,17 +1066,17 @@ def homogenize_instruction_blocks(kernel): # TODO: Could be generalized by taking the homogenization criterion as an # argument. 
- new_schedule = InstructionBlockHomogenizer(kernel)(kernel.schedule) - return kernel.copy(schedule=new_schedule) + new_schedule = InstructionBlockHomogenizer(kernel)(kernel.linearization) + return kernel.copy(linearization=new_schedule) -def insert_predicates_into_schedule(kernel): +def insert_predicates_into_schedule(kernel, callables_table): if (kernel.iname_slab_increments and (set(kernel.iname_slab_increments.values()) != {(0, 0)})): raise NotImplementedError assert kernel.state >= KernelState.LINEARIZED - assert isinstance(kernel.schedule, Schedule) + assert isinstance(kernel.linearization, Schedule) # {{{ preprocessing before beginning the predicate insertion. @@ -1085,22 +1084,23 @@ def insert_predicates_into_schedule(kernel): # }}} - schedule = PolyhedronLoopifier(kernel)(kernel.schedule) - schedule = EmptyLoopRemover(kernel)(schedule) + schedule = PolyhedronLoopifier(kernel, callables_table)(kernel.linearization) + schedule = EmptyLoopRemover(kernel, callables_table)(schedule) unvectorizable_inames = UnvectorizableInamesCollector(kernel)(schedule) # TODO: (For now) unvectorizable inames always fallback to unrolling this # should be selected based on the target. 
- schedule = Unroller(kernel, unvectorizable_inames)(schedule) + schedule = Unroller(kernel, callables_table, unvectorizable_inames)(schedule) loops_containing_barrier = BarrieredLoopsCollector()(schedule) - schedule = PredicateInsertionMapper(kernel, loops_containing_barrier)(schedule) - return kernel.copy(schedule=schedule), unvectorizable_inames + schedule = PredicateInsertionMapper(kernel, callables_table, + loops_containing_barrier)(schedule) + return kernel.copy(linearization=schedule), unvectorizable_inames @memoize_on_first_arg def get_insns_in_function(kernel, name): function, = [fn - for fn in FunctionCollector()(kernel.schedule) + for fn in FunctionCollector()(kernel.linearization) if fn.name == name] return InstructionGatherer()(function) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 496a97698..daa39faf3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -192,17 +192,19 @@ def get_function_definition(self, kernel, name, implemented_data_info, function_decl, function_body): raise NotImplementedError - def get_function_declaration(self, kernel, name, implemented_data_info, + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, is_generating_device_code): raise NotImplementedError def generate_top_of_body(self, kernel): return [] - def get_temporary_decls(self, kernel, subkernel_name): + def get_temporary_decls(self, kernel, callables_table, subkernel_name): raise NotImplementedError - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): raise NotImplementedError @property @@ -244,20 +246,21 @@ def get_global_arg_decl(self, name, shape, dtype, is_written): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() - def emit_array_literal(self, kernel, array, value): + def emit_array_literal(self, kernel, 
callables_table, array, value): """ :arg ary: An instance of :class:`loopy.kernel.array.ArrayBase`. """ raise NotImplementedError - def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): + def emit_assignment(self, kernel, callables_table, insn, var_subst_map, + vectorization_info): raise NotImplementedError() - def emit_multiple_assignment(self, kernel, insn, var_subst_map, + def emit_multiple_assignment(self, kernel, callables_table, insn, var_subst_map, vectorization_info): raise NotImplementedError() - def emit_sequential_loop(self, kernel, iname, iname_dtype, + def emit_sequential_loop(self, kernel, callables_table, iname, iname_dtype, lbound, ubound, inner): raise NotImplementedError() @@ -265,7 +268,8 @@ def emit_sequential_loop(self, kernel, iname, iname_dtype, def can_implement_conditionals(self): return False - def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): + def emit_if(self, kernel, callables_table, condition, ast, var_subst_map, + vectorization_info): raise NotImplementedError() def emit_initializer(self, decl, val): @@ -315,18 +319,20 @@ def get_function_definition(self, kernel, name, implemented_data_info, function_decl, function_body): return function_body - def get_function_declaration(self, kernel, name, implemented_data_info, - is_generating_device_code): + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, is_entrypoint): return None - def get_temporary_decls(self, kernel, subkernel_name): + def get_temporary_decls(self, kernel, callables_table, subkernel_name): return [] def get_expression_to_code_mapper(self, kernel, var_subst_map, vectorization_info): return _DummyExpressionToCodeMapper() - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): return _DummyASTBlock([]) @property diff --git 
a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index d9737fdc4..165387f39 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -761,12 +761,13 @@ def get_function_declaration(self, kernel, callables_table, name, [self.idi_to_cgen_declarator(kernel, idi) for idi in implemented_data_info])) - def emit_array_literal(self, kernel, array, value): + def emit_array_literal(self, kernel, callables_table, array, value): """ :arg ary: An instance of :class:`loopy.kernel.array.ArrayBase`. """ data = generate_linearized_array(array, value) - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) from loopy.expression import dtype_to_type_context @@ -780,17 +781,19 @@ def emit_array_literal(self, kernel, array, value): ecm.map_constant(d_i, type_context) for d_i in data))) - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): return self.emit_blank_line() - def get_temporary_decls(self, kernel, subkernel_name): + def get_temporary_decls(self, kernel, callables_table, subkernel_name): if subkernel_name is None: # => host program => no temp dels return [] from loopy.kernel.data import AddressSpace - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) base_storage_decls = [] temp_decls = [] @@ -823,13 +826,13 @@ def get_temporary_decls(self, kernel, subkernel_name): tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( - kernel, tv, idi), + kernel, callables_table, tv, idi), tv.address_space) if tv.initializer is not None: assert tv.read_only decl = Initializer(decl, self.emit_array_literal( - kernel, tv, tv.initializer)) + kernel, callables_table, 
tv, tv.initializer)) temp_decls.append(decl) @@ -973,7 +976,7 @@ def get_c_expression_to_code_mapper(self): from loopy.target.c.codegen.expression import CExpressionToCodeMapper return CExpressionToCodeMapper() - def get_temporary_decl(self, kernel, temp_var, decl_info): + def get_temporary_decl(self, kernel, callables_table, temp_var, decl_info): temp_var_decl = POD(self.target.dtype_to_typename(decl_info.dtype), decl_info.dtype, decl_info.name) @@ -983,7 +986,8 @@ def get_temporary_decl(self, kernel, temp_var, decl_info): if decl_info.shape: from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) temp_var_decl = ArrayOf(temp_var_decl, ecm(p.flattened_product(decl_info.shape), @@ -1048,9 +1052,11 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): return arg_decl - def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): + def emit_assignment(self, kernel, callables_table, insn, var_subst_map, + vectorization_info): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info) assignee_var_name, = insn.assignee_var_names() @@ -1084,17 +1090,18 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): elif isinstance(lhs_atomicity, AtomicInit): self.seen_atomic_dtypes.add(lhs_dtype) - return self.emit_atomic_init( - kernel, var_subst_map, lhs_atomicity, lhs_var, - insn.assignee, insn.expression, - lhs_dtype, rhs_type_context) + return self.emit_atomic_init(kernel, callables_table, + var_subst_map, lhs_atomicity, lhs_var, + insn.assignee, insn.expression, + lhs_dtype, rhs_type_context) elif isinstance(lhs_atomicity, AtomicUpdate): self.seen_atomic_dtypes.add(lhs_dtype) - return self.emit_atomic_update( - kernel, var_subst_map, lhs_atomicity, lhs_var, - 
insn.assignee, insn.expression, - lhs_dtype, rhs_type_context) + return self.emit_atomic_update(kernel, callables_table, + var_subst_map, lhs_atomicity, + lhs_var, insn.assignee, + insn.expression, lhs_dtype, + rhs_type_context) else: raise ValueError("unexpected lhs atomicity type: %s" @@ -1136,7 +1143,8 @@ def emit_tuple_assignment(self, kernel, callables_table, insn, def emit_multiple_assignment(self, kernel, callables_table, insn, var_subst_map, vectorization_info): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info) func_id = insn.expression.function.name @@ -1166,9 +1174,10 @@ def emit_multiple_assignment(self, kernel, callables_table, insn, CExpression(self.get_c_expression_to_code_mapper(), in_knl_callable_as_call)) - def emit_sequential_loop(self, kernel, iname, iname_dtype, lbound, ubound, - inner, var_subst_map): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + def emit_sequential_loop(self, kernel, callables_table, iname, iname_dtype, + lbound, ubound, inner, var_subst_map): + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info=None) from pymbolic import var @@ -1213,10 +1222,12 @@ def emit_collection(self, asts): def can_implement_conditionals(self): return True - def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): + def emit_if(self, kernel, callables_table, condition, ast, var_subst_map, + vectorization_info): assert vectorization_info is None, "cannot be vectorizable if we see an if" from cgen import If - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info=None) return If(ecm(condition), ast) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index adc9243ba..cd3a788ac 100644 --- 
a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -473,7 +473,7 @@ def map_constant(self, expr, type_context): def map_call(self, expr, type_context): return ( - self.codegen_state.callables_table[ + self.callables_table[ expr.function.name].emit_call( expression_to_code_mapper=self, expression=expr, @@ -531,10 +531,10 @@ def map_power(self, expr, type_context): self.rec(expr.exponent, type_context)) else: from loopy.codegen import SeenFunction - clbl = self.codegen_state.ast_builder.known_callables["pow"] + clbl = self.ast_builder.known_callables["pow"] clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, - self.codegen_state.callables_table)[0] - self.codegen_state.seen_functions.add( + self.callables_table)[0] + self.ast_builder.seen_functions.add( SeenFunction( clbl.name, clbl.name_in_target, (base_dtype, exponent_dtype), diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 60974b362..9f88dc523 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -25,7 +25,7 @@ import numpy as np -from pytools import memoize_method +from pytools import memoize_method, UniqueNameGenerator from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -309,6 +309,10 @@ class CUDACASTBuilder(CFamilyASTBuilder): preamble_function_qualifier = "inline __device__" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.var_name_generator = UniqueNameGenerator() + # {{{ library @property @@ -324,9 +328,10 @@ def known_callables(self): def get_function_declaration(self, kernel, callables_table, name, implemented_data_info, is_generating_device_code, is_entrypoint): - fdecl = super().get_function_declaration(kernel, name, + fdecl = super().get_function_declaration(kernel, callables_table, name, implemented_data_info, - is_generating_device_code) + is_generating_device_code, + is_entrypoint) from loopy.target.c import 
FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) @@ -364,9 +369,11 @@ def preamble_generators(self): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): - return ExpressionToCudaCExpressionMapper(kernel, self, var_subst_map, + return ExpressionToCudaCExpressionMapper(kernel, callables_table, self, + var_subst_map, vectorization_info) _VEC_AXES = "xyzw" @@ -443,8 +450,9 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): # {{{ code generation for atomic update - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + def emit_atomic_update(self, kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, rhs_expr, + lhs_dtype, rhs_type_context): from pymbolic.primitives import Sum from cgen import Statement @@ -454,7 +462,9 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, np.int32, np.int64, np.float32, np.float64]: # atomicAdd if isinstance(rhs_expr, Sum): - ecm = self.get_expression_to_code_mapper(codegen_state) + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map=var_subst_map, + vectorization_info=None) new_rhs_expr = Sum(tuple(c for c in rhs_expr.children if c != lhs_expr)) @@ -466,11 +476,11 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, else: from cgen import Block, DoWhile, Assign from loopy.target.c import POD - old_val_var = codegen_state.var_name_generator("loopy_old_val") - new_val_var = codegen_state.var_name_generator("loopy_new_val") + old_val_var = self.var_name_generator("loopy_old_val") + new_val_var = self.var_name_generator("loopy_new_val") from loopy.kernel.data import TemporaryVariable - ecm = codegen_state.expression_to_code_mapper.with_assignments( + ecm = 
self.expression_to_code_mapper.with_assignments( { old_val_var: TemporaryVariable(old_val_var, lhs_dtype), new_val_var: TemporaryVariable(new_val_var, lhs_dtype), diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 10e8b1ba9..0619a2bc1 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -230,8 +230,9 @@ def _arg_names_and_decls(self, kernel, implemented_data_info): # {{{ top-level codegen - def get_function_declaration(self, kernel, name, implemented_data_info, - is_generating_device_code): + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, is_entrypoint): from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask @@ -254,8 +255,10 @@ def get_function_declaration(self, kernel, name, implemented_data_info, # }}} - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) from loopy.schedule.tree import get_insns_in_function @@ -263,7 +266,7 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): from cgen import Statement as S, Block gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insns_in_function(kernel, name)) + get_insns_in_function(kernel, name), callables_table) result = [] if lsize: @@ -288,9 +291,11 @@ def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): - return ExprToISPCExprMapper(kernel, self, var_subst_map, vectorization_info) + return ExprToISPCExprMapper(kernel, 
callables_table, self, + var_subst_map, vectorization_info) def add_vector_access(self, access_expr, index): return access_expr[index] @@ -309,7 +314,7 @@ def emit_barrier(self, synchronization_kind, mem_kind, comment): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, kernel, temp_var, decl_info): + def get_temporary_decl(self, kernel, callables_table, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self.target.dtype_to_typename(decl_info.dtype), decl_info.dtype, decl_info.name) @@ -325,7 +330,8 @@ def get_temporary_decl(self, kernel, temp_var, decl_info): if shape: from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) temp_var_decl = ArrayOf( temp_var_decl, diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 9ed781769..a168446a4 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -62,17 +62,19 @@ def get_function_definition(self, kernel, name, implemented_data_info, def get_python_function_decorators(self): return () - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): from pymbolic.mapper.stringifier import PREC_NONE from genpy import Statement implemented_data_info = implemented_data_info - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insns_in_function(kernel, name)) + get_insns_in_function(kernel, name), callables_table) return Statement( "{}[{}, {}]({})".format( diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 
f9a613401..4f66aa564 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -608,9 +608,11 @@ def get_function_declaration(self, kernel, callables_table, name, is_generating_device_code, is_entrypoint): assert is_generating_device_code - fdecl = super().get_function_declaration(kernel, name, + fdecl = super().get_function_declaration(kernel, callables_table, + name, implemented_data_info, - is_generating_device_code) + is_generating_device_code, + is_entrypoint) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) @@ -653,9 +655,11 @@ def generate_top_of_body(self, kernel): # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): - return ExpressionToOpenCLCExpressionMapper(kernel, self, var_subst_map, + return ExpressionToOpenCLCExpressionMapper(kernel, callables_table, + self, var_subst_map, vectorization_info) def add_vector_access(self, access_expr, index): @@ -744,23 +748,26 @@ def get_constant_arg_decl(self, name, shape, dtype, is_written): # {{{ - def emit_atomic_init(self, kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + def emit_atomic_init(self, kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, + rhs_type_context): # for the CL1 flavor, this is as simple as a regular update with whatever # the RHS value is... 
- return self.emit_atomic_update(kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + return self.emit_atomic_update(kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, + rhs_expr, lhs_dtype, rhs_type_context) # }}} # {{{ code generation for atomic update - def emit_atomic_update(self, kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + def emit_atomic_update(self, kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, rhs_expr, + lhs_dtype, rhs_type_context): from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(kernel, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, var_subst_map=var_subst_map, vectorization_info=None) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b9a433373..5a2642f68 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -695,8 +695,10 @@ def get_function_definition(self, kernel, name, implemented_data_info, Return("_lpy_evt"), ])) - def get_function_declaration(self, kernel, name, implemented_data_info, - is_generating_device_code): + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, + is_generating_device_code, + is_entrypoint): # no such thing in Python return None @@ -708,7 +710,7 @@ def _get_global_temporaries(self, kernel): if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) - def get_temporary_decls(self, kernel, subkernel_name): + def get_temporary_decls(self, kernel, callables_table, subkernel_name): from genpy import Assign, Comment, Line from collections import defaultdict from numbers import Number @@ -720,7 +722,8 @@ def alloc_nbytes(tv): return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + ecm = 
self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) global_temporaries = self._get_global_temporaries(kernel) @@ -772,13 +775,15 @@ def alloc_nbytes(tv): return code_lines - def get_kernel_call(self, kernel, name, implemented_data_info, extra_args): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + def get_kernel_call(self, kernel, callables_table, name, + implemented_data_info, extra_args): + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) from loopy.schedule.tree import get_insns_in_function gsize, lsize = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insns_in_function(kernel, name)) + get_insns_in_function(kernel, name), callables_table) if not gsize: gsize = (1,) @@ -865,9 +870,11 @@ def preamble_generators(self): # }}} - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): - return ExpressionToPyOpenCLCExpressionMapper(kernel, self, var_subst_map, + return ExpressionToPyOpenCLCExpressionMapper(kernel, callables_table, + self, var_subst_map, vectorization_info) # }}} @@ -875,11 +882,14 @@ def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): from loopy.target.opencl import \ VolatileMemExpressionToOpenCLCExpressionMapper - return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, + return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, + callables_table, + self, var_subst_map, vectorization_info) diff --git a/loopy/target/python.py b/loopy/target/python.py index 518f7cdc7..fb05703cb 
100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -29,15 +29,16 @@ from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite, Collection +from genpy import Suite # {{{ expression to code class ExpressionToPythonMapper(StringifyMapper): - def __init__(self, kernel, ast_builder, var_subst_map, vectorization_info, - type_inf_mapper=None): + def __init__(self, kernel, callables_table, ast_builder, var_subst_map, + vectorization_info, type_inf_mapper=None): self.kernel = kernel + self.callables_table = callables_table self.ast_builder = ast_builder self.var_subst_map = var_subst_map @@ -48,7 +49,7 @@ def __init__(self, kernel, ast_builder, var_subst_map, vectorization_info, if type_inf_mapper is None: type_inf_mapper = TypeReader(self.kernel, - self.codegen_state.callables_table) + self.callables_table) self.type_inf_mapper = type_inf_mapper self.seen_functions = set() @@ -85,15 +86,13 @@ def map_subscript(self, expr, enclosing_prec): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - identifier_name = self.codegen_state.callables_table[ - expr.function.name].name + identifier_name = self.callables_table[expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - clbl = self.codegen_state.callables_table[ - expr.function.name] + clbl = self.callables_table[expr.function.name] str_parameters = None number_of_assignees = len([key for key in @@ -165,7 +164,8 @@ def ast_module(self): import genpy return genpy - def get_function_declaration(self, kernel, name, implemented_data_info, + def get_function_declaration(self, kernel, callables_table, name, + implemented_data_info, is_generating_device_code): return None @@ -180,8 +180,9 @@ def get_function_definition(self, kernel, name, implemented_data_info, [idi.name for idi in 
implemented_data_info], function_body) - def get_temporary_decls(self, kernel, subkernel_name): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map={}, + def get_temporary_decls(self, kernel, callables_table, subkernel_name): + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map={}, vectorization_info=None) result = [] @@ -207,10 +208,10 @@ def get_temporary_decls(self, kernel, subkernel_name): return result - def get_expression_to_code_mapper(self, kernel, var_subst_map, - vectorization_info): - return ExpressionToPythonMapper(kernel, self, var_subst_map, - vectorization_info) + def get_expression_to_code_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): + return ExpressionToPythonMapper(kernel, callables_table, self, + var_subst_map, vectorization_info) @property def ast_base_class(self): @@ -236,9 +237,10 @@ def ast_block_scope_class(self): from genpy import Collection return Collection - def emit_sequential_loop(self, kernel, iname, iname_dtype, + def emit_sequential_loop(self, kernel, callables_table, iname, iname_dtype, lbound, ubound, inner, var_subst_map): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info=None) from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM @@ -276,16 +278,19 @@ def emit_collection(self, asts): def can_implement_conditionals(self): return True - def emit_if(self, kernel, condition, ast, var_subst_map, vectorization_info): + def emit_if(self, kernel, callables_table, condition, ast, var_subst_map, + vectorization_info): assert vectorization_info is None from genpy import If from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info) return If(ecm(condition, prec=PREC_NONE), ast) - def 
emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): + def emit_assignment(self, kernel, callables_table, insn, var_subst_map, + vectorization_info): if insn.atomicity: raise NotImplementedError("atomic ops in Python") @@ -295,7 +300,8 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): from pymbolic.mapper.stringifier import PREC_NONE from genpy import Assign - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info) return Assign( diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1f6c9b2b3..a13041be2 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -286,7 +286,7 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): for old_name, callee_iname in callee_knl.inames.items(): new_name = name_map[old_name] - new_inames[new_name] = callee_iname.copy(name=new_name) + new_inames = new_inames.set(new_name, callee_iname.copy(name=new_name)) # }}} @@ -321,7 +321,7 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): # {{{ process domains/assumptions # rename inames - new_domains = callee_knl.domains.copy() + new_domains = callee_knl.domains for old_iname in callee_knl.all_inames(): new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) for dom in new_domains] From 59bc19059eb9960c0c601bb4b10d4f8c20d0e664 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Jun 2021 04:03:23 -0500 Subject: [PATCH 091/109] placate flake8-bugbear --- loopy/codegen/result.py | 6 +++++- loopy/kernel/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index ca41f1aaf..f7214a7e0 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -101,7 +101,11 @@ class CodeGenerationResult(ImmutableRecord): Only added at the very end of code generation. 
""" def __init__(self, host_program, device_programs, implemented_data_info, - host_preambles=[], device_preambles=[]): + host_preambles=None, device_preambles=None): + if host_preambles is None: + host_preambles = [] + if device_preambles is None: + device_preambles = [] super().__init__(host_program=host_program, device_programs=device_programs, implemented_data_info=implemented_data_info, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 55954fdd1..1fc83a4f2 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -870,7 +870,7 @@ def parents_per_domain(self): result = [] hdm = self._get_home_domain_map() - for idom, dom in enumerate(self.domains): + for dom in self.domains: idom_param_vars = (frozenset(dom.get_var_names(dim_type.param)) - self.get_unwritten_value_args()) # outer_inames: inames that must be nested outside the 'set dims' From 40762de9e344681904ce9085b0f63aab3c7aec08 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Jun 2021 04:13:07 -0500 Subject: [PATCH 092/109] placate pylint --- loopy/target/c/__init__.py | 10 ++++++---- loopy/target/cuda.py | 2 +- loopy/target/ispc.py | 11 +++++++---- loopy/target/numba.py | 7 ++++--- loopy/target/opencl.py | 7 +++++-- loopy/target/python.py | 2 +- 6 files changed, 24 insertions(+), 15 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 165387f39..47a48dc48 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1107,12 +1107,14 @@ def emit_assignment(self, kernel, callables_table, insn, var_subst_map, raise ValueError("unexpected lhs atomicity type: %s" % type(lhs_atomicity).__name__) - def emit_atomic_init(self, kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + def emit_atomic_init(self, kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, + rhs_type_context): raise NotImplementedError("atomic updates in %s" % 
type(self).__name__) - def emit_atomic_update(self, kernel, var_subst_map, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + def emit_atomic_update(self, kernel, callables_table, var_subst_map, + lhs_atomicity, lhs_var, lhs_expr, rhs_expr, + lhs_dtype, rhs_type_context): raise NotImplementedError("atomic updates in %s" % type(self).__name__) def emit_tuple_assignment(self, kernel, callables_table, insn, diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 9f88dc523..91c1ec597 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -480,7 +480,7 @@ def emit_atomic_update(self, kernel, callables_table, var_subst_map, new_val_var = self.var_name_generator("loopy_new_val") from loopy.kernel.data import TemporaryVariable - ecm = self.expression_to_code_mapper.with_assignments( + ecm = ecm.with_assignments( { old_val_var: TemporaryVariable(old_val_var, lhs_dtype), new_val_var: TemporaryVariable(new_val_var, lhs_dtype), diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 0619a2bc1..8a97debf9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -385,9 +385,11 @@ def get_value_arg_decl(self, name, shape, dtype, is_written): from cgen.ispc import ISPCUniform return ISPCUniform(result) - def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): + def emit_assignment(self, kernel, callables_table, insn, var_subst_map, + vectorization_info): - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info) assignee_var_name, = insn.assignee_var_names() @@ -502,14 +504,15 @@ def emit_assignment(self, kernel, insn, var_subst_map, vectorization_info): from cgen import Assign return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) - def emit_sequential_loop(self, kernel, iname, iname_dtype, + def emit_sequential_loop(self, kernel, callables_table, iname, iname_dtype, lbound, 
ubound, inner, var_subst_map): from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE from cgen import For, InlineInitializer from cgen.ispc import ISPCUniform - ecm = self.get_expression_to_code_mapper(kernel, var_subst_map, + ecm = self.get_expression_to_code_mapper(kernel, callables_table, + var_subst_map, vectorization_info=None) return For( diff --git a/loopy/target/numba.py b/loopy/target/numba.py index a168446a4..9a87c6ed8 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -167,9 +167,10 @@ def preamble_generators(self): def get_python_function_decorators(self): return ("@_lpy_ncu.jit",) - def get_expression_to_code_mapper(self, kernel, var_subst_map, - vectorization_info): - return NumbaCudaExpressionToPythonMapper(kernel, self, var_subst_map, + def get_expression_to_code_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): + return NumbaCudaExpressionToPythonMapper(kernel, callables_table, self, + var_subst_map, vectorization_info) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 4f66aa564..a5ee310bb 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -913,9 +913,12 @@ def make_subscript(self, array, base_expr, subscript): class VolatileMemOpenCLCASTBuilder(OpenCLCASTBuilder): - def get_expression_to_c_expression_mapper(self, kernel, var_subst_map, + def get_expression_to_c_expression_mapper(self, kernel, callables_table, + var_subst_map, vectorization_info): - return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, self, + return VolatileMemExpressionToOpenCLCExpressionMapper(kernel, + callables_table, + self, var_subst_map, vectorization_info) diff --git a/loopy/target/python.py b/loopy/target/python.py index fb05703cb..8c265b0f6 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -166,7 +166,7 @@ def ast_module(self): def get_function_declaration(self, kernel, callables_table, name, implemented_data_info, - is_generating_device_code): 
+ is_generating_device_code, is_entrypoint): return None def get_function_definition(self, kernel, name, implemented_data_info, From 36542004d9ddf8234ad7a3b8bc45f0017cce180e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 29 Jun 2021 18:37:49 -0500 Subject: [PATCH 093/109] ctarget: do not emit unnecessary braces --- loopy/target/c/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 47a48dc48..077aed1d6 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1178,6 +1178,10 @@ def emit_multiple_assignment(self, kernel, callables_table, insn, def emit_sequential_loop(self, kernel, callables_table, iname, iname_dtype, lbound, ubound, inner, var_subst_map): + from cgen import Block + if isinstance(inner, Block) and len(inner.contents) == 1: + inner, = inner.contents + ecm = self.get_expression_to_code_mapper(kernel, callables_table, var_subst_map, vectorization_info=None) @@ -1227,7 +1231,10 @@ def can_implement_conditionals(self): def emit_if(self, kernel, callables_table, condition, ast, var_subst_map, vectorization_info): assert vectorization_info is None, "cannot be vectorizable if we see an if" - from cgen import If + from cgen import If, Block + if isinstance(ast, Block) and len(ast.contents) == 1: + ast, = ast.contents + ecm = self.get_expression_to_code_mapper(kernel, callables_table, var_subst_map, vectorization_info=None) From 6c5962e4aeb5da19e060c9a2f098a5df289489ce Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 29 Jun 2021 19:22:53 -0500 Subject: [PATCH 094/109] kernel tags could be custom tags: relax the assertion --- loopy/codegen/result.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index f7214a7e0..94c5b4308 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -439,15 +439,9 @@ def _hw_iname_expr(iname): # {{{ for 
loop def map_for(self, expr, context): - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, - VectorizeTag, LoopedIlpTag, - ForceSequentialTag, - InOrderSequentialSequentialTag) + from loopy.kernel.data import VectorizeTag - unr_tags = (UnrolledIlpTag, UnrollTag) vec_tags = (VectorizeTag, ) - seq_tags = (LoopedIlpTag, ForceSequentialTag, - InOrderSequentialSequentialTag) ast_builder = self.device_ast_builder if context.in_device else self.host_ast_builder # noqa: E501 if (self.kernel.iname_tags_of_type(expr.iname, vec_tags) @@ -462,9 +456,6 @@ def map_for(self, expr, context): return self.combine([self.rec(child, dwnstrm_ctx) for child in expr.children]) else: - assert (len(self.kernel.inames[expr.iname].tags) == 0 - or self.kernel.iname_tags_of_type(expr.iname, - seq_tags+unr_tags+vec_tags)) assert expr.step == 1 if expr.upper_bound != expr.lower_bound: From 9431fd2d56da399855cdc05861c9329ab244587f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 29 Jun 2021 21:38:26 -0500 Subject: [PATCH 095/109] fix trace_assignments for non-entrypoint kernels --- loopy/codegen/instruction.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 1da5ef6b6..995bb4591 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -74,19 +74,20 @@ def generate_assignment_instruction_code(kernel, callables_table, insn, from cgen import Statement as S # noqa - gs, ls = kernel.get_grid_size_upper_bounds(callables_table) + gs, ls = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) printf_format = "{}.{}[{}][{}]: {}".format( kernel.name, insn.id, - ", ".join("gid%d=%%d" % i for i in range(len(gs))), - ", ".join("lid%d=%%d" % i for i in range(len(ls))), + ", ".join("gid%d=%%d" % i for i in gs), + ", ".join("lid%d=%%d" % i for i in ls), assignee_var_name) printf_args = ( - ["gid(%d)" % i for i in range(len(gs))] + ["gid(%d)" % i for i in gs] + - 
["lid(%d)" % i for i in range(len(ls))] + ["lid(%d)" % i for i in ls] ) if assignee_indices: From 9ad53068697ae9f77287ccd7c3e6f96b5bbaabda Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 30 Jun 2021 15:25:45 -0500 Subject: [PATCH 096/109] do not access preamble_info's codegen_state --- loopy/target/c/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 077aed1d6..360dc90a2 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -113,7 +113,7 @@ def c99_preamble_generator(preamble_info): inf_or_nan_recorder = InfOrNanInExpressionRecorder() - for insn in preamble_info.codegen_state.kernel.instructions: + for insn in preamble_info.kernel.instructions: insn.with_transformed_expressions(inf_or_nan_recorder) if inf_or_nan_recorder.saw_inf_or_nan: From b9ecf9ffe4dbfe68a3ee20f72d6231cb97d059be Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 4 Jul 2021 01:01:30 -0500 Subject: [PATCH 097/109] move iname tags inside LoopKernelDomains --- loopy/kernel/__init__.py | 122 ++++++++++++++++++++++++--------------- loopy/kernel/creation.py | 5 +- loopy/transform/iname.py | 10 +--- 3 files changed, 81 insertions(+), 56 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 1fc83a4f2..07c82f056 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -43,6 +43,7 @@ from typing import FrozenSet from dataclasses import dataclass, fields from warnings import warn +from functools import reduce # {{{ unique var names @@ -270,6 +271,7 @@ class LoopKernelDomains: _domains: PVector param_to_idoms: PMap home_domain_map: PMap + inames: PMap def __getitem__(self, key): return self._domains[key] @@ -288,25 +290,27 @@ def append(self, dom): | frozenset([idom])) hdm_update = {k: idom for k in dom.get_var_names(dim_type.set)} + inames_update = {k: Iname(k, frozenset()) + for k in dom.get_var_names(dim_type.set)} return 
LoopKernelDomains(_domains=self._domains.append(dom), param_to_idoms=(self.param_to_idoms .update(param_to_idoms_update)), home_domain_map=(self.home_domain_map .update(hdm_update) - )) + ), + inames=(self.inames.update(inames_update))) def swap(self, idom, domain): """ Returns a copy of *self* with its *idom*-th domain replaced with *domain*. """ + assert domain.get_ctx() == isl.DEFAULT_CONTEXT if domain is self._domains[idom]: return self - from functools import reduce - # {{{ swap dim names in home_domain_map new_domains = self._domains.set(idom, domain) @@ -315,6 +319,15 @@ def swap(self, idom, domain): reduce(lambda acc, y: acc.remove(y), self._domains[idom].get_var_names(dim_type.set), self.home_domain_map)) + + inames = reduce(lambda acc, y: acc.set(y, + self.inames.get(y, + Iname(y, frozenset())) + ), + domain.get_var_names(dim_type.set), + reduce(lambda acc, y: acc.remove(y), + self._domains[idom].get_var_names(dim_type.set), + self.inames)) # }}} param_to_idoms = self.param_to_idoms @@ -348,7 +361,8 @@ def swap(self, idom, domain): return LoopKernelDomains(_domains=new_domains, home_domain_map=hdm, - param_to_idoms=param_to_idoms) + param_to_idoms=param_to_idoms, + inames=inames) def delete(self, idom): """ @@ -361,7 +375,6 @@ def delete(self, idom): of calling :meth:`LoopKernelDomains.delete` and :meth:`LoopKernelDomains.insert`. 
""" - from functools import reduce new_domains = self._domains.delete(idom) param_to_idoms = self.param_to_idoms @@ -413,9 +426,14 @@ def delete(self, idom): # }}} + inames = reduce(lambda acc, x: acc.remove(x), + self._domains[idom].get_var_names(dim_type.set), + self.inames) + return LoopKernelDomains(_domains=new_domains, home_domain_map=hdm, - param_to_idoms=param_to_idoms) + param_to_idoms=param_to_idoms, + inames=inames) def insert(self, idom, domain): """ @@ -425,7 +443,6 @@ def insert(self, idom, domain): raise NotImplementedError def extend(self, domains): - from functools import reduce return reduce(lambda x, y: x.append(y), domains, self) def __add__(self, other): @@ -435,6 +452,7 @@ def __add__(self, other): return NotImplemented def __radd__(self, other): + if not isinstance(other, (list, PVector)): return NotImplemented @@ -450,16 +468,20 @@ def __radd__(self, other): # }}} + inames = self.inames + for idom, dom in enumerate(other): for dim in dom.get_var_names(dim_type.set): home_domain_map[dim] = idom + inames = inames.set(dim, Iname(dim)) for dim in dom.get_var_names(dim_type.param): param_to_idoms[dim] = idom return LoopKernelDomains(_domains=other+self._domains, param_to_idoms=pmap(param_to_idoms), - home_domain_map=pmap(home_domain_map)) + home_domain_map=pmap(home_domain_map), + inames=inames) def __iter__(self): return iter(self._domains) @@ -486,8 +508,20 @@ def update_persistent_hash(self, key_hash, key_builder): for field in fields(self): key_builder.rec(key_hash, getattr(self, field.name)) + def with_iname(self, iname): + assert isinstance(iname, Iname) + if iname.name not in self.set_dims: + raise LoopyError(f"Cannot set unknown iname {iname.name}.") + + return LoopKernelDomains(_domains=self._domains, + inames=self.inames.set(iname.name, iname), + home_domain_map=self.home_domain_map, + param_to_idoms=self.param_to_idoms) + def make_loop_kernel_domains(domains): + assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT + for dom in domains) 
param_to_idoms = defaultdict(frozenset) for idom, dom in enumerate(domains): for var in dom.get_var_names(dim_type.param): @@ -497,9 +531,13 @@ def make_loop_kernel_domains(domains): for i_domain, dom in enumerate(domains) for iname in dom.get_var_names(dim_type.set)}) + inames = pmap({iname: Iname(iname) + for iname in home_domain_map}) + return LoopKernelDomains(_domains=pvector(domains), param_to_idoms=pmap(param_to_idoms), - home_domain_map=home_domain_map) + home_domain_map=home_domain_map, + inames=inames) class _not_provided: # noqa: N801 @@ -664,19 +702,33 @@ def __init__(self, domains, instructions, args=None, from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() - if iname_to_tags is not None: - warn("Providing iname_to_tags is deprecated, pass inames instead. " - "Will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) + assert isinstance(domains, LoopKernelDomains) + if inames is not None: + warn("Providing inames is deprecated, pass iname tags with domains" + " instead. Will be unsupported in 2022.", + DeprecationWarning, stacklevel=2) + + domains = reduce(lambda acc, iname: acc.with_iname(iname), + inames.values(), + domains) + + if iname_to_tags is not None: if inames is not None: raise LoopyError("Cannot provide both iname_to_tags and inames to " "LoopKernel.__init__") - inames = make_iname_dict({k: Iname(v) for k, v in iname_to_tags.items()}, - self.domain.set_dims) + warn("Providing iname_to_tags is deprecated, pass inames instead. 
" + "Will be unsupported in 2022.", + DeprecationWarning, stacklevel=2) - assert isinstance(inames, InameDict) + domains = reduce(lambda acc, iname: acc.with_iname( + Iname(iname, + (iname_to_tags + .get(iname, frozenset())) + )), + domains.set_dims, + domains) if index_dtype is None: index_dtype = np.int32 @@ -733,7 +785,6 @@ def __init__(self, domains, instructions, args=None, silenced_warnings=silenced_warnings, temporary_variables=temporary_variables, local_sizes=local_sizes, - inames=inames, substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, @@ -1039,7 +1090,7 @@ def all_inames(self): """ Returns a :class:`frozenset` of the names of all the inames in the kernel. """ - return frozenset(self.inames.keys()) + return self.domains.set_dims @memoize_method def all_params(self): @@ -1830,7 +1881,6 @@ def __setstate__(self, state): "assumptions", "local_sizes", "temporary_variables", - "inames", "substitutions", "iname_slab_increments", "loop_priority", @@ -1897,33 +1947,11 @@ def __ne__(self, other): # }}} - def get_copy_kwargs(self, **kwargs): - if "iname_to_tags" in kwargs: - if "inames" in kwargs: - raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to " - "LoopKernel.get_copy_kwargs") - - warn("Providing iname_to_tags is deprecated, pass inames instead. 
" - "Will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) - - iname_to_tags = kwargs["iname_to_tags"] - domains = kwargs.get("domains", self.domains) - kwargs["inames"] = make_iname_dict({k: Iname(k, v) - for k, v in iname_to_tags.items()}, - domains.set_dims) - del kwargs["iname_to_tags"] - - if "domains" in kwargs: - inames = kwargs.get("inames", self.inames) - domains = kwargs["domains"] - kwargs["inames"] = make_iname_dict({k: Iname(k, v.tags) - for k, v in inames.items() - if v.tags}, - domains.set_dims) - - assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) + @property + def inames(self): + return self.domains.inames + def get_copy_kwargs(self, **kwargs): if "instructions" in kwargs: # Avoid carrying over an invalid cache when instructions are # modified. @@ -1951,6 +1979,10 @@ def copy(self, **kwargs): return super().copy(**kwargs) + def with_iname(self, iname): + new_domains = self.domains.with_iname(iname) + return self.copy(domains=new_domains) + # }}} # vim: foldmethod=marker diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 2a6a98d1c..b4a891d9a 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2431,9 +2431,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): raise LoopyError("assumptions must be either 'str' or BasicSet") # }}} - from loopy.kernel import (_get_inames_from_domains, - make_loop_kernel_domains, make_iname_dict) - inames = make_iname_dict({}, _get_inames_from_domains(domains)) + from loopy.kernel import make_loop_kernel_domains arg_guesser = ArgumentGuesser(domains, instructions, temporary_variables, substitutions, @@ -2454,7 +2452,6 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): options=options, target=target, tags=tags, - inames=inames, assumptions=assumptions, **kwargs) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 913b2822a..dc6377abf 100644 --- a/loopy/transform/iname.py +++ 
b/loopy/transform/iname.py @@ -799,7 +799,6 @@ def parse_tag(tag): # }}} - knl_inames = kernel.inames.copy() for name, new_tag in iname_to_tag.items(): if not new_tag: continue @@ -807,9 +806,9 @@ def parse_tag(tag): if name not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % name) - knl_inames = knl_inames.set(name, knl_inames[name].tagged(new_tag)) + kernel = kernel.with_iname(kernel.inames[name].tagged(new_tag)) - return kernel.copy(inames=knl_inames) + return kernel # }}} @@ -1307,9 +1306,7 @@ def remove_unused_inames(kernel, inames=None): # {{{ remove them domains = kernel.domains - new_inames = kernel.inames for iname in unused_inames: - new_inames = new_inames.remove(iname) # {{{ easy update: iname is only a set dim @@ -1333,8 +1330,7 @@ def remove_unused_inames(kernel, inames=None): domains = domains.swap(idom, dom) - kernel = kernel.copy(domains=domains, - inames=new_inames) + kernel = kernel.copy(domains=domains) # }}} From b478747e202bf0700d370ee3818d7aa1fd331bed Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 4 Jul 2021 11:28:27 -0500 Subject: [PATCH 098/109] removes InamesDict --- loopy/kernel/__init__.py | 146 ++++++++++++-------------------------- loopy/transform/fusion.py | 9 ++- 2 files changed, 51 insertions(+), 104 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 07c82f056..aa52304c5 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -40,7 +40,6 @@ from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname from pyrsistent import pmap, pvector, PVector, PMap -from typing import FrozenSet from dataclasses import dataclass, fields from warnings import warn from functools import reduce @@ -153,95 +152,6 @@ def _get_inames_from_domains(domains): return domains.set_dims -@dataclass(frozen=True) -class InameDict: - """ - A mapping from iname names to corresponding instances of - 
:class:`loopy.kernel.data.Iname`. - - :attr data: An instance of :class:`pyrsistent.PMap` from iname names - to instances of :class:`~loopy.kernel.data.Iname`. - :attr all_inames: A :class:`frozenset` of names of all inames in a - :class:`~loopy.LoopKernel` - - .. note:: - - * Inames that are not a part of *data*, but are seen in - :attr`InameDict.all_inames` are realized as instances of - :class:`~loopy.kernel.data.Iname` with no tags. - - * This class was introduced to cut-down the operation and storage - overhead that comes with maintaining default instances of - :class:`~loopy.kernel.data.Iname`. - - .. automethod:: set - .. automethod:: remove - .. automethod:: discard - """ - data: PMap - all_inames: FrozenSet - - def copy(self, data=None, all_inames=None): - if all_inames is None: - all_inames = self.all_inames - - if data is None: - data = self.data - - return InameDict(data=data, all_inames=all_inames) - - def __getitem__(self, key): - try: - return self.data[key] - except KeyError: - if key in self.all_inames: - return Iname(key, frozenset()) - else: - raise KeyError - - def set(self, key, val): - assert isinstance(val, Iname) - return self.copy(self.data.set(key, val), - self.all_inames | frozenset([val.name])) - - def remove(self, key): - if key not in self.all_inames: - raise LoopyError(f"Cannot remove unknown iname '{key}'") - - return self.copy(self.data.discard(key), - self.all_inames - frozenset([key])) - - def discard(self, key): - return self.copy(self.data.discard(key), - self.all_inames - frozenset([key])) - - def __iter__(self): - return iter(self.all_inames) - - def keys(self): - return iter(self.all_inames) - - def items(self): - return ((k, self[k]) for k in self.keys()) - - def values(self): - return (self[k] for k in self.keys()) - - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. 
- """ - for field in fields(self): - key_builder.rec(key_hash, getattr(self, field.name)) - - -def make_iname_dict(tagged_inames, all_inames): - assert set(tagged_inames) <= all_inames - assert isinstance(tagged_inames, dict) - assert isinstance(all_inames, frozenset) - return InameDict(data=pmap(tagged_inames), all_inames=all_inames) - - @dataclass(frozen=True) class LoopKernelDomains: """ @@ -704,15 +614,6 @@ def __init__(self, domains, instructions, args=None, assert isinstance(domains, LoopKernelDomains) - if inames is not None: - warn("Providing inames is deprecated, pass iname tags with domains" - " instead. Will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) - - domains = reduce(lambda acc, iname: acc.with_iname(iname), - inames.values(), - domains) - if iname_to_tags is not None: if inames is not None: raise LoopyError("Cannot provide both iname_to_tags and inames to " @@ -730,6 +631,17 @@ def __init__(self, domains, instructions, args=None, domains.set_dims, domains) + if inames is not None: + warn("Providing inames is deprecated, pass iname tags with domains" + " instead. Will be unsupported in 2022.", + DeprecationWarning, stacklevel=2) + + domains = reduce(lambda acc, iname: acc.with_iname( + inames.get(iname, + Iname(iname))), + domains.set_dims, + domains) + if index_dtype is None: index_dtype = np.int32 @@ -1952,6 +1864,42 @@ def inames(self): return self.domains.inames def get_copy_kwargs(self, **kwargs): + + if "iname_to_tags" in kwargs: + if "inames" in kwargs: + raise LoopyError("Cannot provide both iname_to_tags and inames to " + "LoopKernel.__init__") + + warn("Providing iname_to_tags is deprecated, pass inames instead. 
" + "Will be unsupported in 2022.", + DeprecationWarning, stacklevel=2) + + domains = kwargs.get("domains", self.domains) + iname_to_tags = kwargs.pop("iname_to_tags") + + domains = reduce(lambda acc, iname: acc.with_iname( + Iname(iname, + (iname_to_tags + .get(iname, frozenset())) + )), + domains.set_dims, + domains) + kwargs["domains"] = domains + + if "inames" in kwargs: + warn("Providing inames is deprecated, pass iname tags with domains" + " instead. Will be unsupported in 2022.", + DeprecationWarning, stacklevel=2) + + domains = kwargs.get("domains", self.domains) + inames = kwargs.pop("inames") + domains = reduce(lambda acc, iname: acc.with_iname( + inames.get(iname, + Iname(iname))), + domains.set_dims, + domains) + kwargs["domains"] = domains + if "instructions" in kwargs: # Avoid carrying over an invalid cache when instructions are # modified. diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index b8820c43f..86a83f0ff 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -231,7 +231,7 @@ def _fuse_two_kernels(kernela, kernelb): # }}} - from loopy.kernel import LoopKernel, make_iname_dict + from loopy.kernel import LoopKernel from pyrsistent import thaw return LoopKernel( domains=new_domains, @@ -245,10 +245,9 @@ def _fuse_two_kernels(kernela, kernelb): local_sizes=_merge_dicts( "local size", kernela.local_sizes, kernelb.local_sizes), temporary_variables=new_temporaries, - inames=make_iname_dict(_merge_dicts("inames", - thaw(kernela.inames.data), - thaw(kernelb.inames.data)), - new_domains.set_dims), + inames=_merge_dicts("inames", + thaw(kernela.inames), + thaw(kernelb.inames)), substitutions=_merge_dicts( "substitution", kernela.substitutions, From c2126323a4811bc03dfffba448fb810d9a7f74ce Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 4 Jul 2021 12:26:24 -0500 Subject: [PATCH 099/109] adds with_inames to easily swap the inames in LoopKernelDomains --- loopy/kernel/__init__.py | 8 ++++++++ 
loopy/transform/precompute.py | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index aa52304c5..8ee32f790 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -418,6 +418,14 @@ def update_persistent_hash(self, key_hash, key_builder): for field in fields(self): key_builder.rec(key_hash, getattr(self, field.name)) + def with_inames(self, inames): + assert isinstance(inames, PMap) + assert set(inames.keys()) == self.set_dims + return LoopKernelDomains(_domains=self._domains, + inames=inames, + home_domain_map=self.home_domain_map, + param_to_idoms=self.param_to_idoms) + def with_iname(self, iname): assert isinstance(iname, Iname) if iname.name not in self.set_dims: diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 04defdb20..bede68ac3 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -715,7 +715,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, from loopy.kernel import make_loop_kernel_domains kernel = (kernel - .copy(domains=make_loop_kernel_domains(domains_after_combining))) + .copy(domains=(make_loop_kernel_domains(domains_after_combining) + .with_inames(kernel.inames)))) # }}} From d13e09d4d8974dfc4f79984b8cf4e8d97b998249 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 6 Jul 2021 09:55:25 -0500 Subject: [PATCH 100/109] make_loop_kernel_domains takes iname tag info --- loopy/kernel/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 8ee32f790..e599ef566 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -437,9 +437,12 @@ def with_iname(self, iname): param_to_idoms=self.param_to_idoms) -def make_loop_kernel_domains(domains): +def make_loop_kernel_domains(domains, inames=None): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) + if inames is None: + 
inames = {} + param_to_idoms = defaultdict(frozenset) for idom, dom in enumerate(domains): for var in dom.get_var_names(dim_type.param): @@ -449,7 +452,7 @@ def make_loop_kernel_domains(domains): for i_domain, dom in enumerate(domains) for iname in dom.get_var_names(dim_type.set)}) - inames = pmap({iname: Iname(iname) + inames = pmap({iname: inames.get(iname, Iname(iname)) for iname in home_domain_map}) return LoopKernelDomains(_domains=pvector(domains), From 1fee4b1d5aefcfc20c5bcc2be3355f15895fcdc4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 6 Jul 2021 14:13:18 -0500 Subject: [PATCH 101/109] define ArrayBase.depends_on --- loopy/kernel/array.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index b6ed5518d..464306532 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1050,6 +1050,31 @@ def none_pass_mapper(s): else: return self + def depends_on(self): + import loopy as lp + from functools import reduce + from loopy.symbolic import get_dependencies + + if self.shape is not None and self.shape is not lp.auto: + shape_deps = reduce(frozenset.union, + (get_dependencies(s) for s in self.shape + if s is not None), + frozenset()) + else: + shape_deps = frozenset() + + if self.dim_tags is not None: + stride_deps = reduce(frozenset.union, + (dim_tag.depends_on() + for dim_tag in self.dim_tags), + frozenset()) + else: + stride_deps = frozenset() + + # offset is not an expression, do not map. + + return shape_deps | stride_deps + def vector_size(self, target): """Return the size of the vector type used for the array divided by the basic data type. 
From fed7f474e416a9789bda96448c87f9efef3ed987 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 6 Jul 2021 14:14:03 -0500 Subject: [PATCH 102/109] include only relevant deps as kernel's inputs/outputs --- loopy/codegen/result.py | 62 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 94c5b4308..6c6af6562 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -179,6 +179,63 @@ def get_idis_for_kernel(kernel): return implemented_data_info +def _filter_idis_for_function(idis, kernel, function): + from functools import reduce + from loopy.schedule.tree import Function, InstructionGatherer + from loopy.kernel.array import ArrayBase + from loopy.kernel.data import InameArg + import islpy as isl + assert isinstance(function, Function) + + name2idi = {idi.name: idi for idi in idis} + insn_ids = InstructionGatherer()(function) + + vars_accessed = reduce(frozenset.union, + (kernel.id_to_insn[insn_id].dependency_names() + for insn_id in insn_ids), + frozenset()) + + # {{{ deps from vars' shape expressions + + shape_expr_deps = set() + + for var_name in vars_accessed: + var = (kernel + .arg_dict.get(var_name, + kernel.temporary_variables.get(var_name))) + if isinstance(var, ArrayBase): + shape_expr_deps.update(var.depends_on()) + + # }}} + + # {{{ domain_deps + + inames_in_fn = reduce(frozenset.union, + (kernel.insn_inames(insn_id) + for insn_id in insn_ids), + frozenset()) + idoms = {kernel.get_home_domain_index(iname) + for iname in inames_in_fn} + dom_deps = reduce(frozenset.union, + (kernel.domains[idom].get_var_names(isl.dim_type.param) + for idom in idoms), + frozenset()) + + # }}} + + all_deps = vars_accessed | shape_expr_deps | dom_deps + + return [idi + for idi in idis + if (idi.name in all_deps + or idi.arg_class is InameArg + or idi.base_name in all_deps + or (idi.offset_for_name is not None + and name2idi[idi.offset_for_name].base_name in all_deps) + 
or (idi.stride_for_name_and_axis is not None + and idi.stride_for_name_and_axis[0] in all_deps))] + + # {{{ program generation top-level @dataclass(frozen=True) @@ -346,6 +403,11 @@ def map_function(self, expr, context): idis = (get_idis_for_kernel(self.kernel) + synthesize_idis_for_extra_args(self.kernel, expr)) + if self.is_entrypoint: + # filter idis iff the function is an entrypoint. For non-entrypoint + # kernels the callsite isn't handled here. + idis = _filter_idis_for_function(idis, self.kernel, expr) + dev_fn_decl = (self .device_ast_builder .get_function_declaration(self.kernel, From e8d2a57a7f0b183bdb5f5e0f89c8bc4a5dde08ea Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Jul 2021 19:15:44 -0500 Subject: [PATCH 103/109] remove reference to codegen_state.is_generating_device_code --- loopy/target/c/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 360dc90a2..9dbdbefbd 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -838,8 +838,7 @@ def get_temporary_decls(self, kernel, callables_table, subkernel_name): else: assert tv.initializer is None - if (tv.address_space == AddressSpace.GLOBAL - and codegen_state.is_generating_device_code): + if tv.address_space == AddressSpace.GLOBAL: # global temps trigger no codegen in the device code continue From f664e305a3f32453005b8637dd65a5aaa381dd3f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Jul 2021 19:16:52 -0500 Subject: [PATCH 104/109] get_idis_for_kernel: memoize --- loopy/codegen/result.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 6c6af6562..fbd39b512 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -25,7 +25,7 @@ from loopy.schedule.tree import CombineMapper from dataclasses import dataclass from typing import Optional, Any, List, Union, Mapping -from pytools import
ImmutableRecord +from pytools import ImmutableRecord, memoize_on_first_arg def process_preambles(preambles): @@ -145,6 +145,7 @@ def all_code(self): # }}} +@memoize_on_first_arg def get_idis_for_kernel(kernel): """ Returns a :class:`list` of :class:`~loopy.codegen.ImplementedDataInfo` for From 37998d0296908fda1f731fdf8f522f0cb1d85537 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Jul 2021 19:18:03 -0500 Subject: [PATCH 105/109] gets rid of a spurious comment --- loopy/schedule/tree.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 6b3bc506f..5c563d703 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -552,8 +552,6 @@ def map_schedule(self, expr): def map_function(self, expr, context): from loopy.kernel.data import AxisTag - # get the implemented domain for the insn ids in this kernel - # Shouldn't be difficult to write a combine mapper for it. gsize, lsize = self.kernel.get_grid_sizes_for_insn_ids( InstructionGatherer()(expr), self.callables_table, return_dict=True) From 5ab48ff859a3c89666051fe68a186654bcb39b5e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Jul 2021 19:22:27 -0500 Subject: [PATCH 106/109] Unvectorizable -> UnvectorizableError --- loopy/schedule/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index 5c563d703..fef3da2bc 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -664,7 +664,7 @@ def map_polyhedral_loop(self, expr): from loopy.diagnostic import warn from loopy.symbolic import pw_aff_to_expr from loopy.expression import VectorizabilityChecker - from loopy.codegen import Unvectorizable + from loopy.codegen import UnvectorizableError from loopy.kernel.instruction import MultiAssignmentBase if self.kernel.iname_tags_of_type(expr.iname, VectorizeTag): @@ -731,7 +731,7 @@ def map_polyhedral_loop(self, expr): try: lhs_is_vector = vcheck(insn.assignee) 
rhs_is_vector = vcheck(insn.expression) - except Unvectorizable as e: + except UnvectorizableError as e: warn(self.kernel, "vectorize_failed", f"Vectorization of '{expr.iname}' failed due to '{e}'" f" in '{insn.id}'.") From 961101a322bf5a87602cec604ded1d54d724239d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 16 Jul 2021 19:24:47 -0500 Subject: [PATCH 107/109] linearize only if needed --- loopy/codegen/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 4a98a876a..d5808d1c6 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -441,8 +441,9 @@ def generate_code_v2(program): from loopy.type_inference import infer_unknown_types program = infer_unknown_types(program, expect_completion=True) - from loopy.schedule import linearize - program = linearize(program) + if program.state < KernelState.LINEARIZED: + from loopy.schedule import linearize + program = linearize(program) # Why diverge? Generated code for a non-entrypoint kernel and an entrypoint # kernel isn't same for a general loopy target. 
For example in OpenCL, a From ddf5a2dafceb18f2ffb1820cadbbbf98c3cc42c6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Aug 2021 15:28:36 -0500 Subject: [PATCH 108/109] inner_condition: remove divs before taking a gist --- loopy/schedule/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index fef3da2bc..a4d79f0d1 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -955,7 +955,7 @@ def map_polyhedral_loop(self, expr, context): impl_domain).params()) inner_condition = _align_and_gist( - domain, + domain.remove_divs(), _align_and_intersect( _align_and_intersect(set_implemented_in_loop, impl_domain), From afb6e7c0d5ea44a1901137a93453e2ea2ec0e1f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 18 Feb 2022 00:28:41 -0600 Subject: [PATCH 109/109] in realize_reduction emit LoopKernelDomains --- loopy/transform/realize_reduction.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 2f8e3abe8..50dd77dfb 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -49,6 +49,7 @@ from loopy.diagnostic import ( LoopyError, warn_with_kernel, ReductionIsNotTriangularError) from loopy.transform.instruction import replace_instruction_ids_in_insn +from loopy.kernel import make_loop_kernel_domains # {{{ reduction realization context @@ -1750,7 +1751,10 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes, def map_reduction(expr, *, red_realize_ctx, nresults): kernel_with_updated_domains = red_realize_ctx.kernel.copy( - domains=red_realize_ctx.domains) + domains=make_loop_kernel_domains( + red_realize_ctx.domains, + red_realize_ctx.kernel.inames, + )) from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) @@ -1916,7 +1920,7 @@ def realize_reduction_for_single_kernel(kernel, 
callables_table, cb_mapper = RealizeReductionCallbackMapper(map_reduction) insn_queue = kernel.instructions[:] - domains = kernel.domains[:] + domains = list(kernel.domains) inames_added_for_scan = set() @@ -2086,7 +2090,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, kernel = kernel.copy( instructions=finished_insns + insn_queue, temporary_variables=new_temporary_variables, - domains=domains) + domains=make_loop_kernel_domains( + domains, kernel.inames)) from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, red_realize_ctx.additional_iname_tags)