Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/tvm/relay/op/strategy/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
and stride_w == 1
and dilation_h == 1
and dilation_w == 1
and attrs["groups"] == 1
):
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd),
Expand All @@ -688,7 +689,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
(N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
):
) and out_type == "float16":
Comment thread
masahi marked this conversation as resolved.
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore),
Expand Down
4 changes: 1 addition & 3 deletions python/tvm/relay/op/strategy/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,10 +545,8 @@ def _compute_conv3d(attrs, inputs, out_type):
(dilation_d, dilation_h, dilation_w) = dilation
if dilation_d < 1 or dilation_h < 1 or dilation_w < 1:
raise ValueError("Dilation should be positive value")
if groups != 1:
raise ValueError("Not support arbitrary group number for conv3d")

args = [inputs[0], inputs[1], strides, padding, dilation]
args = [inputs[0], inputs[1], strides, padding, dilation, groups]
if need_layout:
args.append(layout)
args.append(out_dtype)
Expand Down
17 changes: 15 additions & 2 deletions python/tvm/te/operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def placeholder(shape, dtype=None, name="placeholder"):
return _ffi_api.Placeholder(shape, dtype, name)


def compute(shape, fcompute, name="compute", tag="", attrs=None):
def compute(shape, fcompute, name="compute", tag="", attrs=None, varargs_names=None):
"""Construct a new tensor by computing over the shape domain.

The compute rule is result[axis] = fcompute(axis)
Expand All @@ -78,6 +78,10 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
attrs: dict, optional
The additional auxiliary attributes about the compute.

varargs_names: list, optional
The names to use for each of the varargs. If not supplied, the varargs
will be called i0, i1, ...

Returns
-------
tensor: Tensor
Expand All @@ -97,7 +101,16 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
arg_names = ["i%d" % i for i in range(out_ndim)]
elif argspec.varargs is not None:
# if there is a varargs, it takes the remaining dimensions of out_ndim
arg_names = argspec.args + [f"i{i}" for i in range(out_ndim - len(argspec.args))]
num_remaining_args = out_ndim - len(argspec.args)
if varargs_names is not None:
if len(varargs_names) != num_remaining_args:
raise RuntimeError(
f"Number of varargs ({num_remaining_args}) does not match number"
f"of varargs_names ({len(varargs_names)})"
)
arg_names = argspec.args + varargs_names
else:
arg_names = argspec.args + [f"i{i}" for i in range(out_ndim - len(argspec.args))]
else:
arg_names = argspec.args
# if there are fewer args than out dimensions, the remaining dimensions
Expand Down
22 changes: 15 additions & 7 deletions python/tvm/topi/cuda/conv3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@


@autotvm.register_topi_compute("conv3d_ncdhw.cuda")
def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
"""Conv3D operator in NCDHW layout for cuda backend.

Parameters
Expand All @@ -49,6 +49,9 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float
dilation: int or a list/tuple of three ints
dilation size, or [dilation_depth, dilation_height, dilation_width]

groups: int
Number of groups

out_dtype: str
The output type. This is used for mixed precision.

Expand All @@ -57,7 +60,7 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float
output : tvm.te.Tensor
5-D with shape [batch, out_channel, out_depth, out_height, out_width]
"""
return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype)
return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, groups, out_dtype)


@autotvm.register_topi_schedule("conv3d_ncdhw.cuda")
Expand All @@ -82,15 +85,15 @@ def schedule_conv3d_ncdhw(cfg, outs):
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv3d_ncdhw":
if "conv3d_ncdhw" in op.tag:
schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NCDHW", "conv3d_ncdhw.cuda")

traverse_inline(s, outs[0].op, _callback)
return s


@autotvm.register_topi_compute("conv3d_ndhwc.cuda")
def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
"""Conv3d operator in NDHWC layout for cuda backend.

Parameters
Expand All @@ -110,12 +113,15 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float
dilation: int or a list/tuple of three ints
dilation size, or [dilation_depth, dilation_height, dilation_width]

groups: int
Number of groups

Returns
-------
Output : tvm.te.Tensor
5-D with shape [batch, out_depth, out_height, out_width, out_channel]
"""
return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype)
return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, groups, out_dtype)


@autotvm.register_topi_schedule("conv3d_ndhwc.cuda")
Expand All @@ -140,7 +146,7 @@ def schedule_conv3d_ndhwc(cfg, outs):
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv3d_ndhwc":
if "conv3d_ndhwc" in op.tag:
schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NDHWC", "conv3d_ndhwc.cuda")

traverse_inline(s, outs[0].op, _callback)
Expand All @@ -149,7 +155,7 @@ def _callback(op):

@autotvm.register_topi_compute("conv3d_cudnn.cuda")
def conv3d_cudnn(
cfg, data, kernel, strides, padding, dilation, layout="NCDHW", out_dtype="float32"
cfg, data, kernel, strides, padding, dilation, groups, layout="NCDHW", out_dtype="float32"
):
"""Conv3D operator for cuda backend.

Expand Down Expand Up @@ -194,6 +200,8 @@ def conv3d_cudnn(
raise ValueError("Unsupported layout %s in cudnn" % layout)
CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)

assert groups == 1, "conv3d_cudnn does not support groups"

# handle dilation
stride_d, stride_h, stride_w = (
(strides, strides, strides) if isinstance(strides, int) else strides
Expand Down
3 changes: 2 additions & 1 deletion python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,9 @@ def get_strides(extents):


@autotvm.register_topi_compute("conv3d_ndhwc_tensorcore.cuda")
def conv3d_ndhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, out_dtype):
def conv3d_ndhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
"""Compute conv3d with tensorcore for NDHWC layout"""
assert groups == 1, "tensorcore conv3d does not support groups"
return ndhwc_tensorcore_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype)


Expand Down
9 changes: 7 additions & 2 deletions python/tvm/topi/cuda/conv3d_winograd.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,9 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed):


@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda")
def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
"""Conv3d NCDHW using winograd optimization"""
assert groups == 1, "conv3d_ncdhw_winograd only supports a single group"
CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
# Check if we can transform depth.
if 2 < KD < 8 and KD == KH:
Expand Down Expand Up @@ -650,9 +652,12 @@ def _callback(op):

@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda")
def conv3d_ncdhw_winograd_without_weight_transform(
cfg, data, kernel, strides, padding, dilation, out_dtype
cfg, data, kernel, strides, padding, dilation, groups, out_dtype
):
"""Conv3d NCDHW winograd without weight transform."""
assert (
groups == 1
), "conv3d_ncdhw_winograd_without_weight_transform does not support more than one group"
A, B, C, _, _ = get_const_tuple(kernel.shape)
# Check if we can transform depth.
if A == B == C:
Expand Down
99 changes: 40 additions & 59 deletions python/tvm/topi/nn/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from collections import namedtuple
import re
from typing import Union, Sequence
from typing import Union, Sequence, Optional
import numpy as np

import tvm
Expand Down Expand Up @@ -313,63 +313,18 @@ def conv2d_nhwc(
output : tvm.te.Tensor
4-D with shape [batch, out_height, out_width, out_channel]
"""
assert isinstance(stride, int) or len(stride) == 2
assert isinstance(dilation, int) or len(dilation) == 2

if isinstance(stride, int):
stride_h = stride_w = stride
else:
stride_h, stride_w = stride

if isinstance(dilation, int):
dilation_h = dilation_w = dilation
else:
dilation_h, dilation_w = dilation

if auto_scheduler_rewritten_layout:
# Infer shape for the rewritten layout
kernel_h, kernel_w, channel, num_filter = auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout, ["ry", "rx", "rc", "ff"]
)
auto_scheduler.remove_index_check(Filter)
else:
kernel_h, kernel_w, channel, num_filter = Filter.shape

batch, in_height, in_width, in_channel = Input.shape
# compute the output shape
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
padding, (dilated_kernel_h, dilated_kernel_w)
)
out_channel = num_filter
out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
pad_before = [0, pad_top, pad_left, 0]
pad_after = [0, pad_down, pad_right, 0]
PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
rc = te.reduce_axis((0, in_channel), name="rc")
ry = te.reduce_axis((0, kernel_h), name="ry")
rx = te.reduce_axis((0, kernel_w), name="rx")
Output = te.compute(
(batch, out_height, out_width, out_channel),
lambda nn, yy, xx, ff: te.sum(
PaddedInput[
nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
].astype(out_dtype)
* Filter[ry, rx, rc, ff].astype(out_dtype),
axis=[ry, rx, rc],
),
name="Conv2dOutput",
tag="conv2d_nhwc",
attrs={"layout_free_placeholders": [Filter]},
return conv(
Input,
Filter,
stride,
padding,
dilation,
1,
"NHWC",
out_dtype,
auto_scheduler_rewritten_layout,
)

if auto_scheduler_rewritten_layout:
Output = auto_scheduler.rewrite_compute_body(Output, auto_scheduler_rewritten_layout)

return Output


def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="float32"):
"""Conv2D operator for nChw[x]c layout.
Expand Down Expand Up @@ -758,6 +713,7 @@ def conv(
groups: int,
order: str,
out_dtype: Union[str, None] = None,
auto_scheduler_rewritten_layout: Optional[str] = None,
):
"""Convolution operator in NCHW or NHWC layout.

Expand Down Expand Up @@ -796,6 +752,9 @@ def conv(
Elements are converted to this type before elementwise multiplication
and summation.

auto_scheduler_rewritten_layout: str
Layout from autoscheduler's layout rewriting.

Returns
-------
Output : tvm.te.Tensor
Expand Down Expand Up @@ -840,6 +799,15 @@ def conv(
permutation_to_kernel
].tolist()

# Autoscheduler may have messed with the input layout, so we extract the
# dimensions that it gives us
if auto_scheduler_rewritten_layout:
num_filter, _, *kernel_dimensions = auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout,
["ff", "rc"] + [f"r{i}" for i in ["y", "x", "z"][: len(kernel_dimensions)]],
)
auto_scheduler.remove_index_check(filt)

assert in_channel % groups == 0, "input channels must divide group size"
assert num_filter % groups == 0, "output channels must divide group size"

Expand All @@ -858,15 +826,21 @@ def conv(
pad_after = list(np.array([0, 0] + pad_end)[permutation_from])
temp = pad(inp, pad_before, pad_after, name="pad_temp")
rc = te.reduce_axis((0, in_channel // groups), name="rc")
rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in enumerate(kernel_dimensions)]
rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in zip(["y", "x", "z"], kernel_dimensions)]

def compute(*args):
nn, ff, *dim_indices = list(np.array(args)[permutation_to])

if groups == 1:
simplified_channel_index = rc
else:
simplified_channel_index = ff // (num_filter // groups) * (in_channel // groups) + rc

return te.sum(
temp.__getitem__(
tuple(
np.array(
[nn, ff // (num_filter // groups) * (in_channel // groups) + rc]
[nn, simplified_channel_index]
+ [
di * stride + r * dil
for di, stride, r, dil in zip(dim_indices, strides, rs, dilations)
Expand All @@ -882,13 +856,20 @@ def compute(*args):
axis=np.array([rc, *rs])[permutation_from_reductions].tolist(),
)

return te.compute(
out = te.compute(
list(np.array([batch, out_channel] + out_dimensions)[permutation_from]),
compute,
# tag is expected to be lowercase
tag=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
name=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
attrs={"layout_free_placeholders": [filt]},
varargs_names=list(np.array(["nn", "ff", "yy", "xx", "zz"])[permutation_from]),
)
# if we used autoscheduler's changed layout we need to rewrite the ordering
# of the output dimensions
if auto_scheduler_rewritten_layout:
out = auto_scheduler.rewrite_compute_body(out, auto_scheduler_rewritten_layout)
return out


def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
Expand Down
Loading