Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/tvm/relay/op/strategy/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
and stride_w == 1
and dilation_h == 1
and dilation_w == 1
and attrs["groups"] == 1
):
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd),
Expand All @@ -688,7 +689,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
(N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
):
) and out_type == "float16":
Comment thread
masahi marked this conversation as resolved.
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore),
Expand Down
4 changes: 1 addition & 3 deletions python/tvm/relay/op/strategy/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,10 +545,8 @@ def _compute_conv3d(attrs, inputs, out_type):
(dilation_d, dilation_h, dilation_w) = dilation
if dilation_d < 1 or dilation_h < 1 or dilation_w < 1:
raise ValueError("Dilation should be positive value")
if groups != 1:
raise ValueError("Not support arbitrary group number for conv3d")

args = [inputs[0], inputs[1], strides, padding, dilation]
args = [inputs[0], inputs[1], strides, padding, dilation, groups]
if need_layout:
args.append(layout)
args.append(out_dtype)
Expand Down
17 changes: 15 additions & 2 deletions python/tvm/te/operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def placeholder(shape, dtype=None, name="placeholder"):
return _ffi_api.Placeholder(shape, dtype, name)


def compute(shape, fcompute, name="compute", tag="", attrs=None):
def compute(shape, fcompute, name="compute", tag="", attrs=None, varargs_names=None):
"""Construct a new tensor by computing over the shape domain.

The compute rule is result[axis] = fcompute(axis)
Expand All @@ -78,6 +78,10 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
attrs: dict, optional
The additional auxiliary attributes about the compute.

varargs_names: list, optional
The names to use for each of the varargs. If not supplied, the varargs
will be called i0, i1, ...

Returns
-------
tensor: Tensor
Expand All @@ -97,7 +101,16 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
arg_names = ["i%d" % i for i in range(out_ndim)]
elif argspec.varargs is not None:
# if there is a varargs, it takes the remaining dimensions of out_ndim
arg_names = argspec.args + [f"i{i}" for i in range(out_ndim - len(argspec.args))]
num_remaining_args = out_ndim - len(argspec.args)
if varargs_names is not None:
if len(varargs_names) != num_remaining_args:
raise RuntimeError(
f"Number of varargs ({num_remaining_args}) does not match number"
f"of varargs_names ({len(varargs_names)})"
)
arg_names = argspec.args + varargs_names
else:
arg_names = argspec.args + [f"i{i}" for i in range(out_ndim - len(argspec.args))]
else:
arg_names = argspec.args
# if there are fewer args than out dimensions, the remaining dimensions
Expand Down
22 changes: 15 additions & 7 deletions python/tvm/topi/cuda/conv3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@


@autotvm.register_topi_compute("conv3d_ncdhw.cuda")
def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
"""Conv3D operator in NCDHW layout for cuda backend.

Parameters
Expand All @@ -49,6 +49,9 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float
dilation: int or a list/tuple of three ints
dilation size, or [dilation_depth, dilation_height, dilation_width]

groups: int
Number of groups

out_dtype: str
The output type. This is used for mixed precision.

Expand All @@ -57,7 +60,7 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype="float
output : tvm.te.Tensor
5-D with shape [batch, out_channel, out_depth, out_height, out_width]
"""
return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype)
return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, groups, out_dtype)


@autotvm.register_topi_schedule("conv3d_ncdhw.cuda")
Expand All @@ -82,15 +85,15 @@ def schedule_conv3d_ncdhw(cfg, outs):
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv3d_ncdhw":
if "conv3d_ncdhw" in op.tag:
schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NCDHW", "conv3d_ncdhw.cuda")

traverse_inline(s, outs[0].op, _callback)
return s


@autotvm.register_topi_compute("conv3d_ndhwc.cuda")
def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
"""Conv3d operator in NDHWC layout for cuda backend.

Parameters
Expand All @@ -110,12 +113,15 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float
dilation: int or a list/tuple of three ints
dilation size, or [dilation_depth, dilation_height, dilation_width]

groups: int
Number of groups

Returns
-------
Output : tvm.te.Tensor
5-D with shape [batch, out_depth, out_height, out_width, out_channel]
"""
return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype)
return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, groups, out_dtype)


@autotvm.register_topi_schedule("conv3d_ndhwc.cuda")
Expand All @@ -140,7 +146,7 @@ def schedule_conv3d_ndhwc(cfg, outs):
s = te.create_schedule([x.op for x in outs])

def _callback(op):
if op.tag == "conv3d_ndhwc":
if "conv3d_ndhwc" in op.tag:
schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NDHWC", "conv3d_ndhwc.cuda")

traverse_inline(s, outs[0].op, _callback)
Expand All @@ -149,7 +155,7 @@ def _callback(op):

@autotvm.register_topi_compute("conv3d_cudnn.cuda")
def conv3d_cudnn(
cfg, data, kernel, strides, padding, dilation, layout="NCDHW", out_dtype="float32"
cfg, data, kernel, strides, padding, dilation, groups, layout="NCDHW", out_dtype="float32"
):
"""Conv3D operator for cuda backend.

Expand Down Expand Up @@ -194,6 +200,8 @@ def conv3d_cudnn(
raise ValueError("Unsupported layout %s in cudnn" % layout)
CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)

assert groups == 1, "conv3d_cudnn does not support groups"

# handle dilation
stride_d, stride_h, stride_w = (
(strides, strides, strides) if isinstance(strides, int) else strides
Expand Down
3 changes: 2 additions & 1 deletion python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,9 @@ def get_strides(extents):


@autotvm.register_topi_compute("conv3d_ndhwc_tensorcore.cuda")
def conv3d_ndhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, out_dtype):
def conv3d_ndhwc_tensorcore(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
"""Compute conv3d with tensorcore for NDHWC layout"""
assert groups == 1, "tensorcore conv3d does not support groups"
return ndhwc_tensorcore_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype)


Expand Down
9 changes: 7 additions & 2 deletions python/tvm/topi/cuda/conv3d_winograd.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,9 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed):


@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda")
def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype):
def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, groups, out_dtype):
"""Conv3d NCDHW using winograd optimization"""
assert groups == 1, "conv3d_ncdhw_winograd only supports a single group"
CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
# Check if we can transform depth.
if 2 < KD < 8 and KD == KH:
Expand Down Expand Up @@ -650,9 +652,12 @@ def _callback(op):

@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda")
def conv3d_ncdhw_winograd_without_weight_transform(
cfg, data, kernel, strides, padding, dilation, out_dtype
cfg, data, kernel, strides, padding, dilation, groups, out_dtype
):
"""Conv3d NCDHW winograd without weight transform."""
assert (
groups == 1
), "conv3d_ncdhw_winograd_without_weight_transform does not support more than one group"
A, B, C, _, _ = get_const_tuple(kernel.shape)
# Check if we can transform depth.
if A == B == C:
Expand Down
99 changes: 40 additions & 59 deletions python/tvm/topi/nn/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from collections import namedtuple
import re
from typing import Union, Sequence
from typing import Union, Sequence, Optional
import numpy as np

import tvm
Expand Down Expand Up @@ -313,63 +313,18 @@ def conv2d_nhwc(
output : tvm.te.Tensor
4-D with shape [batch, out_height, out_width, out_channel]
"""
assert isinstance(stride, int) or len(stride) == 2
assert isinstance(dilation, int) or len(dilation) == 2

if isinstance(stride, int):
stride_h = stride_w = stride
else:
stride_h, stride_w = stride

if isinstance(dilation, int):
dilation_h = dilation_w = dilation
else:
dilation_h, dilation_w = dilation

if auto_scheduler_rewritten_layout:
# Infer shape for the rewritten layout
kernel_h, kernel_w, channel, num_filter = auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout, ["ry", "rx", "rc", "ff"]
)
auto_scheduler.remove_index_check(Filter)
else:
kernel_h, kernel_w, channel, num_filter = Filter.shape

batch, in_height, in_width, in_channel = Input.shape
# compute the output shape
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
padding, (dilated_kernel_h, dilated_kernel_w)
)
out_channel = num_filter
out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
pad_before = [0, pad_top, pad_left, 0]
pad_after = [0, pad_down, pad_right, 0]
PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
rc = te.reduce_axis((0, in_channel), name="rc")
ry = te.reduce_axis((0, kernel_h), name="ry")
rx = te.reduce_axis((0, kernel_w), name="rx")
Output = te.compute(
(batch, out_height, out_width, out_channel),
lambda nn, yy, xx, ff: te.sum(
PaddedInput[
nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
].astype(out_dtype)
* Filter[ry, rx, rc, ff].astype(out_dtype),
axis=[ry, rx, rc],
),
name="Conv2dOutput",
tag="conv2d_nhwc",
attrs={"layout_free_placeholders": [Filter]},
return conv(
Input,
Filter,
stride,
padding,
dilation,
1,
"NHWC",
out_dtype,
auto_scheduler_rewritten_layout,
)

if auto_scheduler_rewritten_layout:
Output = auto_scheduler.rewrite_compute_body(Output, auto_scheduler_rewritten_layout)

return Output


def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="float32"):
"""Conv2D operator for nChw[x]c layout.
Expand Down Expand Up @@ -758,6 +713,7 @@ def conv(
groups: int,
order: str,
out_dtype: Union[str, None] = None,
auto_scheduler_rewritten_layout: Optional[str] = None,
):
"""Convolution operator in NCHW or NHWC layout.

Expand Down Expand Up @@ -796,6 +752,9 @@ def conv(
Elements are converted to this type before elementwise multiplication
and summation.

auto_scheduler_rewritten_layout: str
Layout from autoscheduler's layout rewriting.

Returns
-------
Output : tvm.te.Tensor
Expand Down Expand Up @@ -840,6 +799,15 @@ def conv(
permutation_to_kernel
].tolist()

# Autoscheduler may have messed with the input layout, so we extract the
# dimensions that it gives us
if auto_scheduler_rewritten_layout:
num_filter, _, *kernel_dimensions = auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout,
["ff", "rc"] + [f"r{i}" for i in ["y", "x", "z"][: len(kernel_dimensions)]],
)
auto_scheduler.remove_index_check(filt)

assert in_channel % groups == 0, "input channels must divide group size"
assert num_filter % groups == 0, "output channels must divide group size"

Expand All @@ -858,15 +826,21 @@ def conv(
pad_after = list(np.array([0, 0] + pad_end)[permutation_from])
temp = pad(inp, pad_before, pad_after, name="pad_temp")
rc = te.reduce_axis((0, in_channel // groups), name="rc")
rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in enumerate(kernel_dimensions)]
rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in zip(["y", "x", "z"], kernel_dimensions)]

def compute(*args):
nn, ff, *dim_indices = list(np.array(args)[permutation_to])

if groups == 1:
simplified_channel_index = rc
else:
simplified_channel_index = ff // (num_filter // groups) * (in_channel // groups) + rc

return te.sum(
temp.__getitem__(
tuple(
np.array(
[nn, ff // (num_filter // groups) * (in_channel // groups) + rc]
[nn, simplified_channel_index]
+ [
di * stride + r * dil
for di, stride, r, dil in zip(dim_indices, strides, rs, dilations)
Expand All @@ -882,13 +856,20 @@ def compute(*args):
axis=np.array([rc, *rs])[permutation_from_reductions].tolist(),
)

return te.compute(
out = te.compute(
list(np.array([batch, out_channel] + out_dimensions)[permutation_from]),
compute,
# tag is expected to be lowercase
tag=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
name=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
attrs={"layout_free_placeholders": [filt]},
varargs_names=list(np.array(["nn", "ff", "yy", "xx", "zz"])[permutation_from]),
)
# if we used autoscheduler's changed layout we need to rewrite the ordering
# of the output dimensions
if auto_scheduler_rewritten_layout:
out = auto_scheduler.rewrite_compute_body(out, auto_scheduler_rewritten_layout)
return out


def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
Expand Down
Loading