import loopy as lp
import numpy as np
from pymbolic.primitives import *
import immutables
# Build the loopy kernel named "p2p". The make_kernel call receives, in order:
#   1. a list of loop domains (set-notation strings, one per group of inames),
#   2. the instruction text (one assignment per line, each followed by
#      {id=..., dep=..., inames=...} options),
#   3. the list of argument and temporary-variable declarations,
# followed by the lang_version and name keywords.
p2p_knl = lp.make_kernel(
[
# itgt_box ranges over [0, ntgt_boxes); ntgt_boxes is declared as a ValueArg below.
"[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }",
"{ [iknl] : iknl = 0 }",
# isrc_box ranges over [isrc_box_start, isrc_box_end); both bounds are
# temporaries assigned by insn_3 / insn_4 in the instruction text.
"[isrc_box_end, isrc_box_start] -> { [isrc_box] : isrc_box_start <= isrc_box < isrc_box_end }",
"{ [idim, idim_0, idim_1] : 0 <= idim <= 1 and 0 <= idim_0 <= 1 and 0 <= idim_1 <= 1 }",
"{ [istrength] : istrength = 0 }",
# inner spans 0..31 here; the isrc_prefetch_inner domain further down is
# parameterized on inner and additionally constrains it to 0..25.
"{ [inner] : 0 <= inner <= 31 }",
"{ [itgt_offset_outer] : itgt_offset_outer = 0 }",
"{ [iprefetch] : iprefetch = 0 }",
"[inner] -> { [isrc_prefetch_inner] : isrc_prefetch_inner = 0 and 0 <= inner <= 25 }",
# isrc_offset is limited both to 0..25 and to the source count
# isrc_end - isrc_start, shifted by 26*iprefetch.
"[iprefetch, isrc_end, isrc_start] -> { [isrc_offset] : isrc_offset >= 0 and -26iprefetch <= isrc_offset < -26iprefetch + isrc_end - isrc_start and isrc_offset <= 25 }",
],
# Instruction text, summarized (no comments are placed inside the string so
# that the literal stays unchanged):
#   - insn .. insn_7: per target box, look up the box id, its target range
#     and its range in source_box_starts; form itgt = itgt_start +
#     itgt_offset_outer*32 + inner.
#   - init_acc / prefetch_tgt: zero acc[iknl] and copy targets[:, itgt] into
#     tgt_center.
#   - src_box_insn_*: per source box, look up the box id and its source range.
#   - prefetch_*: copy sources[:, ...] and strength[0, ...] into rows 0..1 and
#     row 2 of local_isrc at column isrc_prefetch_new.
#   - insn_9 .. update_acc_0: d = tgt_center - local_isrc[:, isrc_offset],
#     expr = d[0]**2 + d[1]**2, and acc[0] += expr*log(sqrt(expr))*strength_0.
#   - write_csr: result[0, itgt] = knl_0_scaling*acc[0], where knl_0_scaling
#     is (1/8)/pi.
# NOTE(review): cond_itgt, cond_isrc and isrc are assigned but never read by
# any other instruction in this text, and no instruction carries a predicate.
'''
knl_0_scaling = (1 / 8)*3.141592653589793**(-1) {id=insn, inames=+inner:itgt_box}
tgt_ibox = target_boxes[itgt_box] {id=insn_0, inames=inner:itgt_box}
itgt_start = box_target_starts[tgt_ibox] {id=insn_1, dep=insn_0, inames=inner:itgt_box}
itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=insn_2, dep=insn_0:insn_1, inames=inner:itgt_box}
isrc_box_start = source_box_starts[itgt_box] {id=insn_3, inames=inner:itgt_box}
isrc_box_end = source_box_starts[itgt_box + 1] {id=insn_4, inames=inner:itgt_box}
itgt_offset = itgt_offset_outer*32 + inner {id=insn_5, inames=inner:itgt_offset_outer:itgt_box}
itgt = itgt_offset + itgt_start {id=insn_6, dep=insn_5:insn_1, inames=inner:itgt_offset_outer:itgt_box}
cond_itgt = itgt < itgt_end {id=insn_7, dep=insn_2:insn_6, inames=inner:itgt_offset_outer:itgt_box}
acc[iknl] = 0 {id=init_acc, inames=iknl:inner:itgt_offset_outer:itgt_box}
tgt_center[idim_0] = targets[idim_0, itgt] {id=prefetch_tgt, dep=insn_7:insn_6, inames=inner:itgt_offset_outer:idim_0:itgt_box}
src_ibox = source_box_lists[isrc_box] {id=src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box}
isrc_start = box_source_starts[src_ibox] {id=src_box_insn_1, dep=src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box}
isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] {id=src_box_insn_2, dep=src_box_insn_1:src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box}
isrc_prefetch_new = isrc_prefetch_inner*32 + inner {id=prefetch_insn1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box}
isrc_prefetch = iprefetch*26 + isrc_prefetch_inner*32 + inner {id=prefetch_insn2, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box}
cond_isrc = isrc_prefetch < isrc_end + (-1)*isrc_start {id=prefetch_insn3, dep=prefetch_insn2:src_box_insn_2:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box}
local_isrc[idim_1, isrc_prefetch_new] = sources[idim_1, isrc_prefetch + isrc_start] {id=prefetch_src, dep=prefetch_insn3:prefetch_insn2:prefetch_insn1:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:idim_1:isrc_box:itgt_box}
local_isrc[istrength + 2, isrc_prefetch_new] = strength[istrength, isrc_prefetch + isrc_start] {id=prefetch_charge, dep=prefetch_insn3:prefetch_insn2:prefetch_insn1:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:istrength:isrc_box:itgt_box}
isrc = isrc_offset + iprefetch*26 + isrc_start {id=insn_8, dep=insn_7:src_box_insn_1, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
d[idim] = tgt_center[idim] + (-1)*local_isrc[idim, isrc_offset] {id=insn_9, dep=prefetch_src:insn_7:prefetch_tgt, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:idim:isrc_box:itgt_box}
strength_0 = local_isrc[2, isrc_offset] {id=insn_10, dep=insn_7:prefetch_charge, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
expr = d[0]*d[0] + d[1]*d[1] {id=insn_11, dep=insn_7:insn_9, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
cse_exprvar = sqrt(expr) {id=insn_12, dep=insn_7:insn_11, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
pair_result_0 = expr*log(cse_exprvar)*strength_0 {id=insn_13, dep=insn_11:insn_12:insn_10:insn_7, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
acc[0] = acc[0] + pair_result_0 {id=update_acc_0, dep=insn_7:init_acc:insn_13, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box}
result[0, itgt] = knl_0_scaling*acc[0] {id=write_csr, dep=insn_7:update_acc_0:insn_6:insn, inames=inner:itgt_offset_outer:itgt_box}
''', [
# --- Kernel arguments: global arrays and scalar sizes. ---
# NOTE(review): sources_s0/_s1, targets_s0/_s1, strength_s0 and result_s0 are
# declared here but never referenced in the instruction text above.
lp.GlobalArg(
name="sources", dtype=np.float64,
shape=(2, Variable('nsources')), for_atomic=False),
lp.GlobalArg(
name="sources_s0", dtype=np.float64,
shape=(Variable('nsources'),), for_atomic=False),
lp.GlobalArg(
name="sources_s1", dtype=np.float64,
shape=(Variable('nsources'),), for_atomic=False),
lp.GlobalArg(
name="targets", dtype=np.float64,
shape=(2, Variable('ntargets')), for_atomic=False),
lp.GlobalArg(
name="targets_s0", dtype=np.float64,
shape=(Variable('ntargets'),), for_atomic=False),
lp.GlobalArg(
name="targets_s1", dtype=np.float64,
shape=(Variable('ntargets'),), for_atomic=False),
lp.ValueArg(
name="nsources",
dtype=np.int32),
lp.ValueArg(
name="ntargets",
dtype=np.int32),
# The int32 index arrays below are declared with shape=None (no shape given).
lp.GlobalArg(
name="box_target_starts", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="box_target_counts_nonchild", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="box_source_starts", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="box_source_counts_nonchild", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="source_box_starts", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="source_box_lists", dtype=np.int32,
shape=None, for_atomic=False),
lp.GlobalArg(
name="strength", dtype=np.float64,
shape=(1, Variable('nsources')), for_atomic=False),
lp.GlobalArg(
name="strength_s0", dtype=np.float64,
shape=(Variable('nsources'),), for_atomic=False),
lp.GlobalArg(
name="result", dtype=np.float64,
shape=(1, Variable('ntargets')), for_atomic=False),
lp.GlobalArg(
name="result_s0", dtype=np.float64,
shape=(Variable('ntargets'),), for_atomic=False),
lp.ValueArg(
name="ntgt_boxes",
dtype=np.int32),
lp.GlobalArg(
name="target_boxes", dtype=np.int32,
shape=(Variable('ntgt_boxes'),), for_atomic=False),
# --- Temporaries: one declaration per name assigned in the instruction text. ---
# All are in the PRIVATE address space except local_isrc (last entry), which
# is LOCAL.
lp.TemporaryVariable(
name="tgt_center",
dtype=np.float64,
shape=(2,), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="cse_exprvar",
dtype=np.float64,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="knl_0_scaling",
dtype=np.float64,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="tgt_ibox",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="itgt_start",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="itgt_end",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_box_start",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_box_end",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="itgt_offset",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="itgt",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="cond_itgt",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="acc",
dtype=np.float64,
shape=(1,), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="src_ibox",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_start",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_end",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_prefetch_new",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc_prefetch",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="cond_isrc",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="isrc",
dtype=np.int32,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="d",
dtype=np.float64,
shape=(2,), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="strength_0",
dtype=np.float64,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="expr",
dtype=np.float64,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
lp.TemporaryVariable(
name="pair_result_0",
dtype=np.float64,
shape=(), for_atomic=False,
address_space=lp.AddressSpace.PRIVATE,
read_only=False,
),
# Staging buffer written by prefetch_src / prefetch_charge and read by
# insn_9 / insn_10: rows 0..1 hold source coordinates, row 2 the strength.
# Its second axis has 26 entries, matching the 0..25 bounds in the domains.
lp.TemporaryVariable(
name="local_isrc",
dtype=np.float64,
shape=(3, 26), for_atomic=False,
address_space=lp.AddressSpace.LOCAL,
read_only=False,
),
],
lang_version=(2018, 2),
name="p2p",
)
# Attach an implementation tag to each iname, one call per tag and in the
# same order as the original sequence of calls. Per the loopy iname-tag
# syntax, "unr" requests unrolling, "g.0" maps to group axis 0 and "l.0" to
# local axis 0.
for iname_tag in (
    "istrength:unr",
    "idim_1:unr",
    "idim_0:unr",
    "itgt_box:g.0",
    "inner:l.0",
    "idim:unr",
):
    p2p_knl = lp.tag_inames(p2p_knl, iname_tag)

# Wrap the single kernel into a translation unit and run code generation;
# the generated device code is requested but its value is discarded.
t_unit = lp.merge([p2p_knl])
lp.generate_code_v2(t_unit).device_code()
The following kernel still fails with
Details