Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
004b759
Added support of Intel OpenCL for FPGA devices
Mar 4, 2020
3e51e49
put resnet18 middle layers to run on vta
Mar 9, 2020
082f64e
adapt to the code base
zhanghaohit Mar 9, 2020
b6bc82a
auto device_copy feature for vta
zhanghaohit Mar 18, 2020
ef153e2
bugfix for AddDeviceCopy pass; add Mul for vta simulation
zhanghaohit Mar 24, 2020
87461d1
intelfocl support in samples
zhanghaohit Mar 26, 2020
bd79e83
sync all insts and uops in one batch
zhanghaohit Apr 3, 2020
f8eaef9
support for static auto-tune
zhanghaohit Apr 8, 2020
82cbd4f
update cost calculation formula
zhanghaohit Apr 9, 2020
a810b85
bugfix for vta add schedule
zhanghaohit Apr 9, 2020
3a8e244
bugfix for insn buffer overflow
zhanghaohit Apr 10, 2020
5c7ead7
tune vta relay refine
zhanghaohit Apr 13, 2020
cc96cbb
separate cost function from general method_methods
zhanghaohit Apr 13, 2020
d880b3b
vta mobilenetG prediction script
zhanghaohit Apr 16, 2020
f80c3e0
quickfix for auto-tune segfault
zhanghaohit Apr 20, 2020
dadf045
add dcgan support (simulation)
zhanghaohit May 6, 2020
bb3dc0e
make sync in batch as an option
zhanghaohit May 6, 2020
cb46477
quickfix for buffer overflow
zhanghaohit May 11, 2020
4ede466
bugfix for allocated_ destructor order
zhanghaohit May 11, 2020
4f375d5
refine device annotation
zhanghaohit May 14, 2020
d16d5ec
auto-tune for vta alu ops
zhanghaohit May 21, 2020
a752638
bugfix: make get_workload consistent with master_op selection
zhanghaohit May 22, 2020
1b7aa58
some fixes after rebase with master
zhanghaohit Jun 10, 2020
127ae4a
update vta-hw commit
zhanghaohit Jun 10, 2020
a6cd975
Rename VTA_MEM_ID_ACC_8 to VTA_MEM_ID_ACC_8BIT
Jun 11, 2020
06af08b
back-compatible other vta hardware impl
zhanghaohit Jun 12, 2020
0855a4a
update vta-hw commit
Jun 12, 2020
6397792
update vta-hw commit
Jun 12, 2020
e43981f
remove unneeded code
zhanghaohit Jun 14, 2020
d699384
refine graphpack and deploy exp
zhanghaohit Jun 14, 2020
4dbcdf5
some bugfix
zhanghaohit Jun 15, 2020
41374c4
remove dcgan and mobilenet tutorial
zhanghaohit Jun 15, 2020
75f7272
some bugfix and code optimize
zhanghaohit Jun 15, 2020
c8a3574
some minor fix and code refine
zhanghaohit Jun 16, 2020
7ca6f40
remove rapidjson dep (use picojson)
zhanghaohit Jun 16, 2020
12554d5
bugfix for tune alu vta
zhanghaohit Jun 16, 2020
b8d842e
cleanup
zhanghaohit Jun 18, 2020
7fc25b0
Merge branch 'master' into feature/opencl
zhanghaohit Jun 18, 2020
02b3ea0
coding style
zhanghaohit Jun 18, 2020
a1cd048
update vta-hw commit
zhanghaohit Jun 18, 2020
6960c6a
lint
zhanghaohit Jun 18, 2020
14020b7
clean up unneeded code
zhanghaohit Jun 20, 2020
b6c1763
Move AOCLUtils from Intel FPGA into 3rdparty directory
Jul 15, 2020
2075649
merge from master
zhanghaohit Jul 18, 2020
c0f918c
remove unnecessary comment
zhanghaohit Jul 18, 2020
348fb91
api to program intelfocl aocx
zhanghaohit Jul 20, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
some bugfix and code optimize
  • Loading branch information
zhanghaohit committed Jun 15, 2020
commit 75f7272552c8f8ff3f76c754b0b5008860f1ca05
32 changes: 13 additions & 19 deletions src/relay/quantize/realize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ float ChooseDomScale(const std::vector<const QRealizeIntExprNode*>& nptrs) {

/* \brief Unify the dom scale of arguments */
Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args,
DataType* dtype_ptr, Expr* scale_ptr) {
DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) {
static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize");
const QConfig& cfg = QConfig::Current();

Expand All @@ -345,27 +345,19 @@ Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args

// unify the data type
CHECK_EQ(ref_args.size(), args.size());
DataType dtype;

// FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU
// but this may be not necessary for other devices
// if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
// dtype = cfg->dtype_input;
// } else {
// dtype = cfg->dtype_activation;
// }
dtype = cfg->dtype_activation;
if (dtype.is_void()) {
if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
dtype = cfg->dtype_input;
} else {
dtype = cfg->dtype_activation;
}
}

for (size_t i = 0; i < ret.size(); ++i) {
auto ref_arg = ref_args[i].as<CallNode>();
if (nptrs[i]->dtype != dtype) {
auto new_arg = Cast(ret[i], dtype);

// FIXME(zhanghao): do not fuse float32 cast
if (nptrs[i]->dtype == DataType::Float(32)) {
ret.Set(i, StopFusion(new_arg));
} else {
ret.Set(i, new_arg);
}
ret.Set(i, Cast(ret[i], dtype));
} else if (ref_arg && ref_arg->op.same_as(simulated_quantize) &&
ref_arg->attrs.as<SimulatedQuantizeAttrs>()->kind == kQInput) {
auto new_arg = Cast(ret[i], cfg->dtype_input);
Expand All @@ -392,7 +384,9 @@ Expr AddRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectR
if (new_args[0].as<QRealizeIntExprNode>() && new_args[1].as<QRealizeIntExprNode>()) {
DataType dtype;
Expr dom_scale;
Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale);
// execute the operation with activation data type.
const QConfig& cfg = QConfig::Current();
Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation);
Expr ret = ForwardOp(ref_call, ret_args);
return QRealizeIntExpr(ret, dom_scale, dtype);
}
Expand Down
8 changes: 5 additions & 3 deletions vta/python/vta/top/op.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def is_cast_op(op):
output = outs[0]
s = te.create_schedule([x.op for x in outs])
te.schedule.AutoInlineInjective(s)
# s[output].fuse(s[output].op.axis)

env = get_env()
# other target does not support alu-only ops
Expand Down Expand Up @@ -190,8 +189,11 @@ def multiply_strategy_vta(attrs, inputs, out_type, target):
return strategy


reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")
env = get_env()
# other target does not support alu-only ops
if env.TARGET in ["sim", "tsim", "intelfocl"]:
reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")


@_strategy.conv2d_strategy.register("vta")
Expand Down
20 changes: 7 additions & 13 deletions vta/python/vta/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,9 +381,10 @@ def _fold_buffer_dim(buf, scope, elem_block):

def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold):
elem_block = elem_bytes * 8 // elem_width
if buf.dtype != dtype:
raise RuntimeError("Expect buffer type to be %s instead of %s" %
(dtype, buf.dtype))
# remove the checking as we have load_int8 insn
Comment thread
zhanghaohit marked this conversation as resolved.
Outdated
# if buf.dtype != dtype:
# raise RuntimeError("Expect buffer type to be %s instead of %s" %
# (dtype, buf.dtype))
shape, strides = buf.shape, buf.strides
if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0):
raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block))
Expand Down Expand Up @@ -549,20 +550,13 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value):

_check_compact(dst)

# FIXME(zhanghao): optimize
# for int8 -> int32 cast/load
orig_dtype = src.dtype
if src.dtype != data_type:
assert(data_type == "int%d" % env.ACC_WIDTH and \
src.dtype == "int%d" % env.INP_WIDTH)
src.dtype = data_type

x_size, y_size, x_stride, offset = _get_2d_pattern(
src, elem_width, elem_bytes, data_type,
dst.scope, allow_fold=allow_fold)

if orig_dtype != src.dtype:
src.dtype = orig_dtype
if data_type != src.dtype:
assert(data_type == "int%d" % env.ACC_WIDTH and \
src.dtype == "int%d" % env.INP_WIDTH)
mem_type = env.dev.MEM_ID_ACC_8BIT

irb = tvm.tir.ir_builder.create()
Expand Down
9 changes: 4 additions & 5 deletions vta/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,7 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
CHECK(fpga_buff_ != nullptr);
CHECK(fpga_buff_phy_);
uint32_t buff_size = dram_buffer_.size() * elem_bytes_;

CHECK(buff_size <= kMaxBytes);
// Copy contents of DRAM buffer to FPGA buff
VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size);
Expand Down Expand Up @@ -1322,7 +1323,6 @@ class CommandQueue {
if (insn_queue_.count() == 0) return;
// Synchronization for the queues
uop_queue_.AutoReadBarrier();

insn_queue_.AutoReadBarrier();
// Dump instructions if debug enabled
if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
Expand All @@ -1333,7 +1333,7 @@ class CommandQueue {
VTA_OPCODE_FINISH);

// Make sure that we don't exceed contiguous physical memory limits
CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER);
int timeout =
VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles);
CHECK_EQ(timeout, 0);
Expand Down Expand Up @@ -1481,9 +1481,8 @@ class CommandQueue {

void CheckInsnOverFlow() {
// At each API call, we can at most commit:
// one pending store, one pending load, and one uop
// FIXME(zhanghao): check why there are 5 insns
if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
// at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH
if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) {
this->AutoSync();
}
}
Expand Down