Some bug fixes and code optimizations

apache · zhanghaohit · Mar 4, 2020 · Mar 9, 2020 · Mar 9, 2020 · Mar 18, 2020
commit 75f7272552c8f8ff3f76c754b0b5008860f1ca05
diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
@@ -330,7 +330,7 @@ float ChooseDomScale(const std::vector<const QRealizeIntExprNode*>& nptrs) {
 
 /* \brief Unify the dom scale of arguments */
 Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args,
-                            DataType* dtype_ptr, Expr* scale_ptr) {
+                            DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) {
   static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize");
   const QConfig& cfg = QConfig::Current();
 
@@ -345,27 +345,19 @@ Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args
 
   // unify the data type
   CHECK_EQ(ref_args.size(), args.size());
-  DataType dtype;
 
-  // FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU
-  // but this may be not necessary for other devices
-  // if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
-  //   dtype = cfg->dtype_input;
-  // } else {
-  //   dtype = cfg->dtype_activation;
-  // }
-  dtype = cfg->dtype_activation;
+  if (dtype.is_void()) {
+    if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
+      dtype = cfg->dtype_input;
+    } else {
+      dtype = cfg->dtype_activation;
+    }
+  }
+
   for (size_t i = 0; i < ret.size(); ++i) {
     auto ref_arg = ref_args[i].as<CallNode>();
     if (nptrs[i]->dtype != dtype) {
-      auto new_arg = Cast(ret[i], dtype);
-
-      // FIXME(zhanghao): do not fuse float32 cast
-      if (nptrs[i]->dtype == DataType::Float(32)) {
-        ret.Set(i, StopFusion(new_arg));
-      } else {
-        ret.Set(i, new_arg);
-      }
+      ret.Set(i, Cast(ret[i], dtype));
     } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) &&
                ref_arg->attrs.as<SimulatedQuantizeAttrs>()->kind == kQInput) {
       auto new_arg = Cast(ret[i], cfg->dtype_input);
@@ -392,7 +384,9 @@ Expr AddRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectR
   if (new_args[0].as<QRealizeIntExprNode>() && new_args[1].as<QRealizeIntExprNode>()) {
     DataType dtype;
     Expr dom_scale;
-    Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale);
+    // execute the operation with activation data type.
+    const QConfig& cfg = QConfig::Current();
+    Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation);
     Expr ret = ForwardOp(ref_call, ret_args);
     return QRealizeIntExpr(ret, dom_scale, dtype);
   }

diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py
@@ -82,7 +82,6 @@ def is_cast_op(op):
     output = outs[0]
     s = te.create_schedule([x.op for x in outs])
     te.schedule.AutoInlineInjective(s)
-    # s[output].fuse(s[output].op.axis)
 
     env = get_env()
     # other target does not support alu-only ops
@@ -190,8 +189,11 @@ def multiply_strategy_vta(attrs, inputs, out_type, target):
     return strategy
 
 
-reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
-reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")
+env = get_env()
+# other targets do not support alu-only ops
+if env.TARGET in ["sim", "tsim", "intelfocl"]:
+    reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
+    reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")
 
 
 @_strategy.conv2d_strategy.register("vta")

diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py
@@ -381,9 +381,10 @@ def _fold_buffer_dim(buf, scope, elem_block):
 
     def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold):
         elem_block = elem_bytes * 8 // elem_width
-        if buf.dtype != dtype:
-            raise RuntimeError("Expect buffer type to be %s instead of %s" %
-                               (dtype, buf.dtype))
+        # the dtype check is removed because we now have the load_int8 insn
+        # if buf.dtype != dtype:
+        #     raise RuntimeError("Expect buffer type to be %s instead of %s" %
+        #                        (dtype, buf.dtype))
         shape, strides = buf.shape, buf.strides
         if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0):
             raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block))
@@ -549,20 +550,13 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value):
 
             _check_compact(dst)
 
-            # FIXME(zhanghao): optimize
-            # for int8 -> int32 cast/load
-            orig_dtype = src.dtype
-            if src.dtype != data_type:
-                assert(data_type == "int%d" % env.ACC_WIDTH and \
-                       src.dtype == "int%d" % env.INP_WIDTH)
-                src.dtype = data_type
-
             x_size, y_size, x_stride, offset = _get_2d_pattern(
                 src, elem_width, elem_bytes, data_type,
                 dst.scope, allow_fold=allow_fold)
 
-            if orig_dtype != src.dtype:
-                src.dtype = orig_dtype
+            if data_type != src.dtype:
+                assert(data_type == "int%d" % env.ACC_WIDTH and \
+                       src.dtype == "int%d" % env.INP_WIDTH)
                 mem_type = env.dev.MEM_ID_ACC_8BIT
 
             irb = tvm.tir.ir_builder.create()

diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc
@@ -1078,6 +1078,7 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
     CHECK(fpga_buff_ != nullptr);
     CHECK(fpga_buff_phy_);
     uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
+
     CHECK(buff_size <= kMaxBytes);
     // Copy contents of DRAM buffer to FPGA buff
     VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size);
@@ -1322,7 +1323,6 @@ class CommandQueue {
     if (insn_queue_.count() == 0) return;
     // Synchronization for the queues
     uop_queue_.AutoReadBarrier();
-
     insn_queue_.AutoReadBarrier();
     // Dump instructions if debug enabled
     if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
@@ -1333,7 +1333,7 @@ class CommandQueue {
           VTA_OPCODE_FINISH);
 
     // Make sure that we don't exceed contiguous physical memory limits
-    CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
+    CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER);
     int timeout =
         VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles);
     CHECK_EQ(timeout, 0);
@@ -1481,9 +1481,8 @@ class CommandQueue {
 
   void CheckInsnOverFlow() {
     // At each API call, we can at most commit:
-    // one pending store, one pending load, and one uop
-    // FIXME(zhanghao): check why there are 5 insns
-    if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
+    // 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH (6 insns in total)
+    if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) {
       this->AutoSync();
     }
   }