diff --git a/ioctl.c b/ioctl.c
index 6a7d6b9..d1fd36a 100644
--- a/ioctl.c
+++ b/ioctl.c
@@ -218,13 +218,31 @@ static long ioctl_send_cmds(struct mx_pci_dev *mx_pdev, unsigned long arg)
 	sq_mbox = mx_pdev->sq_mbox_list[send_cmd.qid];
 
 	mutex_lock(&sq_mbox->lock);
-	if (read_ctrl_from_device(mx_pdev, (char __user *)&ctx.u64, sizeof(uint64_t), (loff_t *)&sq_mbox->r_ctx_addr, IO_OPCODE_SQ_READ) <= 0) {
-		mutex_unlock(&sq_mbox->lock);
-		return -EINTR;
-	}
-	sq_mbox->ctx.head = ctx.head;
 
+	/*
+	 * Cached-head fast path. The cached head only ever lags the real
+	 * device head (device monotonically advances head as it consumes),
+	 * so cached_pushable <= real_pushable. If the cached pushable count
+	 * already covers the requested batch we can skip the synchronous
+	 * PCIe read of the device-side head register entirely.
+	 *
+	 * Skipping the read shaves the per-call cost from ~30us down to
+	 * the same order as ioctl_send_cmd_with_data (~12us), since that
+	 * sibling already uses a cached + busy-poll-on-full pattern. Loss
+	 * case: when real head moved further than cached, we may push less
+	 * than physically possible -- caller resubmits the remainder, no
+	 * correctness impact.
+	 */
 	count = get_pushable_count(sq_mbox);
+	if (count < send_cmd.nr_cmds) {
+		if (read_ctrl_from_device(mx_pdev, (char __user *)&ctx.u64, sizeof(uint64_t), (loff_t *)&sq_mbox->r_ctx_addr, IO_OPCODE_SQ_READ) <= 0) {
+			mutex_unlock(&sq_mbox->lock);
+			return -EINTR;
+		}
+		sq_mbox->ctx.head = ctx.head;
+		count = get_pushable_count(sq_mbox);
+	}
+
 	if (count == 0)
 		goto out;