diff --git a/ioctl.c b/ioctl.c index 6a7d6b9..d1fd36a 100644 --- a/ioctl.c +++ b/ioctl.c @@ -218,13 +218,31 @@ static long ioctl_send_cmds(struct mx_pci_dev *mx_pdev, unsigned long arg) sq_mbox = mx_pdev->sq_mbox_list[send_cmd.qid]; mutex_lock(&sq_mbox->lock); - if (read_ctrl_from_device(mx_pdev, (char __user *)&ctx.u64, sizeof(uint64_t), (loff_t *)&sq_mbox->r_ctx_addr, IO_OPCODE_SQ_READ) <= 0) { - mutex_unlock(&sq_mbox->lock); - return -EINTR; - } - sq_mbox->ctx.head = ctx.head; + /* + * Cached-head fast path. The cached head only ever lags the real + * device head (device monotonically advances head as it consumes), + * so cached_pushable <= real_pushable. If the cached pushable count + * already covers the requested batch we can skip the synchronous + * PCIe read of the device-side head register entirely. + * + * Skipping the read shaves the per-call cost from ~30us down to + * the same order as ioctl_send_cmd_with_data (~12us), since that + * sibling already uses a cached + busy-poll-on-full pattern. Loss + * case: when real head moved further than cached, we may push less + * than physically possible -- caller resubmits the remainder, no + * correctness impact. + */ count = get_pushable_count(sq_mbox); + if (count < send_cmd.nr_cmds) { + if (read_ctrl_from_device(mx_pdev, (char __user *)&ctx.u64, sizeof(uint64_t), (loff_t *)&sq_mbox->r_ctx_addr, IO_OPCODE_SQ_READ) <= 0) { + mutex_unlock(&sq_mbox->lock); + return -EINTR; + } + sq_mbox->ctx.head = ctx.head; + count = get_pushable_count(sq_mbox); + } + if (count == 0) goto out;