From ce6bb5d59c9f9e0e2a8f1cd75677ed225b86c899 Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Wed, 22 Apr 2026 23:06:29 +0900 Subject: [PATCH 1/8] perf(core): run submit/complete handlers as SCHED_FIFO low Lowest RT band keeps the I/O handlers ahead of CFS noise so userspace submissions don't pay CFS wake latency under CPU pressure. Handlers still yield via cond_resched() and sleep in swait_event when idle, so softlockup/RCU stalls remain bounded. --- core_v1.c | 9 +++++++++ core_v2.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/core_v1.c b/core_v1.c index 37f1ebd..a8e4d91 100644 --- a/core_v1.c +++ b/core_v1.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: #include +#include #include "mx_dma.h" @@ -313,6 +314,13 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev) pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread)); return PTR_ERR(mx_pdev->submit_thread); } + /* + * SCHED_FIFO (lowest RT band) keeps the handler ahead of CFS noise so + * a userspace I/O submission doesn't pay CFS wake latency when the box + * is busy. Handlers still yield via cond_resched() and sleep in + * swait_event when idle, so softlockup/RCU stalls are not a concern. 
+ */ + sched_set_fifo_low(mx_pdev->submit_thread); mx_pdev->complete_thread = kthread_run(mx_complete_handler, &queue->common, "mx_complete_thd%d", mx_pdev->dev_id); if (IS_ERR(mx_pdev->complete_thread)) { @@ -320,6 +328,7 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev) kthread_stop(mx_pdev->submit_thread); return PTR_ERR(mx_pdev->complete_thread); } + sched_set_fifo_low(mx_pdev->complete_thread); mx_pdev->io_queue = (struct mx_queue *)queue; diff --git a/core_v2.c b/core_v2.c index a923ef1..30d5ab3 100644 --- a/core_v2.c +++ b/core_v2.c @@ -2,6 +2,7 @@ // SPDX-License-Identifier: #include +#include #include "mx_dma.h" @@ -455,12 +456,16 @@ static int configure_io_queue(struct mx_pci_dev *mx_pdev) pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread)); return PTR_ERR(mx_pdev->submit_thread); } + /* See core_v1.c: SCHED_FIFO (lowest RT band) for low scheduling latency. */ + sched_set_fifo_low(mx_pdev->submit_thread); + mx_pdev->complete_thread = kthread_run(mx_complete_handler, &io_queue->common, "mx_complete_thd%d", mx_pdev->dev_id); if (IS_ERR(mx_pdev->complete_thread)) { pr_err("Failed to create complete thread (err=%ld)\n", PTR_ERR(mx_pdev->complete_thread)); kthread_stop(mx_pdev->submit_thread); return PTR_ERR(mx_pdev->complete_thread); } + sched_set_fifo_low(mx_pdev->complete_thread); mx_pdev->io_queue = (struct mx_queue *)io_queue; From 0e1493fed624205ca1cf78607496fedcba6aa02a Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Wed, 22 Apr 2026 23:06:37 +0900 Subject: [PATCH 2/8] perf(fops): pre-wake io handlers on DMA/ioctl entry Wake sq_wait and cq_wait at the top of every data/context/ioctl path so the handler kthreads start running in parallel with page pinning, DMA mapping, and command construction. The wake is a cheap no-op when the handler is already running, and removes the cold-start component of wake latency when it wasn't. 
--- fops.c | 5 +++++ mx_dma.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/fops.c b/fops.c index 384d77e..cf3d896 100644 --- a/fops.c +++ b/fops.c @@ -92,6 +92,7 @@ static ssize_t mxdma_device_read_data(struct file *file, char __user *buf, size_ if (ret) return ret; + mx_prewake_handlers(mx_pdev); return read_data_from_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_READ); } @@ -115,6 +116,7 @@ static ssize_t mxdma_device_read_context(struct file *file, char __user *buf, si if (ret) return ret; + mx_prewake_handlers(mx_pdev); return read_data_from_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_READ); } @@ -133,6 +135,7 @@ static ssize_t mxdma_device_write_data(struct file *file, const char __user *buf if (ret) return ret; + mx_prewake_handlers(mx_pdev); return write_data_to_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_WRITE, false); } @@ -151,6 +154,7 @@ static ssize_t mxdma_device_write_context(struct file *file, const char __user * if (ret) return ret; + mx_prewake_handlers(mx_pdev); return write_data_to_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_WRITE, false); } @@ -164,6 +168,7 @@ static long mxdma_device_ioctl(struct file *file, unsigned int cmd, unsigned lon if (ret) return ret; + mx_prewake_handlers(mx_pdev); return ioctl_to_device(mx_pdev, cmd, arg); } diff --git a/mx_dma.h b/mx_dma.h index 0574c76..6d31df4 100644 --- a/mx_dma.h +++ b/mx_dma.h @@ -284,6 +284,22 @@ void mx_stop_queue_threads(struct mx_pci_dev *mx_pdev); int mx_submit_handler(void *arg); int mx_complete_handler(void *arg); +/* + * Wake both submit and complete handlers so they start running in parallel + * with userspace transfer setup (page pinning, DMA mapping, command build). + * Safe to call from any I/O entry point; cheap no-op if handlers are already + * running. + */ +static inline void mx_prewake_handlers(struct mx_pci_dev *mx_pdev) +{ + struct mx_queue *q = mx_pdev ? 
mx_pdev->io_queue : NULL; + + if (!q) + return; + swake_up_one(&q->sq_wait); + swake_up_one(&q->cq_wait); +} + void register_mx_ops_v1(struct mx_operations *ops); void register_mx_ops_v2(struct mx_operations *ops); From 4b4d77fa732242cbcb00e0ed99234f632fd13471 Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 11:00:18 +0900 Subject: [PATCH 3/8] perf(core): bind io handlers to device-local NUMA cpumask Restrict mx_submit_thd and mx_complete_thd to the device's NUMA node via set_cpus_allowed_ptr at queue init. Keeps handler cache traffic (descriptor ring, sq/cq_wait, transfer structs) node-local instead of letting the scheduler place them on any CPU in the system. Uses set_cpus_allowed_ptr rather than kthread_bind so operators can still taskset to colocate handlers with a specific userspace CPU for tighter tuning. No-op on devices without NUMA affinity. --- core_v1.c | 2 ++ core_v2.c | 2 ++ mx_dma.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/core_v1.c b/core_v1.c index a8e4d91..68b00eb 100644 --- a/core_v1.c +++ b/core_v1.c @@ -332,6 +332,8 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev) mx_pdev->io_queue = (struct mx_queue *)queue; + mx_bind_handlers_to_numa(mx_pdev); + return 0; } diff --git a/core_v2.c b/core_v2.c index 30d5ab3..3d5c953 100644 --- a/core_v2.c +++ b/core_v2.c @@ -469,6 +469,8 @@ static int configure_io_queue(struct mx_pci_dev *mx_pdev) mx_pdev->io_queue = (struct mx_queue *)io_queue; + mx_bind_handlers_to_numa(mx_pdev); + return 0; } diff --git a/mx_dma.h b/mx_dma.h index 6d31df4..a7bb091 100644 --- a/mx_dma.h +++ b/mx_dma.h @@ -11,8 +11,11 @@ #include #include #include +#include #include +#include #include +#include #include #include @@ -300,6 +303,36 @@ static inline void mx_prewake_handlers(struct mx_pci_dev *mx_pdev) swake_up_one(&q->cq_wait); } +/* + * Restrict io handler kthreads to the device-local NUMA node so their + * cache traffic (descriptor ring, sq/cq_wait, transfer 
structs) stays + * node-local. The mask is applied with set_cpus_allowed_ptr rather than + * kthread_bind: the scheduler enforces it, but operators can still use taskset + * to colocate handlers with a specific userspace CPU. No-op when the + * device has no NUMA affinity or the node cpumask is empty. + */ +static inline void mx_bind_handlers_to_numa(struct mx_pci_dev *mx_pdev) +{ + const struct cpumask *mask; + int node; + + if (!mx_pdev || !mx_pdev->pdev) + return; + + node = dev_to_node(&mx_pdev->pdev->dev); + if (node == NUMA_NO_NODE) + return; + + mask = cpumask_of_node(node); + if (cpumask_empty(mask)) + return; + + if (!IS_ERR_OR_NULL(mx_pdev->submit_thread)) + set_cpus_allowed_ptr(mx_pdev->submit_thread, mask); + if (!IS_ERR_OR_NULL(mx_pdev->complete_thread)) + set_cpus_allowed_ptr(mx_pdev->complete_thread, mask); +} + void register_mx_ops_v1(struct mx_operations *ops); void register_mx_ops_v2(struct mx_operations *ops); From 55d737e71c7b3489527c04c3881d1a46b7201b5a Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 11:02:03 +0900 Subject: [PATCH 4/8] perf(pci): hold cpu_latency PM QoS to block deep C-states Register a cpu_latency_qos request with a 50us wake-up budget at device probe and release it at device remove. This blocks deep idle states whose exit latency would stretch the freq ramp-up window we observed adding ~12us to cold DMA submissions (governor reaching boost freq after the CPU wakes from a deep idle). Held across the device's lifetime; shallow idle remains allowed so we don't force a polling-idle CPU. Freed on both success and out_fail paths via destroy_mx_pdev. 
--- init.c | 11 +++++++++++ mx_dma.h | 16 ++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/init.c b/init.c index 3aaaaec..0d663a4 100644 --- a/init.c +++ b/init.c @@ -233,6 +233,9 @@ static void destroy_mx_pdev(struct pci_dev *pdev) if (!mx_pdev) return; + if (cpu_latency_qos_request_active(&mx_pdev->cpu_latency_req)) + cpu_latency_qos_remove_request(&mx_pdev->cpu_latency_req); + mx_pdev->ops.release_queue(mx_pdev); if (!IS_ERR_OR_NULL(mx_pdev->zombie_cleanup_thread)) { @@ -269,6 +272,14 @@ static int create_mx_pdev(struct pci_dev *pdev, int cxl_memdev_id) mx_pdev->pdev = pdev; mx_pdev->dev_id = cxl_memdev_id; + /* + * Hold a cpu_latency PM QoS for the device's lifetime. Blocks deep + * C-states whose exit latency would stretch the freq ramp-up window + * that adds ~12 us to cold DMA submissions in our measurements. + * Removed in destroy_mx_pdev (including the out_fail path). + */ + cpu_latency_qos_add_request(&mx_pdev->cpu_latency_req, MX_CPU_LATENCY_QOS_US); + if (pdev->revision == 0x1) { register_mx_ops_v1(&mx_pdev->ops); pr_info("PCI device revision 1 detected\n"); diff --git a/mx_dma.h b/mx_dma.h index a7bb091..07fdf52 100644 --- a/mx_dma.h +++ b/mx_dma.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,15 @@ #define POLLING_INTERVAL_MSEC 4 #define ZOMBIE_POLL_INTERVAL_MSEC 1000 +/* + * Wake-up latency budget held via cpu_latency_qos for the lifetime of each + * mx device. Blocks deep C-states whose exit latency would stretch the + * freq ramp-up window we observed adding ~12 us to cold DMA submissions. + * Small enough to still allow shallow idle for power; large enough not to + * force a polling-idle CPU. + */ +#define MX_CPU_LATENCY_QOS_US 50 + enum { MX_CDEV_DATA = 0, MX_CDEV_CONTEXT, @@ -250,6 +260,12 @@ struct mx_pci_dev { struct list_head zombie_list; spinlock_t zombie_lock; struct task_struct *zombie_cleanup_thread; + + /* + * Held across the device's lifetime to block deep C-states. 
Shallow + * idle is still allowed so we don't force a polling-idle CPU. + */ + struct pm_qos_request cpu_latency_req; }; extern struct file_operations *mxdma_fops_array[]; From 36a9f31f0d8eb681b178120644eb913bcf89adb0 Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 16:19:11 +0900 Subject: [PATCH 5/8] perf(transfer): embed single-page storage in mx_transfer Add pages_inline[1], sg_inline[1], and a 64 B cmd_inline area to struct mx_transfer so the single-page hot path skips kcalloc(pages), sg_alloc_table_from_pages(), and kzalloc(mx_command). Free paths detect inline use by pointer identity and skip the corresponding kfree / sg_free_table. BUILD_BUG_ON guards cmd_inline against future growth of struct mx_command (v1=32 B, v2=64 B today). --- core_v1.c | 17 ++++-------- core_v2.c | 11 ++------ mx_dma.h | 27 ++++++++++++++++++ transfer.c | 80 ++++++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 95 insertions(+), 40 deletions(-) diff --git a/core_v1.c b/core_v1.c index 68b00eb..868912d 100644 --- a/core_v1.c +++ b/core_v1.c @@ -155,12 +155,10 @@ static const struct mx_queue_ops v1_queue_ops = { static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode) { - struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL); + struct mx_command *comm = (struct mx_command *)transfer->cmd_inline; - if (!comm) { - pr_warn("Failed to allocate mx_command\n"); - return NULL; - } + BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE); + memset(comm, 0, sizeof(*comm)); comm->magic = MAGIC_COMMAND; comm->id = transfer->id; @@ -192,7 +190,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer comm->host_addr = sg_dma_address(sg); if (!comm->host_addr) { pr_warn("Failed to get sg_dma_address\n"); - kfree(comm); return NULL; } } else { @@ -200,7 +197,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer comm->prp_entry1 = 
mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, false); if (!comm->prp_entry1) { pr_warn("Failed to get desc_list_init\n"); - kfree(comm); return NULL; } } @@ -236,12 +232,9 @@ static void *create_mx_command_ctrl(struct mx_transfer *transfer, int opcode) static void *create_mx_command_passthru(struct mx_transfer *transfer, int subopcode) { - struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL); + struct mx_command *comm = (struct mx_command *)transfer->cmd_inline; - if (!comm) { - pr_warn("Failed to allocate mx_command for passthru\n"); - return NULL; - } + memset(comm, 0, sizeof(*comm)); comm->magic = MAGIC_COMMAND; comm->opcode = IO_OPCODE_PASSTHRU; diff --git a/core_v2.c b/core_v2.c index 3d5c953..44bcecb 100644 --- a/core_v2.c +++ b/core_v2.c @@ -211,12 +211,10 @@ static const struct mx_queue_ops v2_queue_ops = { static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode) { - struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL); + struct mx_command *comm = (struct mx_command *)transfer->cmd_inline; - if (!comm) { - pr_warn("Failed to allocate mx_command\n"); - return NULL; - } + BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE); + memset(comm, 0, sizeof(*comm)); comm->opcode = opcode; comm->command_id = transfer->id; @@ -241,7 +239,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer comm->prp_entry1 = sg_dma_address(sg); if (!comm->prp_entry1) { pr_warn("Failed to get sg_dma_address\n"); - kfree(comm); return NULL; } @@ -254,14 +251,12 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer comm->prp_entry2 = sg_dma_address(sg_next(sg)); if (!comm->prp_entry2) { pr_warn("Failed to get sg_dma_address\n"); - kfree(comm); return NULL; } } else { comm->prp_entry2 = mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, true); if (!comm->prp_entry2) { pr_warn("Failed to 
desc_list_init\n"); - kfree(comm); return NULL; } } diff --git a/mx_dma.h b/mx_dma.h index 07fdf52..9b5825f 100644 --- a/mx_dma.h +++ b/mx_dma.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,22 @@ #define POLLING_INTERVAL_MSEC 4 #define ZOMBIE_POLL_INTERVAL_MSEC 1000 +/* + * Single-page fast path: embed one struct page * and one scatterlist inside + * mx_transfer so the 8 B / sub-page hot path skips kcalloc(pages) and + * sg_alloc_table_from_pages(). Multi-page transfers still fall back to the + * dynamic allocations in map_user_addr_to_sg(). + */ +#define MX_PAGES_INLINE_NR 1 + +/* + * Inline storage for the hardware command struct. Sized to the larger of + * the v1 / v2 struct mx_command definitions (v1=32 B, v2=64 B). Enforced + * by BUILD_BUG_ON in each core_v*.c; bumping either struct past this limit + * fails the build instead of silently overrunning. + */ +#define MX_CMD_INLINE_SIZE 64 + /* * Wake-up latency budget held via cpu_latency_qos for the lifetime of each * mx device. Blocks deep C-states whose exit latency would stretch the @@ -172,6 +189,16 @@ struct mx_transfer { int desc_list_cnt; void **desc_list_va; dma_addr_t *desc_list_ba; + + /* + * Inline fast-path storage. Active when pages_nr <= MX_PAGES_INLINE_NR. + * Free paths detect inline use by pointer identity + * (pages == pages_inline, sgt.sgl == sg_inline, command == cmd_inline) + * and skip the corresponding kfree / sg_free_table. 
+ */ + struct page *pages_inline[MX_PAGES_INLINE_NR]; + struct scatterlist sg_inline[MX_PAGES_INLINE_NR]; + uint8_t cmd_inline[MX_CMD_INLINE_SIZE] __aligned(8); }; struct mx_event { diff --git a/transfer.c b/transfer.c index 2453b43..852d463 100644 --- a/transfer.c +++ b/transfer.c @@ -32,12 +32,20 @@ static void unmap_user_addr_to_sg(struct device *dev, struct mx_transfer *transf if (transfer->pages_nr > 0) unpin_user_pages(transfer->pages, transfer->pages_nr); - sg_free_table(&transfer->sgt); + /* + * Inline SG (sg_inline[]) is embedded in mx_transfer — calling + * sg_free_table() on it would kfree memory inside the transfer. Only free the + * table when sg_alloc_table_from_pages() backed the sgl. + */ + if (sgt->sgl && sgt->sgl != transfer->sg_inline) + sg_free_table(sgt); + sgt->sgl = NULL; + sgt->nents = 0; + sgt->orig_nents = 0; - if (transfer->pages) { + if (transfer->pages && transfer->pages != transfer->pages_inline) kfree(transfer->pages); - transfer->pages = NULL; - } + transfer->pages = NULL; } static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer) @@ -56,10 +64,18 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer) if (!pages_nr) return 0; - transfer->pages = kcalloc(pages_nr, sizeof(struct page *), GFP_KERNEL); - if (!transfer->pages) { - pr_warn("Failed to alloc pages\n"); - return -ENOMEM; + /* + * Fast path: single-page transfers reuse the inline array embedded in + * mx_transfer. Only the >MX_PAGES_INLINE_NR case hits the allocator. 
+ */ + if (pages_nr <= MX_PAGES_INLINE_NR) { + transfer->pages = transfer->pages_inline; + } else { + transfer->pages = kcalloc(pages_nr, sizeof(struct page *), GFP_KERNEL); + if (!transfer->pages) { + pr_warn("Failed to alloc pages\n"); + return -ENOMEM; + } } /* Pin user_addr to pages */ @@ -69,7 +85,8 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer) pinned = pin_user_pages_fast((unsigned long)user_addr, pages_nr, gup_flags, transfer->pages); if (pinned < 0) { pr_warn("pin_user_pages_fast failed (err=%ld)\n", pinned); - kfree(transfer->pages); + if (transfer->pages != transfer->pages_inline) + kfree(transfer->pages); transfer->pages = NULL; return (int)pinned; } @@ -77,26 +94,47 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer) pr_warn("pin_user_pages_fast partial (req=%u, got=%ld)\n", pages_nr, pinned); if (pinned > 0) unpin_user_pages(transfer->pages, pinned); - kfree(transfer->pages); + if (transfer->pages != transfer->pages_inline) + kfree(transfer->pages); transfer->pages = NULL; return -EFAULT; } transfer->pages_nr = pages_nr; - /* Alloc sg_table as pages_nr */ - ret = sg_alloc_table_from_pages(sgt, transfer->pages, pages_nr, offset, size, GFP_KERNEL); - if (ret) { - pr_warn("sg_alloc_table_from_pages failed (err=%d)\n", ret); - unpin_user_pages(transfer->pages, transfer->pages_nr); - transfer->pages_nr = 0; - return ret; + if (pages_nr <= MX_PAGES_INLINE_NR) { + /* + * Hand-build a single-entry sg_table using the inline scatterlist. + * Skipping sg_alloc_table_from_pages() saves its internal kmalloc + * plus the dynamic sgl free path in unmap_user_addr_to_sg(). 
+ */ + sg_init_table(transfer->sg_inline, MX_PAGES_INLINE_NR); + sg_set_page(&transfer->sg_inline[0], transfer->pages[0], size, offset); + sgt->sgl = transfer->sg_inline; + sgt->orig_nents = pages_nr; + sgt->nents = pages_nr; + } else { + ret = sg_alloc_table_from_pages(sgt, transfer->pages, pages_nr, offset, size, GFP_KERNEL); + if (ret) { + pr_warn("sg_alloc_table_from_pages failed (err=%d)\n", ret); + unpin_user_pages(transfer->pages, transfer->pages_nr); + if (transfer->pages != transfer->pages_inline) + kfree(transfer->pages); + transfer->pages = NULL; + transfer->pages_nr = 0; + return ret; + } } /* Map the given buffer for DMA */ sgt->nents = dma_map_sg(dev, sgt->sgl, sgt->orig_nents, transfer->dir); if (!sgt->nents) { - sg_free_table(sgt); + if (sgt->sgl != transfer->sg_inline) + sg_free_table(sgt); + sgt->sgl = NULL; unpin_user_pages(transfer->pages, transfer->pages_nr); + if (transfer->pages != transfer->pages_inline) + kfree(transfer->pages); + transfer->pages = NULL; pr_warn("Failed to dma_map_sg\n"); return -EIO; } @@ -169,7 +207,8 @@ int desc_list_alloc(struct mx_pci_dev *mx_pdev, struct mx_transfer *transfer, in static void release_mx_transfer(struct mx_transfer *transfer) { transfer_id_free(transfer->id); - kfree(transfer->command); + if (transfer->command && transfer->command != (void *)transfer->cmd_inline) + kfree(transfer->command); kfree(transfer); } @@ -741,7 +780,8 @@ static void drain_zombie_list(struct mx_pci_dev *mx_pdev, struct list_head *list desc_list_free(mx_pdev, transfer); } - kfree(transfer->command); + if (transfer->command && transfer->command != (void *)transfer->cmd_inline) + kfree(transfer->command); kfree(transfer); } } From bb73f2e8c3ccb6ce6cd32a5fec7fee57be7665a9 Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 16:21:37 +0900 Subject: [PATCH 6/8] perf(transfer): allocate mx_transfer from dedicated kmem_cache Replace the generic kmalloc bucket allocation with a SLAB_HWCACHE_ALIGN kmem_cache sized exactly to 
struct mx_transfer. The per-cpu slab magazine keeps freshly freed transfers hot for the next allocation, cutting slab-partial contention on repeated small-I/O loops. Cache lifetime is tied to module load: create after class_create() in mxdma_init(), destroy after the PCI / bus teardown that drains all in-flight transfers in mxdma_exit(). --- init.c | 31 ++++++++++++++++++++++++++++++- mx_dma.h | 8 ++++++++ transfer.c | 8 ++++---- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/init.c b/init.c index 0d663a4..979bcbd 100644 --- a/init.c +++ b/init.c @@ -6,6 +6,7 @@ /* Initialization */ /******************************************************************************/ static struct class *mxdma_class; +struct kmem_cache *mx_transfer_cache; #ifndef CONFIG_WO_CXL static LIST_HEAD(mx_device_list_head); @@ -540,10 +541,28 @@ static int mxdma_init(void) mxdma_class->devnode = mxdma_devnode; + mx_transfer_cache = kmem_cache_create("mx_transfer", + sizeof(struct mx_transfer), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!mx_transfer_cache) { + pr_err("Failed to create mx_transfer kmem_cache\n"); + class_destroy(mxdma_class); + return -ENOMEM; + } + pr_info("MXDMA driver is loaded\n"); #ifdef CONFIG_WO_CXL - return pci_register_driver(&pci_driver); + { + int ret = pci_register_driver(&pci_driver); + + if (ret) { + kmem_cache_destroy(mx_transfer_cache); + mx_transfer_cache = NULL; + class_destroy(mxdma_class); + } + return ret; + } #else bus_register_notifier(&pci_bus_type, &mxdma_pci_notifier); return 0; @@ -579,6 +598,16 @@ static void mxdma_exit(void) destroy_device_list(); #endif + /* + * PCI unregister / device-list teardown above completes all in-flight + * transfers (including zombie drain in remove()), so every mx_transfer + * has been returned to the slab before we destroy the cache. 
+ */ + if (mx_transfer_cache) { + kmem_cache_destroy(mx_transfer_cache); + mx_transfer_cache = NULL; + } + if (mxdma_class) class_destroy(mxdma_class); diff --git a/mx_dma.h b/mx_dma.h index 9b5825f..f433fe3 100644 --- a/mx_dma.h +++ b/mx_dma.h @@ -297,6 +297,14 @@ struct mx_pci_dev { extern struct file_operations *mxdma_fops_array[]; +/* + * Dedicated slab cache for struct mx_transfer. Sized exactly to the + * transfer and tagged SLAB_HWCACHE_ALIGN so per-op alloc/free hits a + * hot per-cpu magazine instead of the generic kmalloc-256/512 buckets. + * Created in mxdma_init(), destroyed in mxdma_exit(). + */ +extern struct kmem_cache *mx_transfer_cache; + int transfer_id_alloc(void *ptr); void transfer_id_free(unsigned long id); void *find_transfer_by_id(unsigned long id); diff --git a/transfer.c b/transfer.c index 852d463..f774e24 100644 --- a/transfer.c +++ b/transfer.c @@ -209,7 +209,7 @@ static void release_mx_transfer(struct mx_transfer *transfer) transfer_id_free(transfer->id); if (transfer->command && transfer->command != (void *)transfer->cmd_inline) kfree(transfer->command); - kfree(transfer); + kmem_cache_free(mx_transfer_cache, transfer); } static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size, uint64_t device_addr, @@ -217,7 +217,7 @@ static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size { struct mx_transfer *transfer; - transfer = kzalloc(sizeof(struct mx_transfer), GFP_KERNEL); + transfer = kmem_cache_zalloc(mx_transfer_cache, GFP_KERNEL); if (!transfer) { return NULL; } @@ -228,7 +228,7 @@ static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size transfer->id = transfer_id_alloc(transfer); if (transfer->id < 0) { pr_warn("Failed to alloc transfer_id\n"); - kfree(transfer); + kmem_cache_free(mx_transfer_cache, transfer); return NULL; } @@ -782,7 +782,7 @@ static void drain_zombie_list(struct mx_pci_dev *mx_pdev, struct list_head *list if (transfer->command && 
transfer->command != (void *)transfer->cmd_inline) kfree(transfer->command); - kfree(transfer); + kmem_cache_free(mx_transfer_cache, transfer); } } From ceb93e19295f19062709784f19bd2e8cfd56db81 Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 17:28:09 +0900 Subject: [PATCH 7/8] perf(core_v1): pop_mx_command reads only fields used by completion path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 profile showed memcpy_fromio(sizeof(struct mx_command)) at ~6.5 % of total cycles — four MMIO readq per pop — while the completion path only consumes the header (id / control) and host_addr (result). size and device_addr are producer-side fields that the host never reads on completion. Drop the full 32 B memcpy_fromio for two explicit readq covering just the required words, saving ~500–1000 ns per op. Zero the untouched words so dev_dbg doesn't print stack garbage for them. --- core_v1.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/core_v1.c b/core_v1.c index 868912d..d9efbea 100644 --- a/core_v1.c +++ b/core_v1.c @@ -92,7 +92,19 @@ static void pop_mx_command(struct mx_queue_v1 *queue, struct mx_command *comm) void __iomem *data_addr; data_addr = (void *)mbox->data_addr + get_data_offset(ctx->head); - memcpy_fromio(comm, data_addr, sizeof(struct mx_command)); + + /* + * The completion path consumes only the header (id / control) and + * host_addr (result). size and device_addr are producer-side fields + * unused on the completion side, so skip the extra 2 readq per pop + * (v1 profile shows pop_mx_command memcpy_fromio at ~6.5 % of total). + * Zero the untouched words so any caller that stringifies them (e.g. + * dev_dbg below) prints 0 instead of stack garbage. 
+ */ + comm->header = readq(data_addr); + comm->size = 0; + comm->device_addr = 0; + comm->host_addr = readq(data_addr + offsetof(struct mx_command, host_addr)); dev_dbg(queue->common.dev, "CQ- head=0x%02x id=0x%04x op=%u ha=0x%llx da=0x%llx len=%llu\n", ctx->head, comm->id, comm->opcode, comm->host_addr, comm->device_addr, comm->size); From 98876d5355b0a13d217f13808b06bc20fed712dd Mon Sep 17 00:00:00 2001 From: DAEYEONG LEE Date: Thu, 23 Apr 2026 17:28:31 +0900 Subject: [PATCH 8/8] perf(core_v1): skip MMIO refresh in is_pushable when cache has headroom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit is_pushable() readq of the SQ mbox context showed up at ~2.8 % in the v1 profile because the submit_handler re-checks it on every command. Tail moves only from our own push_mx_command and head only grows as HW consumes, so the locally tracked free_space is a conservative lower bound — if we already see room for at least two commands there is no need to read HW for just this one. Keep the readq for the genuinely-full case so the HW refresh still drives forward progress when the queue fills up. 
--- core_v1.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/core_v1.c b/core_v1.c index d9efbea..1b0da59 100644 --- a/core_v1.c +++ b/core_v1.c @@ -44,14 +44,22 @@ struct mx_command { /******************************************************************************/ static bool is_pushable(struct mx_queue_v1 *queue) { - static uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t); + static const uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t); struct mx_mbox *mbox = &queue->sq_mbox; - uint32_t free_space; - mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr); - free_space = get_free_space(mbox); + /* + * Fast path: tail is advanced only by our own push_mx_command and head + * can only grow as HW consumes, so the locally tracked free_space is a + * conservative lower bound on the true value. If the cache still has + * headroom for another full command even after this one, skip the MMIO + * readq entirely — v1 profile shows is_pushable() readq at ~2.8 % of + * total cycles in tight submit loops. + */ + if (get_free_space(mbox) >= data_count * 2) + return true; - return free_space >= data_count; + mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr); + return get_free_space(mbox) >= data_count; } static bool is_popable(struct mx_queue_v1 *queue)