From ce6bb5d59c9f9e0e2a8f1cd75677ed225b86c899 Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Wed, 22 Apr 2026 23:06:29 +0900
Subject: [PATCH 1/8] perf(core): run submit/complete handlers as SCHED_FIFO
 low

Run both handlers at the lowest SCHED_FIFO priority (sched_set_fifo_low())
so they are scheduled ahead of CFS tasks and userspace submissions don't
pay CFS wake-up latency under CPU pressure.  Handlers still call
cond_resched() and sleep in swait_event when idle, so softlockup/RCU
stalls remain bounded.
---
 core_v1.c | 9 +++++++++
 core_v2.c | 5 +++++
 2 files changed, 14 insertions(+)
diff --git a/core_v1.c b/core_v1.c
index 37f1ebd..a8e4d91 100644
--- a/core_v1.c
+++ b/core_v1.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: <SPDX License Expression>
 
 #include <linux/atomic.h>
+#include <linux/sched.h>
 
 #include "mx_dma.h"
 
@@ -313,6 +314,13 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev)
 		pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread));
 		return PTR_ERR(mx_pdev->submit_thread);
 	}
+	/*
+	 * SCHED_FIFO (lowest RT band) keeps the handler ahead of CFS noise so
+	 * a userspace I/O submission doesn't pay CFS wake latency when the box
+	 * is busy.  Handlers still yield via cond_resched() and sleep in
+	 * swait_event when idle, so softlockup/RCU stalls are not a concern.
+	 */
+	sched_set_fifo_low(mx_pdev->submit_thread);
 
 	mx_pdev->complete_thread = kthread_run(mx_complete_handler, &queue->common, "mx_complete_thd%d", mx_pdev->dev_id);
 	if (IS_ERR(mx_pdev->complete_thread)) {
@@ -320,6 +328,7 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev)
 		kthread_stop(mx_pdev->submit_thread);
 		return PTR_ERR(mx_pdev->complete_thread);
 	}
+	sched_set_fifo_low(mx_pdev->complete_thread);
 
 	mx_pdev->io_queue = (struct mx_queue *)queue;
 
diff --git a/core_v2.c b/core_v2.c
index a923ef1..30d5ab3 100644
--- a/core_v2.c
+++ b/core_v2.c
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: <SPDX License Expression>
 
 #include <linux/nvme.h>
+#include <linux/sched.h>
 
 #include "mx_dma.h"
 
@@ -455,12 +456,16 @@ static int configure_io_queue(struct mx_pci_dev *mx_pdev)
 		pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread));
 		return PTR_ERR(mx_pdev->submit_thread);
 	}
+	/* See core_v1.c: SCHED_FIFO (lowest RT band) for low scheduling latency. */
+	sched_set_fifo_low(mx_pdev->submit_thread);
+
 	mx_pdev->complete_thread = kthread_run(mx_complete_handler, &io_queue->common, "mx_complete_thd%d", mx_pdev->dev_id);
 	if (IS_ERR(mx_pdev->complete_thread)) {
 		pr_err("Failed to create complete thread (err=%ld)\n", PTR_ERR(mx_pdev->complete_thread));
 		kthread_stop(mx_pdev->submit_thread);
 		return PTR_ERR(mx_pdev->complete_thread);
 	}
+	sched_set_fifo_low(mx_pdev->complete_thread);
 
 	mx_pdev->io_queue = (struct mx_queue *)io_queue;
 

From 0e1493fed624205ca1cf78607496fedcba6aa02a Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Wed, 22 Apr 2026 23:06:37 +0900
Subject: [PATCH 2/8] perf(fops): pre-wake io handlers on DMA/ioctl entry

Wake sq_wait and cq_wait at the top of every data/context/ioctl path
so the handler kthreads start running in parallel with page pinning,
DMA mapping, and command construction.  The wake is a cheap no-op
when a handler is already running; when it is asleep, waking it early
removes the cold-start component of wake latency.
---
 fops.c   |  5 +++++
 mx_dma.h | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/fops.c b/fops.c
index 384d77e..cf3d896 100644
--- a/fops.c
+++ b/fops.c
@@ -92,6 +92,7 @@ static ssize_t mxdma_device_read_data(struct file *file, char __user *buf, size_
 	if (ret)
 		return ret;
 
+	mx_prewake_handlers(mx_pdev);
 	return read_data_from_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_READ);
 }
 
@@ -115,6 +116,7 @@ static ssize_t mxdma_device_read_context(struct file *file, char __user *buf, si
 	if (ret)
 		return ret;
 
+	mx_prewake_handlers(mx_pdev);
 	return read_data_from_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_READ);
 }
 
@@ -133,6 +135,7 @@ static ssize_t mxdma_device_write_data(struct file *file, const char __user *buf
 	if (ret)
 		return ret;
 
+	mx_prewake_handlers(mx_pdev);
 	return write_data_to_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_WRITE, false);
 }
 
@@ -151,6 +154,7 @@ static ssize_t mxdma_device_write_context(struct file *file, const char __user *
 	if (ret)
 		return ret;
 
+	mx_prewake_handlers(mx_pdev);
 	return write_data_to_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_WRITE, false);
 }
 
@@ -164,6 +168,7 @@ static long mxdma_device_ioctl(struct file *file, unsigned int cmd, unsigned lon
 	if (ret)
 		return ret;
 
+	mx_prewake_handlers(mx_pdev);
 	return ioctl_to_device(mx_pdev, cmd, arg);
 }
 
diff --git a/mx_dma.h b/mx_dma.h
index 0574c76..6d31df4 100644
--- a/mx_dma.h
+++ b/mx_dma.h
@@ -284,6 +284,22 @@ void mx_stop_queue_threads(struct mx_pci_dev *mx_pdev);
 int mx_submit_handler(void *arg);
 int mx_complete_handler(void *arg);
 
+/*
+ * Wake one waiter each on sq_wait and cq_wait so the submit and complete
+ * handlers start running in parallel with transfer setup (page pinning, DMA
+ * mapping, command build).  Does nothing if mx_pdev or its io_queue is NULL;
+ * intended to be a cheap no-op when the handlers are already running.
+ */
+static inline void mx_prewake_handlers(struct mx_pci_dev *mx_pdev)
+{
+	struct mx_queue *q = mx_pdev ? mx_pdev->io_queue : NULL;
+
+	if (!q)
+		return;
+	swake_up_one(&q->sq_wait);
+	swake_up_one(&q->cq_wait);
+}
+
 void register_mx_ops_v1(struct mx_operations *ops);
 void register_mx_ops_v2(struct mx_operations *ops);
 

From 4b4d77fa732242cbcb00e0ed99234f632fd13471 Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 11:00:18 +0900
Subject: [PATCH 3/8] perf(core): bind io handlers to device-local NUMA cpumask

Restrict mx_submit_thd and mx_complete_thd to the device's NUMA node
via set_cpus_allowed_ptr at queue init.  Keeps handler cache traffic
(descriptor ring, sq/cq_wait, transfer structs) node-local instead
of letting the scheduler place them on any CPU in the system.

Uses set_cpus_allowed_ptr rather than kthread_bind so operators can
still taskset to colocate handlers with a specific userspace CPU for
tighter tuning.  No-op on devices without NUMA affinity.
---
 core_v1.c |  2 ++
 core_v2.c |  2 ++
 mx_dma.h  | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/core_v1.c b/core_v1.c
index a8e4d91..68b00eb 100644
--- a/core_v1.c
+++ b/core_v1.c
@@ -332,6 +332,8 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev)
 
 	mx_pdev->io_queue = (struct mx_queue *)queue;
 
+	mx_bind_handlers_to_numa(mx_pdev);
+
 	return 0;
 }
 
diff --git a/core_v2.c b/core_v2.c
index 30d5ab3..3d5c953 100644
--- a/core_v2.c
+++ b/core_v2.c
@@ -469,6 +469,8 @@ static int configure_io_queue(struct mx_pci_dev *mx_pdev)
 
 	mx_pdev->io_queue = (struct mx_queue *)io_queue;
 
+	mx_bind_handlers_to_numa(mx_pdev);
+
 	return 0;
 }
 
diff --git a/mx_dma.h b/mx_dma.h
index 6d31df4..a7bb091 100644
--- a/mx_dma.h
+++ b/mx_dma.h
@@ -11,8 +11,11 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/kthread.h>
+#include <linux/numa.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/swait.h>
+#include <linux/topology.h>
 
 #include <asm/current.h>
 #include <asm/cacheflush.h>
@@ -300,6 +303,36 @@ static inline void mx_prewake_handlers(struct mx_pci_dev *mx_pdev)
 	swake_up_one(&q->cq_wait);
 }
 
+/*
+ * Restrict the io handler kthreads to the device-local NUMA node so their
+ * cache traffic (descriptor ring, sq/cq_wait, transfer structs) stays
+ * node-local.  set_cpus_allowed_ptr sets a real affinity mask but, unlike
+ * kthread_bind, leaves it changeable: operators can still use taskset to
+ * colocate handlers with a specific userspace CPU.  No-op when mx_pdev/pdev
+ * is NULL, the device has no NUMA node, or the node cpumask is empty.
+ */
+static inline void mx_bind_handlers_to_numa(struct mx_pci_dev *mx_pdev)
+{
+	const struct cpumask *mask;
+	int node;
+
+	if (!mx_pdev || !mx_pdev->pdev)
+		return;
+
+	node = dev_to_node(&mx_pdev->pdev->dev);
+	if (node == NUMA_NO_NODE)
+		return;
+
+	mask = cpumask_of_node(node);
+	if (cpumask_empty(mask))
+		return;
+
+	if (!IS_ERR_OR_NULL(mx_pdev->submit_thread))
+		set_cpus_allowed_ptr(mx_pdev->submit_thread, mask);
+	if (!IS_ERR_OR_NULL(mx_pdev->complete_thread))
+		set_cpus_allowed_ptr(mx_pdev->complete_thread, mask);
+}
+
 void register_mx_ops_v1(struct mx_operations *ops);
 void register_mx_ops_v2(struct mx_operations *ops);
 

From 55d737e71c7b3489527c04c3881d1a46b7201b5a Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 11:02:03 +0900
Subject: [PATCH 4/8] perf(pci): hold cpu_latency PM QoS to block deep C-states

Register a cpu_latency_qos request with a 50us wake-up budget at
device probe and release it at device remove.  This blocks deep
idle states whose exit latency would stretch the freq ramp-up
window we observed adding ~12us to cold DMA submissions (governor
reaching boost freq after the CPU wakes from a deep idle).

Held across the device's lifetime; shallow idle remains allowed
so we don't force a polling-idle CPU.  The request is removed in
destroy_mx_pdev, which covers both device remove and the out_fail
error path.
---
 init.c   | 11 +++++++++++
 mx_dma.h | 16 ++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/init.c b/init.c
index 3aaaaec..0d663a4 100644
--- a/init.c
+++ b/init.c
@@ -233,6 +233,9 @@ static void destroy_mx_pdev(struct pci_dev *pdev)
 	if (!mx_pdev)
 		return;
 
+	if (cpu_latency_qos_request_active(&mx_pdev->cpu_latency_req))
+		cpu_latency_qos_remove_request(&mx_pdev->cpu_latency_req);
+
 	mx_pdev->ops.release_queue(mx_pdev);
 
 	if (!IS_ERR_OR_NULL(mx_pdev->zombie_cleanup_thread)) {
@@ -269,6 +272,14 @@ static int create_mx_pdev(struct pci_dev *pdev, int cxl_memdev_id)
 	mx_pdev->pdev = pdev;
 	mx_pdev->dev_id = cxl_memdev_id;
 
+	/*
+	 * Hold a cpu_latency PM QoS for the device's lifetime.  Blocks deep
+	 * C-states whose exit latency would stretch the freq ramp-up window
+	 * that adds ~12 us to cold DMA submissions in our measurements.
+	 * Removed in destroy_mx_pdev (including the out_fail path).
+	 */
+	cpu_latency_qos_add_request(&mx_pdev->cpu_latency_req, MX_CPU_LATENCY_QOS_US);
+
 	if (pdev->revision == 0x1) {
 		register_mx_ops_v1(&mx_pdev->ops);
 		pr_info("PCI device revision 1 detected\n");
diff --git a/mx_dma.h b/mx_dma.h
index a7bb091..07fdf52 100644
--- a/mx_dma.h
+++ b/mx_dma.h
@@ -12,6 +12,7 @@
 #include <linux/aer.h>
 #include <linux/kthread.h>
 #include <linux/numa.h>
+#include <linux/pm_qos.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/swait.h>
@@ -40,6 +41,15 @@
 #define POLLING_INTERVAL_MSEC	4
 #define ZOMBIE_POLL_INTERVAL_MSEC	1000
 
+/*
+ * Wake-up latency budget held via cpu_latency_qos for the lifetime of each
+ * mx device.  Blocks deep C-states whose exit latency would stretch the
+ * freq ramp-up window we observed adding ~12 us to cold DMA submissions.
+ * Small enough to still allow shallow idle for power; large enough not to
+ * force a polling-idle CPU.
+ */
+#define MX_CPU_LATENCY_QOS_US	50
+
 enum {
 	MX_CDEV_DATA = 0,
 	MX_CDEV_CONTEXT,
@@ -250,6 +260,12 @@ struct mx_pci_dev {
 	struct list_head zombie_list;
 	spinlock_t zombie_lock;
 	struct task_struct *zombie_cleanup_thread;
+
+	/*
+	 * Held across the device's lifetime to block deep C-states.  Shallow
+	 * idle is still allowed so we don't force a polling-idle CPU.
+	 */
+	struct pm_qos_request cpu_latency_req;
 };
 
 extern struct file_operations *mxdma_fops_array[];

From 36a9f31f0d8eb681b178120644eb913bcf89adb0 Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 16:19:11 +0900
Subject: [PATCH 5/8] perf(transfer): embed single-page storage in mx_transfer

Add pages_inline[1], sg_inline[1], and a 64 B cmd_inline area to
struct mx_transfer so the single-page hot path skips kcalloc(pages),
sg_alloc_table_from_pages(), and kzalloc(mx_command).

Free paths detect inline use by pointer identity and skip the
corresponding kfree / sg_free_table. BUILD_BUG_ON guards cmd_inline
against future growth of struct mx_command (v1=32 B, v2=64 B today).
---
 core_v1.c  | 17 ++++--------
 core_v2.c  | 11 ++------
 mx_dma.h   | 27 ++++++++++++++++++
 transfer.c | 80 ++++++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 95 insertions(+), 40 deletions(-)

diff --git a/core_v1.c b/core_v1.c
index 68b00eb..868912d 100644
--- a/core_v1.c
+++ b/core_v1.c
@@ -155,12 +155,10 @@ static const struct mx_queue_ops v1_queue_ops = {
 
 static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode)
 {
-	struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
+	struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;
 
-	if (!comm) {
-		pr_warn("Failed to allocate mx_command\n");
-		return NULL;
-	}
+	BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE);
+	memset(comm, 0, sizeof(*comm));
 
 	comm->magic = MAGIC_COMMAND;
 	comm->id = transfer->id;
@@ -192,7 +190,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
 		comm->host_addr = sg_dma_address(sg);
 		if (!comm->host_addr) {
 			pr_warn("Failed to get sg_dma_address\n");
-			kfree(comm);
 			return NULL;
 		}
 	} else {
@@ -200,7 +197,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
 		comm->prp_entry1 = mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, false);
 		if (!comm->prp_entry1) {
 			pr_warn("Failed to get desc_list_init\n");
-			kfree(comm);
 			return NULL;
 		}
 	}
@@ -236,12 +232,9 @@ static void *create_mx_command_ctrl(struct mx_transfer *transfer, int opcode)
 
 static void *create_mx_command_passthru(struct mx_transfer *transfer, int subopcode)
 {
-	struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
+	struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;
 
-	if (!comm) {
-		pr_warn("Failed to allocate mx_command for passthru\n");
-		return NULL;
-	}
+	memset(comm, 0, sizeof(*comm));
 
 	comm->magic = MAGIC_COMMAND;
 	comm->opcode = IO_OPCODE_PASSTHRU;
diff --git a/core_v2.c b/core_v2.c
index 3d5c953..44bcecb 100644
--- a/core_v2.c
+++ b/core_v2.c
@@ -211,12 +211,10 @@ static const struct mx_queue_ops v2_queue_ops = {
 
 static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode)
 {
-	struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
+	struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;
 
-	if (!comm) {
-		pr_warn("Failed to allocate mx_command\n");
-		return NULL;
-	}
+	BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE);
+	memset(comm, 0, sizeof(*comm));
 
 	comm->opcode = opcode;
 	comm->command_id = transfer->id;
@@ -241,7 +239,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
 	comm->prp_entry1 = sg_dma_address(sg);
 	if (!comm->prp_entry1) {
 		pr_warn("Failed to get sg_dma_address\n");
-		kfree(comm);
 		return NULL;
 	}
 
@@ -254,14 +251,12 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
 			comm->prp_entry2 = sg_dma_address(sg_next(sg));
 		if (!comm->prp_entry2) {
 			pr_warn("Failed to get sg_dma_address\n");
-			kfree(comm);
 			return NULL;
 		}
 	} else {
 		comm->prp_entry2 = mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, true);
 		if (!comm->prp_entry2) {
 			pr_warn("Failed to desc_list_init\n");
-			kfree(comm);
 			return NULL;
 		}
 	}
diff --git a/mx_dma.h b/mx_dma.h
index 07fdf52..9b5825f 100644
--- a/mx_dma.h
+++ b/mx_dma.h
@@ -15,6 +15,7 @@
 #include <linux/pm_qos.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
+#include <linux/scatterlist.h>
 #include <linux/swait.h>
 #include <linux/topology.h>
 
@@ -41,6 +42,22 @@
 #define POLLING_INTERVAL_MSEC	4
 #define ZOMBIE_POLL_INTERVAL_MSEC	1000
 
+/*
+ * Single-page fast path: embed one struct page * and one scatterlist inside
+ * mx_transfer so the 8 B / sub-page hot path skips kcalloc(pages) and
+ * sg_alloc_table_from_pages().  Multi-page transfers still fall back to the
+ * dynamic allocations in map_user_addr_to_sg().
+ */
+#define MX_PAGES_INLINE_NR	1
+
+/*
+ * Inline storage for the hardware command struct.  Sized to the larger of
+ * the v1 / v2 struct mx_command definitions (v1=32 B, v2=64 B).  Enforced
+ * by BUILD_BUG_ON in each core_v*.c; bumping either struct past this limit
+ * fails the build instead of silently overrunning.
+ */
+#define MX_CMD_INLINE_SIZE	64
+
 /*
  * Wake-up latency budget held via cpu_latency_qos for the lifetime of each
  * mx device.  Blocks deep C-states whose exit latency would stretch the
@@ -172,6 +189,16 @@ struct mx_transfer {
 	int desc_list_cnt;
 	void **desc_list_va;
 	dma_addr_t *desc_list_ba;
+
+	/*
+	 * Inline storage.  pages_inline / sg_inline are used when pages_nr <=
+	 * MX_PAGES_INLINE_NR; alloc_mx_command() always uses cmd_inline.  Free
+	 * paths detect inline use by pointer identity (pages == pages_inline,
+	 * sgt.sgl == sg_inline, command == cmd_inline) and skip kfree / sg_free_table.
+	 */
+	struct page		*pages_inline[MX_PAGES_INLINE_NR];
+	struct scatterlist	 sg_inline[MX_PAGES_INLINE_NR];
+	uint8_t			 cmd_inline[MX_CMD_INLINE_SIZE] __aligned(8);
 };
 
 struct mx_event {
diff --git a/transfer.c b/transfer.c
index 2453b43..852d463 100644
--- a/transfer.c
+++ b/transfer.c
@@ -32,12 +32,20 @@ static void unmap_user_addr_to_sg(struct device *dev, struct mx_transfer *transf
 	if (transfer->pages_nr > 0)
 		unpin_user_pages(transfer->pages, transfer->pages_nr);
 
-	sg_free_table(&transfer->sgt);
+	/*
+	 * Inline SG (sg_inline[]) is embedded in mx_transfer, so calling
+	 * sg_free_table() on it would kfree memory that was never separately
+	 * allocated.  Only free the table when sg_alloc_table_from_pages() backed it.
+	 */
+	if (sgt->sgl && sgt->sgl != transfer->sg_inline)
+		sg_free_table(sgt);
+	sgt->sgl = NULL;
+	sgt->nents = 0;
+	sgt->orig_nents = 0;
 
-	if (transfer->pages) {
+	if (transfer->pages && transfer->pages != transfer->pages_inline)
 		kfree(transfer->pages);
-		transfer->pages = NULL;
-	}
+	transfer->pages = NULL;
 }
 
 static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer)
@@ -56,10 +64,18 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer)
 	if (!pages_nr)
 		return 0;
 
-	transfer->pages = kcalloc(pages_nr, sizeof(struct page *), GFP_KERNEL);
-	if (!transfer->pages) {
-		pr_warn("Failed to alloc pages\n");
-		return -ENOMEM;
+	/*
+	 * Fast path: single-page transfers reuse the inline array embedded in
+	 * mx_transfer.  Only the >MX_PAGES_INLINE_NR case hits the allocator.
+	 */
+	if (pages_nr <= MX_PAGES_INLINE_NR) {
+		transfer->pages = transfer->pages_inline;
+	} else {
+		transfer->pages = kcalloc(pages_nr, sizeof(struct page *), GFP_KERNEL);
+		if (!transfer->pages) {
+			pr_warn("Failed to alloc pages\n");
+			return -ENOMEM;
+		}
 	}
 
 	/* Pin user_addr to pages */
@@ -69,7 +85,8 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer)
 	pinned = pin_user_pages_fast((unsigned long)user_addr, pages_nr, gup_flags, transfer->pages);
 	if (pinned < 0) {
 		pr_warn("pin_user_pages_fast failed (err=%ld)\n", pinned);
-		kfree(transfer->pages);
+		if (transfer->pages != transfer->pages_inline)
+			kfree(transfer->pages);
 		transfer->pages = NULL;
 		return (int)pinned;
 	}
@@ -77,26 +94,47 @@ static int map_user_addr_to_sg(struct device *dev, struct mx_transfer *transfer)
 		pr_warn("pin_user_pages_fast partial (req=%u, got=%ld)\n", pages_nr, pinned);
 		if (pinned > 0)
 			unpin_user_pages(transfer->pages, pinned);
-		kfree(transfer->pages);
+		if (transfer->pages != transfer->pages_inline)
+			kfree(transfer->pages);
 		transfer->pages = NULL;
 		return -EFAULT;
 	}
 	transfer->pages_nr = pages_nr;
 
-	/* Alloc sg_table as pages_nr */
-	ret = sg_alloc_table_from_pages(sgt, transfer->pages, pages_nr, offset, size, GFP_KERNEL);
-	if (ret) {
-		pr_warn("sg_alloc_table_from_pages failed (err=%d)\n", ret);
-		unpin_user_pages(transfer->pages, transfer->pages_nr);
-		transfer->pages_nr = 0;
-		return ret;
+	if (pages_nr <= MX_PAGES_INLINE_NR) {
+		/*
+		 * Hand-build a single-entry sg_table using the inline scatterlist.
+		 * Skipping sg_alloc_table_from_pages() saves its internal kmalloc
+		 * plus the dynamic sgl free path in unmap_user_addr_to_sg().
+		 */
+		sg_init_table(transfer->sg_inline, MX_PAGES_INLINE_NR);
+		sg_set_page(&transfer->sg_inline[0], transfer->pages[0], size, offset);
+		sgt->sgl = transfer->sg_inline;
+		sgt->orig_nents = pages_nr;
+		sgt->nents = pages_nr;
+	} else {
+		ret = sg_alloc_table_from_pages(sgt, transfer->pages, pages_nr, offset, size, GFP_KERNEL);
+		if (ret) {
+			pr_warn("sg_alloc_table_from_pages failed (err=%d)\n", ret);
+			unpin_user_pages(transfer->pages, transfer->pages_nr);
+			if (transfer->pages != transfer->pages_inline)
+				kfree(transfer->pages);
+			transfer->pages = NULL;
+			transfer->pages_nr = 0;
+			return ret;
+		}
 	}
 
 	/* Map the given buffer for DMA */
 	sgt->nents = dma_map_sg(dev, sgt->sgl, sgt->orig_nents, transfer->dir);
 	if (!sgt->nents) {
-		sg_free_table(sgt);
+		if (sgt->sgl != transfer->sg_inline)
+			sg_free_table(sgt);
+		sgt->sgl = NULL;
 		unpin_user_pages(transfer->pages, transfer->pages_nr);
+		if (transfer->pages != transfer->pages_inline)
+			kfree(transfer->pages);
+		transfer->pages = NULL;
 		pr_warn("Failed to dma_map_sg\n");
 		return -EIO;
 	}
@@ -169,7 +207,8 @@ int desc_list_alloc(struct mx_pci_dev *mx_pdev, struct mx_transfer *transfer, in
 static void release_mx_transfer(struct mx_transfer *transfer)
 {
 	transfer_id_free(transfer->id);
-	kfree(transfer->command);
+	if (transfer->command && transfer->command != (void *)transfer->cmd_inline)
+		kfree(transfer->command);
 	kfree(transfer);
 }
 
@@ -741,7 +780,8 @@ static void drain_zombie_list(struct mx_pci_dev *mx_pdev, struct list_head *list
 			desc_list_free(mx_pdev, transfer);
 		}
 
-		kfree(transfer->command);
+		if (transfer->command && transfer->command != (void *)transfer->cmd_inline)
+			kfree(transfer->command);
 		kfree(transfer);
 	}
 }

From bb73f2e8c3ccb6ce6cd32a5fec7fee57be7665a9 Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 16:21:37 +0900
Subject: [PATCH 6/8] perf(transfer): allocate mx_transfer from dedicated
 kmem_cache

Replace the generic kmalloc bucket allocation with a SLAB_HWCACHE_ALIGN
kmem_cache sized exactly to struct mx_transfer. The per-cpu slab
magazine keeps freshly freed transfers hot for the next allocation,
cutting slab-partial contention on repeated small-I/O loops.

Cache lifetime is tied to module load: create after class_create() in
mxdma_init(), destroy after the PCI / bus teardown that drains all
in-flight transfers in mxdma_exit().
---
 init.c     | 31 ++++++++++++++++++++++++++++++-
 mx_dma.h   |  8 ++++++++
 transfer.c |  8 ++++----
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/init.c b/init.c
index 0d663a4..979bcbd 100644
--- a/init.c
+++ b/init.c
@@ -6,6 +6,7 @@
 /* Initialization                                                             */
 /******************************************************************************/
 static struct class *mxdma_class;
+struct kmem_cache *mx_transfer_cache;
 
 #ifndef CONFIG_WO_CXL
 static LIST_HEAD(mx_device_list_head);
@@ -540,10 +541,28 @@ static int mxdma_init(void)
 
 	mxdma_class->devnode = mxdma_devnode;
 
+	mx_transfer_cache = kmem_cache_create("mx_transfer",
+					      sizeof(struct mx_transfer), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (!mx_transfer_cache) {
+		pr_err("Failed to create mx_transfer kmem_cache\n");
+		class_destroy(mxdma_class);
+		return -ENOMEM;
+	}
+
 	pr_info("MXDMA driver is loaded\n");
 
 #ifdef CONFIG_WO_CXL
-	return pci_register_driver(&pci_driver);
+	{
+		int ret = pci_register_driver(&pci_driver);
+
+		if (ret) {
+			kmem_cache_destroy(mx_transfer_cache);
+			mx_transfer_cache = NULL;
+			class_destroy(mxdma_class);
+		}
+		return ret;
+	}
 #else
 	bus_register_notifier(&pci_bus_type, &mxdma_pci_notifier);
 	return 0;
@@ -579,6 +598,16 @@ static void mxdma_exit(void)
 	destroy_device_list();
 #endif
 
+	/*
+	 * PCI unregister / device-list teardown above completes all in-flight
+	 * transfers (including zombie drain in remove()), so every mx_transfer
+	 * has been returned to the slab before we destroy the cache.
+	 */
+	if (mx_transfer_cache) {
+		kmem_cache_destroy(mx_transfer_cache);
+		mx_transfer_cache = NULL;
+	}
+
 	if (mxdma_class)
 		class_destroy(mxdma_class);
 
diff --git a/mx_dma.h b/mx_dma.h
index 9b5825f..f433fe3 100644
--- a/mx_dma.h
+++ b/mx_dma.h
@@ -297,6 +297,14 @@ struct mx_pci_dev {
 
 extern struct file_operations *mxdma_fops_array[];
 
+/*
+ * Dedicated slab cache for struct mx_transfer.  Sized exactly to the
+ * transfer and tagged SLAB_HWCACHE_ALIGN so per-op alloc/free hits a
+ * hot per-cpu magazine instead of the generic kmalloc-256/512 buckets.
+ * Created in mxdma_init(), destroyed in mxdma_exit().
+ */
+extern struct kmem_cache *mx_transfer_cache;
+
 int transfer_id_alloc(void *ptr);
 void transfer_id_free(unsigned long id);
 void *find_transfer_by_id(unsigned long id);
diff --git a/transfer.c b/transfer.c
index 852d463..f774e24 100644
--- a/transfer.c
+++ b/transfer.c
@@ -209,7 +209,7 @@ static void release_mx_transfer(struct mx_transfer *transfer)
 	transfer_id_free(transfer->id);
 	if (transfer->command && transfer->command != (void *)transfer->cmd_inline)
 		kfree(transfer->command);
-	kfree(transfer);
+	kmem_cache_free(mx_transfer_cache, transfer);
 }
 
 static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size, uint64_t device_addr,
@@ -217,7 +217,7 @@ static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size
 {
 	struct mx_transfer *transfer;
 
-	transfer = kzalloc(sizeof(struct mx_transfer), GFP_KERNEL);
+	transfer = kmem_cache_zalloc(mx_transfer_cache, GFP_KERNEL);
 	if (!transfer) {
 		return NULL;
 	}
@@ -228,7 +228,7 @@ static struct mx_transfer *alloc_mx_transfer(char __user *user_addr, size_t size
 	transfer->id = transfer_id_alloc(transfer);
 	if (transfer->id < 0) {
 		pr_warn("Failed to alloc transfer_id\n");
-		kfree(transfer);
+		kmem_cache_free(mx_transfer_cache, transfer);
 		return NULL;
 	}
 
@@ -782,7 +782,7 @@ static void drain_zombie_list(struct mx_pci_dev *mx_pdev, struct list_head *list
 
 		if (transfer->command && transfer->command != (void *)transfer->cmd_inline)
 			kfree(transfer->command);
-		kfree(transfer);
+		kmem_cache_free(mx_transfer_cache, transfer);
 	}
 }
 

From ceb93e19295f19062709784f19bd2e8cfd56db81 Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 17:28:09 +0900
Subject: [PATCH 7/8] perf(core_v1): pop_mx_command reads only fields used by
 completion path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v1 profile showed memcpy_fromio(sizeof(struct mx_command)) at ~6.5 %
of total cycles — four MMIO readq per pop — while the completion
path only consumes the header (id / control) and host_addr (result).
size and device_addr are producer-side fields that the host never
reads on completion.

Replace the full 32 B memcpy_fromio with two explicit readq calls that
cover just the required words, saving ~500–1000 ns per op.  Zero the
untouched words so dev_dbg doesn't print stack garbage for them.
---
 core_v1.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/core_v1.c b/core_v1.c
index 868912d..d9efbea 100644
--- a/core_v1.c
+++ b/core_v1.c
@@ -92,7 +92,19 @@ static void pop_mx_command(struct mx_queue_v1 *queue, struct mx_command *comm)
 	void __iomem *data_addr;
 
 	data_addr = (void *)mbox->data_addr + get_data_offset(ctx->head);
-	memcpy_fromio(comm, data_addr, sizeof(struct mx_command));
+
+	/*
+	 * The completion path consumes only the header (id / control) and
+	 * host_addr (result).  size and device_addr are producer-side fields
+	 * unused on the completion side, so skip the extra 2 readq per pop
+	 * (v1 profile shows pop_mx_command memcpy_fromio at ~6.5 % of total).
+	 * Zero the untouched words so any caller that stringifies them (e.g.
+	 * dev_dbg below) prints 0 instead of stack garbage.
+	 */
+	comm->header      = readq(data_addr);
+	comm->size        = 0;
+	comm->device_addr = 0;
+	comm->host_addr   = readq(data_addr + offsetof(struct mx_command, host_addr));
 
 	dev_dbg(queue->common.dev, "CQ- head=0x%02x id=0x%04x op=%u ha=0x%llx da=0x%llx len=%llu\n",
 			ctx->head, comm->id, comm->opcode, comm->host_addr, comm->device_addr, comm->size);

From 98876d5355b0a13d217f13808b06bc20fed712dd Mon Sep 17 00:00:00 2001
From: DAEYEONG LEE <daeyeong.lee@metisx.com>
Date: Thu, 23 Apr 2026 17:28:31 +0900
Subject: [PATCH 8/8] perf(core_v1): skip MMIO refresh in is_pushable when
 cache has headroom
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

is_pushable() readq of the SQ mbox context showed up at ~2.8 % in the
v1 profile because the submit_handler re-checks it on every command.
Tail is advanced only by our own push_mx_command and head only advances
as HW consumes, so the locally tracked free_space is a conservative
lower bound — if we already see room for at least two commands there
is no need to read HW for just this one.

Keep the readq whenever the cached free space is below two commands,
so the HW refresh still drives forward progress as the queue fills up.
---
 core_v1.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/core_v1.c b/core_v1.c
index d9efbea..1b0da59 100644
--- a/core_v1.c
+++ b/core_v1.c
@@ -44,14 +44,22 @@ struct mx_command {
 /******************************************************************************/
 static bool is_pushable(struct mx_queue_v1 *queue)
 {
-	static uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t);
+	static const uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t);
 	struct mx_mbox *mbox = &queue->sq_mbox;
-	uint32_t free_space;
 
-	mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr);
-	free_space = get_free_space(mbox);
+	/*
+	 * Fast path: tail is advanced only by our own push_mx_command and head
+	 * only advances as HW consumes, so free space computed from the cached
+	 * ctx is a conservative lower bound on the true value.  If the cached
+	 * ctx shows room for at least two commands (data_count * 2), skip the
+	 * MMIO readq entirely — v1 profile shows is_pushable() readq at ~2.8 %
+	 * of total cycles in tight submit loops.
+	 */
+	if (get_free_space(mbox) >= data_count * 2)
+		return true;
 
-	return free_space >= data_count;
+	mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr);
+	return get_free_space(mbox) >= data_count;
 }
 
 static bool is_popable(struct mx_queue_v1 *queue)