Skip to content
60 changes: 42 additions & 18 deletions core_v1.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: <SPDX License Expression>

#include <linux/atomic.h>
#include <linux/sched.h>

#include "mx_dma.h"

Expand Down Expand Up @@ -43,14 +44,22 @@ struct mx_command {
/******************************************************************************/
static bool is_pushable(struct mx_queue_v1 *queue)
{
static uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t);
static const uint32_t data_count = sizeof(struct mx_command) / sizeof(uint64_t);
struct mx_mbox *mbox = &queue->sq_mbox;
uint32_t free_space;

mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr);
free_space = get_free_space(mbox);
/*
* Fast path: tail is advanced only by our own push_mx_command and head
* can only grow as HW consumes, so the locally tracked free_space is a
* conservative lower bound on the true value. If the cache still has
* headroom for another full command even after this one, skip the MMIO
* readq entirely — v1 profile shows is_pushable() readq at ~2.8 % of
* total cycles in tight submit loops.
*/
if (get_free_space(mbox) >= data_count * 2)
return true;

return free_space >= data_count;
mbox->ctx.u64 = readq((void *)mbox->r_ctx_addr);
return get_free_space(mbox) >= data_count;
}

static bool is_popable(struct mx_queue_v1 *queue)
Expand Down Expand Up @@ -91,7 +100,19 @@ static void pop_mx_command(struct mx_queue_v1 *queue, struct mx_command *comm)
void __iomem *data_addr;

data_addr = (void *)mbox->data_addr + get_data_offset(ctx->head);
memcpy_fromio(comm, data_addr, sizeof(struct mx_command));

/*
* The completion path consumes only the header (id / control) and
* host_addr (result). size and device_addr are producer-side fields
* unused on the completion side, so skip the extra 2 readq per pop
* (v1 profile shows pop_mx_command memcpy_fromio at ~6.5 % of total).
* Zero the untouched words so any caller that stringifies them (e.g.
* dev_dbg below) prints 0 instead of stack garbage.
*/
comm->header = readq(data_addr);
comm->size = 0;
comm->device_addr = 0;
comm->host_addr = readq(data_addr + offsetof(struct mx_command, host_addr));

dev_dbg(queue->common.dev, "CQ- head=0x%02x id=0x%04x op=%u ha=0x%llx da=0x%llx len=%llu\n",
ctx->head, comm->id, comm->opcode, comm->host_addr, comm->device_addr, comm->size);
Expand Down Expand Up @@ -154,12 +175,10 @@ static const struct mx_queue_ops v1_queue_ops = {

static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode)
{
struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;

if (!comm) {
pr_warn("Failed to allocate mx_command\n");
return NULL;
}
BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE);
memset(comm, 0, sizeof(*comm));

comm->magic = MAGIC_COMMAND;
comm->id = transfer->id;
Expand Down Expand Up @@ -191,15 +210,13 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
comm->host_addr = sg_dma_address(sg);
if (!comm->host_addr) {
pr_warn("Failed to get sg_dma_address\n");
kfree(comm);
return NULL;
}
} else {
comm->page_mode = MXDMA_PAGE_MODE_MULTI;
comm->prp_entry1 = mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, false);
if (!comm->prp_entry1) {
pr_warn("Failed to get desc_list_init\n");
kfree(comm);
return NULL;
}
}
Expand Down Expand Up @@ -235,12 +252,9 @@ static void *create_mx_command_ctrl(struct mx_transfer *transfer, int opcode)

static void *create_mx_command_passthru(struct mx_transfer *transfer, int subopcode)
{
struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;

if (!comm) {
pr_warn("Failed to allocate mx_command for passthru\n");
return NULL;
}
memset(comm, 0, sizeof(*comm));

comm->magic = MAGIC_COMMAND;
comm->opcode = IO_OPCODE_PASSTHRU;
Expand Down Expand Up @@ -313,16 +327,26 @@ static int init_mx_queue(struct mx_pci_dev* mx_pdev)
pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread));
return PTR_ERR(mx_pdev->submit_thread);
}
/*
* SCHED_FIFO (lowest RT band) keeps the handler ahead of CFS noise so
* a userspace I/O submission doesn't pay CFS wake latency when the box
* is busy. Handlers still yield via cond_resched() and sleep in
* swait_event when idle, so softlockup/RCU stalls are not a concern.
*/
sched_set_fifo_low(mx_pdev->submit_thread);

mx_pdev->complete_thread = kthread_run(mx_complete_handler, &queue->common, "mx_complete_thd%d", mx_pdev->dev_id);
if (IS_ERR(mx_pdev->complete_thread)) {
pr_err("Failed to create complete thread (err=%ld)\n", PTR_ERR(mx_pdev->complete_thread));
kthread_stop(mx_pdev->submit_thread);
return PTR_ERR(mx_pdev->complete_thread);
}
sched_set_fifo_low(mx_pdev->complete_thread);

mx_pdev->io_queue = (struct mx_queue *)queue;

mx_bind_handlers_to_numa(mx_pdev);

return 0;
}

Expand Down
18 changes: 10 additions & 8 deletions core_v2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// SPDX-License-Identifier: <SPDX License Expression>

#include <linux/nvme.h>
#include <linux/sched.h>

#include "mx_dma.h"

Expand Down Expand Up @@ -210,12 +211,10 @@ static const struct mx_queue_ops v2_queue_ops = {

static struct mx_command *alloc_mx_command(struct mx_transfer *transfer, int opcode)
{
struct mx_command *comm = kzalloc(sizeof(struct mx_command), GFP_KERNEL);
struct mx_command *comm = (struct mx_command *)transfer->cmd_inline;

if (!comm) {
pr_warn("Failed to allocate mx_command\n");
return NULL;
}
BUILD_BUG_ON(sizeof(struct mx_command) > MX_CMD_INLINE_SIZE);
memset(comm, 0, sizeof(*comm));

comm->opcode = opcode;
comm->command_id = transfer->id;
Expand All @@ -240,7 +239,6 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
comm->prp_entry1 = sg_dma_address(sg);
if (!comm->prp_entry1) {
pr_warn("Failed to get sg_dma_address\n");
kfree(comm);
return NULL;
}

Expand All @@ -253,14 +251,12 @@ static void *create_mx_command_sg(struct mx_pci_dev *mx_pdev, struct mx_transfer
comm->prp_entry2 = sg_dma_address(sg_next(sg));
if (!comm->prp_entry2) {
pr_warn("Failed to get sg_dma_address\n");
kfree(comm);
return NULL;
}
} else {
comm->prp_entry2 = mx_desc_list_init(mx_pdev, transfer, SINGLE_DMA_SIZE, NUM_OF_DESC_PER_LIST, true);
if (!comm->prp_entry2) {
pr_warn("Failed to desc_list_init\n");
kfree(comm);
return NULL;
}
}
Expand Down Expand Up @@ -455,15 +451,21 @@ static int configure_io_queue(struct mx_pci_dev *mx_pdev)
pr_err("Failed to create submit thread (err=%ld)\n", PTR_ERR(mx_pdev->submit_thread));
return PTR_ERR(mx_pdev->submit_thread);
}
/* See core_v1.c: SCHED_FIFO (lowest RT band) for low scheduling latency. */
sched_set_fifo_low(mx_pdev->submit_thread);

mx_pdev->complete_thread = kthread_run(mx_complete_handler, &io_queue->common, "mx_complete_thd%d", mx_pdev->dev_id);
if (IS_ERR(mx_pdev->complete_thread)) {
pr_err("Failed to create complete thread (err=%ld)\n", PTR_ERR(mx_pdev->complete_thread));
kthread_stop(mx_pdev->submit_thread);
return PTR_ERR(mx_pdev->complete_thread);
}
sched_set_fifo_low(mx_pdev->complete_thread);

mx_pdev->io_queue = (struct mx_queue *)io_queue;

mx_bind_handlers_to_numa(mx_pdev);

return 0;
}

Expand Down
5 changes: 5 additions & 0 deletions fops.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ static ssize_t mxdma_device_read_data(struct file *file, char __user *buf, size_
if (ret)
return ret;

mx_prewake_handlers(mx_pdev);
return read_data_from_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_READ);
}

Expand All @@ -115,6 +116,7 @@ static ssize_t mxdma_device_read_context(struct file *file, char __user *buf, si
if (ret)
return ret;

mx_prewake_handlers(mx_pdev);
return read_data_from_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_READ);
}

Expand All @@ -133,6 +135,7 @@ static ssize_t mxdma_device_write_data(struct file *file, const char __user *buf
if (ret)
return ret;

mx_prewake_handlers(mx_pdev);
return write_data_to_device_parallel(mx_pdev, buf, count, pos, IO_OPCODE_DATA_WRITE, false);
}

Expand All @@ -151,6 +154,7 @@ static ssize_t mxdma_device_write_context(struct file *file, const char __user *
if (ret)
return ret;

mx_prewake_handlers(mx_pdev);
return write_data_to_device(mx_pdev, buf, count, pos, IO_OPCODE_CONTEXT_WRITE, false);
}

Expand All @@ -164,6 +168,7 @@ static long mxdma_device_ioctl(struct file *file, unsigned int cmd, unsigned lon
if (ret)
return ret;

mx_prewake_handlers(mx_pdev);
return ioctl_to_device(mx_pdev, cmd, arg);
}

Expand Down
42 changes: 41 additions & 1 deletion init.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
/* Initialization */
/******************************************************************************/
static struct class *mxdma_class;
struct kmem_cache *mx_transfer_cache;

#ifndef CONFIG_WO_CXL
static LIST_HEAD(mx_device_list_head);
Expand Down Expand Up @@ -233,6 +234,9 @@ static void destroy_mx_pdev(struct pci_dev *pdev)
if (!mx_pdev)
return;

if (cpu_latency_qos_request_active(&mx_pdev->cpu_latency_req))
cpu_latency_qos_remove_request(&mx_pdev->cpu_latency_req);

mx_pdev->ops.release_queue(mx_pdev);

if (!IS_ERR_OR_NULL(mx_pdev->zombie_cleanup_thread)) {
Expand Down Expand Up @@ -269,6 +273,14 @@ static int create_mx_pdev(struct pci_dev *pdev, int cxl_memdev_id)
mx_pdev->pdev = pdev;
mx_pdev->dev_id = cxl_memdev_id;

/*
* Hold a cpu_latency PM QoS for the device's lifetime. Blocks deep
* C-states whose exit latency would stretch the freq ramp-up window
* that adds ~12 us to cold DMA submissions in our measurements.
* Removed in destroy_mx_pdev (including the out_fail path).
*/
cpu_latency_qos_add_request(&mx_pdev->cpu_latency_req, MX_CPU_LATENCY_QOS_US);

if (pdev->revision == 0x1) {
register_mx_ops_v1(&mx_pdev->ops);
pr_info("PCI device revision 1 detected\n");
Expand Down Expand Up @@ -529,10 +541,28 @@ static int mxdma_init(void)

mxdma_class->devnode = mxdma_devnode;

mx_transfer_cache = kmem_cache_create("mx_transfer",
sizeof(struct mx_transfer), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!mx_transfer_cache) {
pr_err("Failed to create mx_transfer kmem_cache\n");
class_destroy(mxdma_class);
return -ENOMEM;
}

pr_info("MXDMA driver is loaded\n");

#ifdef CONFIG_WO_CXL
return pci_register_driver(&pci_driver);
{
int ret = pci_register_driver(&pci_driver);

if (ret) {
kmem_cache_destroy(mx_transfer_cache);
mx_transfer_cache = NULL;
class_destroy(mxdma_class);
}
return ret;
}
#else
bus_register_notifier(&pci_bus_type, &mxdma_pci_notifier);
return 0;
Expand Down Expand Up @@ -568,6 +598,16 @@ static void mxdma_exit(void)
destroy_device_list();
#endif

/*
* PCI unregister / device-list teardown above completes all in-flight
* transfers (including zombie drain in remove()), so every mx_transfer
* has been returned to the slab before we destroy the cache.
*/
if (mx_transfer_cache) {
kmem_cache_destroy(mx_transfer_cache);
mx_transfer_cache = NULL;
}

if (mxdma_class)
class_destroy(mxdma_class);

Expand Down
Loading