Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions patches-sonic/0001-PCI-AER-Simplify-pci_print_aer.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
From 642a9f17e1bce39ecc639ed1e6f959edcd039aef Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 22 May 2025 18:21:15 -0500
Subject: [PATCH 01/12] PCI/AER: Simplify pci_print_aer()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simplify pci_print_aer() by initializing the struct aer_err_info "info"
with a designated initializer list (it was previously initialized with
memset()) and using pci_name().

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Link: https://patch.msgid.link/20250522232339.1525671-10-helgaas@kernel.org
(cherry picked from commit ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9)
---
drivers/pci/pcie/aer.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 13b8586924ea..7d7fc4a9fec2 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -730,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
if (info->id && info->error_dev_num > 1 && info->id == id)
pci_err(dev, " Error of this Agent is reported first\n");

- trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+ trace_aer_event(pci_name(dev), (info->status & ~info->mask),
info->severity, info->tlp_header_valid, &info->tlp);
}

@@ -766,7 +766,10 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
{
int layer, agent, tlp_header_valid = 0;
u32 status, mask;
- struct aer_err_info info;
+ struct aer_err_info info = {
+ .severity = aer_severity,
+ .first_error = PCI_ERR_CAP_FEP(aer->cap_control),
+ };

if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
@@ -777,14 +780,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
tlp_header_valid = status & AER_LOG_TLP_MASKS;
}

- layer = AER_GET_LAYER_ERROR(aer_severity, status);
- agent = AER_GET_AGENT(aer_severity, status);
-
- memset(&info, 0, sizeof(info));
- info.severity = aer_severity;
info.status = status;
info.mask = mask;
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
+
+ layer = AER_GET_LAYER_ERROR(aer_severity, status);
+ agent = AER_GET_AGENT(aer_severity, status);

pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
__aer_print_error(dev, &info);
@@ -798,7 +798,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
if (tlp_header_valid)
__print_tlp_header(dev, &aer->header_log);

- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+ trace_aer_event(pci_name(dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
}
EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
--
2.47.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
From dfcd0e7d5008dffb55517a8207d455cab69dc70c Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 22 May 2025 18:21:16 -0500
Subject: [PATCH 02/12] PCI/AER: Update statistics before ratelimiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are two AER logging entry points:

- aer_print_error() is used by DPC (dpc_process_error()) and native AER
handling (aer_process_err_devices()).

- pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL
(cxl_handle_rdport_errors())

Both use __aer_print_error() to print the AER error bits. Previously
__aer_print_error() also incremented the AER statistics via
pci_dev_aer_stats_incr().

Call pci_dev_aer_stats_incr() early in the entry points instead of in
__aer_print_error() so we update the statistics even if the actual printing
of error bits is rate limited by a future change.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20250522232339.1525671-11-helgaas@kernel.org
(cherry picked from commit 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668)
---
drivers/pci/pcie/aer.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 7d7fc4a9fec2..48a33151d145 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -694,7 +694,6 @@ static void __aer_print_error(struct pci_dev *dev,
pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
info->first_error == i ? " (First)" : "");
}
- pci_dev_aer_stats_incr(dev, info);
}

void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
@@ -703,6 +702,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
int id = pci_dev_id(dev);
const char *level;

+ pci_dev_aer_stats_incr(dev, info);
+
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
aer_error_severity_string[info->severity]);
@@ -783,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.status = status;
info.mask = mask;

+ pci_dev_aer_stats_incr(dev, &info);
+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);

--
2.47.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
From b6cbde42bc2406d734511a7b28499f9fda4fd91c Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 22 May 2025 18:21:17 -0500
Subject: [PATCH 03/12] PCI/AER: Trace error event before ratelimiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As with the AER statistics, we always want to emit trace events, even if
the actual dmesg logging is rate limited.

Call trace_aer_event() immediately after pci_dev_aer_stats_incr() so both
happen before ratelimiting.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Link: https://patch.msgid.link/20250522232339.1525671-12-helgaas@kernel.org
(cherry picked from commit 6bb4befbd65fa7f99688fb707e376637e5acfe36)
---
drivers/pci/pcie/aer.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 48a33151d145..6fc993d05647 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -703,6 +703,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
const char *level;

pci_dev_aer_stats_incr(dev, info);
+ trace_aer_event(pci_name(dev), (info->status & ~info->mask),
+ info->severity, info->tlp_header_valid, &info->tlp);

if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -730,9 +732,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
out:
if (info->id && info->error_dev_num > 1 && info->id == id)
pci_err(dev, " Error of this Agent is reported first\n");
-
- trace_aer_event(pci_name(dev), (info->status & ~info->mask),
- info->severity, info->tlp_header_valid, &info->tlp);
}

static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -785,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;

pci_dev_aer_stats_incr(dev, &info);
+ trace_aer_event(pci_name(dev), (status & ~mask),
+ aer_severity, tlp_header_valid, &aer->header_log);

layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);
@@ -800,9 +801,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,

if (tlp_header_valid)
__print_tlp_header(dev, &aer->header_log);
-
- trace_aer_event(pci_name(dev), (status & ~mask),
- aer_severity, tlp_header_valid, &aer->header_log);
}
EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);

--
2.47.0

49 changes: 49 additions & 0 deletions patches-sonic/0004-PCI-AER-Simplify-add_error_device.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
From 824b8af8cc8d81ef95d020dcfcd4338249f1f7de Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 22 May 2025 18:21:23 -0500
Subject: [PATCH 04/12] PCI/AER: Simplify add_error_device()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Return -ENOSPC error early so the usual path through add_error_device() is
the straightline code.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Link: https://patch.msgid.link/20250522232339.1525671-18-helgaas@kernel.org
(cherry picked from commit d72bae423004aa7b4d94c34a7fd0b48b64305a08)
---
drivers/pci/pcie/aer.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 6fc993d05647..a10a005ca6e9 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -811,12 +811,15 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
*/
static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
{
- if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
- e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
- e_info->error_dev_num++;
- return 0;
- }
- return -ENOSPC;
+ int i = e_info->error_dev_num;
+
+ if (i >= AER_MAX_MULTI_ERR_DEVICES)
+ return -ENOSPC;
+
+ e_info->dev[i] = pci_dev_get(dev);
+ e_info->error_dev_num++;
+
+ return 0;
}

/**
--
2.47.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
From 4355136d6587aa7ce7729aa8f35f898993c5f8a4 Mon Sep 17 00:00:00 2001
From: Karolina Stolarek <karolina.stolarek@oracle.com>
Date: Thu, 22 May 2025 18:21:18 -0500
Subject: [PATCH 05/12] PCI/AER: Check log level once and remember it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When reporting an AER error, we check its type multiple times to determine
the log level for each message. Do this check only in the top-level
functions (aer_isr_one_error(), pci_print_aer()) and save the level in
struct aer_err_info.

[bhelgaas: save log level in struct aer_err_info instead of passing it
as a parameter]

Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20250522232339.1525671-13-helgaas@kernel.org
(cherry picked from commit c8f6791e33a7757025285db26f3b382cdcb7f7cd)
---
drivers/pci/pci.h | 1 +
drivers/pci/pcie/aer.c | 18 +++++++++---------
drivers/pci/pcie/dpc.c | 1 +
3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 65df6d2ac003..ce78241b6302 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -505,6 +505,7 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
struct aer_err_info {
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
+ const char *level; /* printk level */

unsigned int id:16;

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a10a005ca6e9..82acb7580fd4 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -675,16 +675,14 @@ static void __aer_print_error(struct pci_dev *dev,
{
const char **strings;
unsigned long status = info->status & ~info->mask;
- const char *level, *errmsg;
+ const char *level = info->level;
+ const char *errmsg;
int i;

- if (info->severity == AER_CORRECTABLE) {
+ if (info->severity == AER_CORRECTABLE)
strings = aer_correctable_error_string;
- level = KERN_WARNING;
- } else {
+ else
strings = aer_uncorrectable_error_string;
- level = KERN_ERR;
- }

for_each_set_bit(i, &status, 32) {
errmsg = strings[i];
@@ -700,7 +698,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
int layer, agent;
int id = pci_dev_id(dev);
- const char *level;
+ const char *level = info->level;

pci_dev_aer_stats_incr(dev, info);
trace_aer_event(pci_name(dev), (info->status & ~info->mask),
@@ -715,8 +713,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);

- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
-
pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
aer_error_severity_string[info->severity],
aer_error_layer[layer], aer_agent_string[agent]);
@@ -774,9 +770,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
mask = aer->cor_mask;
+ info.level = KERN_WARNING;
} else {
status = aer->uncor_status;
mask = aer->uncor_mask;
+ info.level = KERN_ERR;
tlp_header_valid = status & AER_LOG_TLP_MASKS;
}

@@ -1291,6 +1289,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
e_info.id = ERR_COR_ID(e_src->id);
e_info.severity = AER_CORRECTABLE;
+ e_info.level = KERN_WARNING;

if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
e_info.multi_error_valid = 1;
@@ -1304,6 +1303,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,

if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
e_info.id = ERR_UNCOR_ID(e_src->id);
+ e_info.level = KERN_ERR;

if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
e_info.severity = AER_FATAL;
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index cdc54315d879..d830696dccec 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -254,6 +254,7 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
else
info->severity = AER_NONFATAL;

+ info->level = KERN_ERR;
return 1;
}

--
2.47.0

Loading