From f4549ce1f34c4ef1012e9ffdb4d5e4e41dab4b6f Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:41 +0800 Subject: [PATCH 01/24] FROMLIST: ACPI/AEST: Parse the AEST table This patch introduces the creation of AEST platform devices, where each device represents a logical "error node device" grouping one or more AEST nodes from the ACPI table. Instead of relying on the optional 'error_node_device' field in the AEST table[1], this commit uses the interrupt number as the sole identifier for the parent device. This design simplifies the driver logic by providing a single, consistent mechanism for grouping nodes. The 'error_node_device' field can be unspecified, but an AEST node is always physically associated with a parent component. The interrupt number serves as a reliable proxy for this association. This approach is based on the safe assumption that distinct hardware components (e.g., SMMU, CMN, GIC) are assigned unique error interrupts and do not share them. [1]: https://developer.arm.com/documentation/den0085/latest Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-2-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- MAINTAINERS | 8 + arch/arm64/include/asm/ras.h | 15 ++ drivers/acpi/arm64/Kconfig | 11 ++ drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/aest.c | 311 +++++++++++++++++++++++++++++++++++ include/linux/acpi_aest.h | 56 +++++++ 6 files changed, 402 insertions(+) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 drivers/acpi/arm64/aest.c create mode 100644 include/linux/acpi_aest.h diff --git a/MAINTAINERS b/MAINTAINERS index f35e1769fa729..d4e7121ce96fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -333,6 +333,14 @@ S: Maintained F: drivers/acpi/arm64 F: include/linux/acpi_iort.h +ACPI AEST +M: Ruidong Tian +L: linux-acpi@vger.kernel.org +L: linux-arm-kernel@lists.infradead.org +S: Supported +F: drivers/acpi/arm64/aest.c +F: include/linux/acpi_aest.h + ACPI FOR RISC-V 
(ACPI/riscv) M: Sunil V L L: linux-acpi@vger.kernel.org diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 0000000000000..b6640b9972bfb --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_RAS_H +#define __ASM_RAS_H + +#include + +struct ras_ext_regs { + u64 err_fr; + u64 err_ctlr; + u64 err_status; + u64 err_addr; + u64 err_misc[4]; +}; + +#endif /* __ASM_RAS_H */ diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index f2fd79f22e7d8..52df190356c82 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -24,3 +24,14 @@ config ACPI_APMT config ACPI_MPAM bool + +config ACPI_AEST + bool "ARM Error Source Table Support" + depends on ARM64_RAS_EXTN + + help + The Arm Error Source Table (AEST) provides details on ACPI + extensions that enable kernel-first handling of errors in a + system that supports the Armv8 RAS extensions. + + If set, the kernel will report and log hardware errors. diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 9390b57cb5648..bad77fdbf8dd0 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_ACPI_IORT) += iort.o obj-$(CONFIG_ACPI_MPAM) += mpam.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ARM_AMBA) += amba.o +obj-$(CONFIG_ACPI_AEST) += aest.o obj-y += dma.o init.o obj-y += thermal_cpufreq.o diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c new file mode 100644 index 0000000000000..b8359b95f40f9 --- /dev/null +++ b/drivers/acpi/arm64/aest.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Error Source Table Support + * + * Copyright (c) 2025, Alibaba Group. 
+ */ + +#include +#include +#include + +#include "init.h" + +#undef pr_fmt +#define pr_fmt(fmt) "ACPI AEST: " fmt + +static struct xarray *aest_array; + +static void __init aest_init_interface(struct acpi_aest_hdr *hdr, + struct acpi_aest_node *node) +{ + struct acpi_aest_node_interface_header *interface; + + interface = ACPI_ADD_PTR(struct acpi_aest_node_interface_header, hdr, + hdr->node_interface_offset); + + node->type = hdr->type; + node->interface_hdr = interface; + + switch (interface->group_format) { + case ACPI_AEST_NODE_GROUP_FORMAT_4K: { + struct acpi_aest_node_interface_4k *interface_4k = + (struct acpi_aest_node_interface_4k *)(interface + 1); + + node->common = &interface_4k->common; + node->record_implemented = + (unsigned long *)&interface_4k->error_record_implemented; + node->status_reporting = + (unsigned long *)&interface_4k->error_status_reporting; + node->addressing_mode = + (unsigned long *)&interface_4k->addressing_mode; + break; + } + case ACPI_AEST_NODE_GROUP_FORMAT_16K: { + struct acpi_aest_node_interface_16k *interface_16k = + (struct acpi_aest_node_interface_16k *)(interface + 1); + + node->common = &interface_16k->common; + node->record_implemented = + (unsigned long *)interface_16k->error_record_implemented; + node->status_reporting = + (unsigned long *)interface_16k->error_status_reporting; + node->addressing_mode = + (unsigned long *)interface_16k->addressing_mode; + break; + } + case ACPI_AEST_NODE_GROUP_FORMAT_64K: { + struct acpi_aest_node_interface_64k *interface_64k = + (struct acpi_aest_node_interface_64k *)(interface + 1); + + node->common = &interface_64k->common; + node->record_implemented = + (unsigned long *)interface_64k->error_record_implemented; + node->status_reporting = + (unsigned long *)interface_64k->error_status_reporting; + node->addressing_mode = + (unsigned long *)interface_64k->addressing_mode; + break; + } + default: + pr_err("invalid group format: %d\n", interface->group_format); + } + + node->interrupt = 
ACPI_ADD_PTR(struct acpi_aest_node_interrupt_v2, hdr, + hdr->node_interrupt_offset); + + node->interrupt_count = hdr->node_interrupt_count; +} + +static struct aest_hnode *__init +acpi_aest_alloc_ahnode(struct acpi_aest_node *node, u64 error_device_id) +{ + struct aest_hnode *ahnode __free(kfree) = NULL; + + ahnode = kzalloc(sizeof(*ahnode), GFP_KERNEL); + if (!ahnode) + return NULL; + + INIT_LIST_HEAD(&ahnode->list); + ahnode->id = error_device_id; + ahnode->count = 0; + ahnode->type = node->type; + + return_ptr(ahnode); +} +static int __init acpi_aest_init_node(struct acpi_aest_hdr *aest_hdr) +{ + struct aest_hnode *ahnode; + u64 error_device_id; + struct acpi_aest_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->spec_pointer = + ACPI_ADD_PTR(void, aest_hdr, aest_hdr->node_specific_offset); + if (aest_hdr->type == ACPI_AEST_PROCESSOR_ERROR_NODE) + node->processor_spec_pointer = + ACPI_ADD_PTR(void, node->spec_pointer, + sizeof(struct acpi_aest_processor)); + + aest_init_interface(aest_hdr, node); + + if (node->interrupt_count <= 0) + return -EINVAL; + + error_device_id = node->interrupt[0].gsiv; + ahnode = xa_load(aest_array, error_device_id); + if (!ahnode) { + ahnode = acpi_aest_alloc_ahnode(node, error_device_id); + if (!ahnode) + return -ENOMEM; + xa_store(aest_array, error_device_id, ahnode, GFP_KERNEL); + } + + list_add_tail(&node->list, &ahnode->list); + ahnode->count++; + + return 0; +} + +static int __init acpi_aest_init_nodes(struct acpi_table_header *aest_table) +{ + struct acpi_aest_hdr *aest_node, *aest_end; + struct acpi_table_aest *aest; + int rc; + + aest = (struct acpi_table_aest *)aest_table; + aest_node = ACPI_ADD_PTR(struct acpi_aest_hdr, aest, + sizeof(struct acpi_table_header)); + aest_end = ACPI_ADD_PTR(struct acpi_aest_hdr, aest, aest_table->length); + + while (aest_node < aest_end) { + if (((u64)aest_node + aest_node->length) > (u64)aest_end) { + pr_warn(FW_WARN + "AEST node pointer 
overflow, bad table.\n"); + return -EINVAL; + } + + rc = acpi_aest_init_node(aest_node); + if (rc) + return rc; + + aest_node = ACPI_ADD_PTR(struct acpi_aest_hdr, aest_node, + aest_node->length); + } + + return 0; +} + +static int acpi_aest_parse_irqs(struct platform_device *pdev, + struct acpi_aest_node *anode, + struct resource *res, int *res_idx, int irqs[2]) +{ + int i; + struct acpi_aest_node_interrupt_v2 *interrupt; + int trigger, irq; + + for (i = 0; i < anode->interrupt_count; i++) { + interrupt = &anode->interrupt[i]; + if (irqs[interrupt->type]) + continue; + + trigger = (interrupt->flags & AEST_INTERRUPT_MODE) ? + ACPI_LEVEL_SENSITIVE : + ACPI_EDGE_SENSITIVE; + + irq = acpi_register_gsi(&pdev->dev, interrupt->gsiv, trigger, + ACPI_ACTIVE_HIGH); + if (irq <= 0) { + pr_err("failed to map AEST GSI %d\n", interrupt->gsiv); + return irq; + } + + res[*res_idx].start = irq; + res[*res_idx].end = irq; + res[*res_idx].flags = IORESOURCE_IRQ; + res[*res_idx].name = interrupt->type ? AEST_ERI_NAME : + AEST_FHI_NAME; + + (*res_idx)++; + + irqs[interrupt->type] = irq; + } + + return 0; +} + +DEFINE_FREE(res, struct resource *, if (_T) kfree(_T)) +static struct platform_device *__init +acpi_aest_alloc_pdev(struct aest_hnode *ahnode, int index) +{ + struct platform_device *pdev __free(platform_device_put) = + platform_device_alloc("AEST", index++); + struct resource *res __free(res); + struct acpi_aest_node *anode; + int ret, size, j, irq[AEST_MAX_INTERRUPT_PER_NODE] = { 0 }; + + if (!pdev) + return ERR_PTR(-ENOMEM); + + res = kcalloc(ahnode->count + AEST_MAX_INTERRUPT_PER_NODE, sizeof(*res), + GFP_KERNEL); + if (!res) + return ERR_PTR(-ENOMEM); + + j = 0; + list_for_each_entry(anode, &ahnode->list, list) { + if (anode->interface_hdr->type != + ACPI_AEST_NODE_SYSTEM_REGISTER) { + res[j].name = AEST_NODE_NAME; + res[j].start = anode->interface_hdr->address; + switch (anode->interface_hdr->group_format) { + case ACPI_AEST_NODE_GROUP_FORMAT_4K: + size = 4 * KB; + break; + 
case ACPI_AEST_NODE_GROUP_FORMAT_16K: + size = 16 * KB; + break; + case ACPI_AEST_NODE_GROUP_FORMAT_64K: + size = 64 * KB; + break; + default: + size = 4 * KB; + } + res[j].end = res[j].start + size - 1; + res[j].flags = IORESOURCE_MEM; + } + + ret = acpi_aest_parse_irqs(pdev, anode, res, &j, irq); + if (ret) + return ERR_PTR(ret); + } + + ret = platform_device_add_resources(pdev, res, j); + if (ret) + return ERR_PTR(ret); + + ret = platform_device_add_data(pdev, &ahnode, sizeof(ahnode)); + if (ret) + return ERR_PTR(ret); + + ret = platform_device_add(pdev); + if (ret) + return ERR_PTR(ret); + + return_ptr(pdev); +} +static int __init acpi_aest_alloc_pdevs(void) +{ + int ret = 0, index = 0; + struct aest_hnode *ahnode = NULL; + unsigned long i; + + xa_for_each(aest_array, i, ahnode) { + struct platform_device *pdev = + acpi_aest_alloc_pdev(ahnode, index++); + + if (IS_ERR(pdev)) { + ret = PTR_ERR(pdev); + break; + } + } + + return ret; +} + +static int __init acpi_aest_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + struct acpi_table_header *aest_table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_AEST, 0); + if (IS_ERR(aest_table)) + return 0; + + aest_array = kzalloc(sizeof(struct xarray), GFP_KERNEL); + if (!aest_array) + return -ENOMEM; + + xa_init(aest_array); + + ret = acpi_aest_init_nodes(aest_table); + if (ret) { + pr_err("Failed init aest node %d\n", ret); + return ret; + } + + ret = acpi_aest_alloc_pdevs(); + if (ret) { + pr_err("Failed alloc pdev %d\n", ret); + return ret; + } + + return 0; +} +subsys_initcall_sync(acpi_aest_init); diff --git a/include/linux/acpi_aest.h b/include/linux/acpi_aest.h new file mode 100644 index 0000000000000..53c1970e7583b --- /dev/null +++ b/include/linux/acpi_aest.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ACPI_AEST_H__ +#define __ACPI_AEST_H__ + +#include +#include + +/* AEST resource name */ +#define AEST_NODE_NAME "AEST:NODE" +#define AEST_FHI_NAME "AEST:FHI" 
+#define AEST_ERI_NAME "AEST:ERI" + +/* AEST interrupt */ +#define AEST_INTERRUPT_MODE BIT(0) + +#define AEST_MAX_INTERRUPT_PER_NODE 2 + +#define KB 1024 +#define MB (1024 * KB) +#define GB (1024 * MB) + +struct aest_hnode { + struct list_head list; + int count; + u32 id; + int type; +}; + +struct acpi_aest_node { + struct list_head list; + int type; + struct acpi_aest_node_interface_header *interface_hdr; + unsigned long *record_implemented; + unsigned long *status_reporting; + unsigned long *addressing_mode; + struct acpi_aest_node_interface_common *common; + union { + struct acpi_aest_processor *processor; + struct acpi_aest_memory *memory; + struct acpi_aest_smmu *smmu; + struct acpi_aest_vendor_v2 *vendor; + struct acpi_aest_gic *gic; + struct acpi_aest_pcie *pcie; + struct acpi_aest_proxy *proxy; + void *spec_pointer; + }; + union { + struct acpi_aest_processor_cache *cache; + struct acpi_aest_processor_tlb *tlb; + struct acpi_aest_processor_generic *generic; + void *processor_spec_pointer; + }; + struct acpi_aest_node_interrupt_v2 *interrupt; + int interrupt_count; +}; +#endif /* __ACPI_AEST_H__ */ From 78f543712abe37503120a7cb5e8a9a9872928ba8 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:42 +0800 Subject: [PATCH 02/24] FROMLIST: ras: AEST: Add probe/remove for AEST driver Parse register information from the AEST table in the probe function, create the corresponding structures, and map the AEST error records. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-3-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- MAINTAINERS | 2 + drivers/ras/Kconfig | 1 + drivers/ras/Makefile | 1 + drivers/ras/aest/Kconfig | 17 +++ drivers/ras/aest/Makefile | 5 + drivers/ras/aest/aest-core.c | 217 +++++++++++++++++++++++++++++++++++ drivers/ras/aest/aest.h | 124 ++++++++++++++++++++ include/linux/acpi_aest.h | 9 ++ 8 files changed, 376 insertions(+) create mode 100644 drivers/ras/aest/Kconfig create mode 100644 drivers/ras/aest/Makefile create mode 100644 drivers/ras/aest/aest-core.c create mode 100644 drivers/ras/aest/aest.h diff --git a/MAINTAINERS b/MAINTAINERS index d4e7121ce96fb..b272af3770dac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -338,7 +338,9 @@ M: Ruidong Tian L: linux-acpi@vger.kernel.org L: linux-arm-kernel@lists.infradead.org S: Supported +F: arch/arm64/include/asm/ras.h F: drivers/acpi/arm64/aest.c +F: drivers/ras/aest/ F: include/linux/acpi_aest.h ACPI FOR RISC-V (ACPI/riscv) diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index fc4f4bb94a4c6..61a2a05d9c949 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -33,6 +33,7 @@ if RAS source "arch/x86/ras/Kconfig" source "drivers/ras/amd/atl/Kconfig" +source "drivers/ras/aest/Kconfig" config RAS_FMPM tristate "FRU Memory Poison Manager" diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 11f95d59d3972..72411ee9deafd 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_RAS_CEC) += cec.o obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ +obj-y += aest/ diff --git a/drivers/ras/aest/Kconfig b/drivers/ras/aest/Kconfig new file mode 100644 index 0000000000000..0b09a5d5acce3 --- /dev/null +++ b/drivers/ras/aest/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# ARM Error Source Table Support +# +# Copyright (c) 2025, Alibaba Group. 
+# + +config AEST + tristate "ARM AEST Driver" + depends on ACPI_AEST && RAS + + help + The Arm Error Source Table (AEST) provides details on ACPI + extensions that enable kernel-first handling of errors in a + system that supports the Armv8 RAS extensions. + + If set, the kernel will report and log hardware errors. diff --git a/drivers/ras/aest/Makefile b/drivers/ras/aest/Makefile new file mode 100644 index 0000000000000..a6ba7e36fb432 --- /dev/null +++ b/drivers/ras/aest/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_AEST) += aest.o + +aest-y := aest-core.o diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c new file mode 100644 index 0000000000000..c7ef6c13fd440 --- /dev/null +++ b/drivers/ras/aest/aest-core.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Error Source Table Support + * + * Copyright (c) 2025, Alibaba Group. + */ + +#include +#include +#include + +#include "aest.h" + +DEFINE_PER_CPU(struct aest_device, percpu_adev); + +#undef pr_fmt +#define pr_fmt(fmt) "AEST: " fmt + +static int aest_init_record(struct aest_record *record, int i, + struct aest_node *node) +{ + struct device *dev = node->adev->dev; + + record->name = devm_kasprintf(dev, GFP_KERNEL, "record%d", i); + if (!record->name) + return -ENOMEM; + + if (node->base) + record->regs_base = + node->base + sizeof(struct ras_ext_regs) * i; + + record->addressing_mode = test_bit(i, node->info->addressing_mode); + record->index = i; + record->node = node; + + aest_record_dbg(record, "base: %p, index: %d, address mode: %x\n", + record->regs_base, record->index, + record->addressing_mode); + return 0; +} + +static void aest_device_remove(struct platform_device *pdev) +{ + platform_set_drvdata(pdev, NULL); +} + +static char *alloc_aest_node_name(struct aest_node *node) +{ + char *name; + + switch (node->type) { + case ACPI_AEST_PROCESSOR_ERROR_NODE: + name = devm_kasprintf(node->adev->dev, GFP_KERNEL, "%s.%d", + 
aest_node_name[node->type], + node->info->processor->processor_id); + break; + case ACPI_AEST_MEMORY_ERROR_NODE: + case ACPI_AEST_SMMU_ERROR_NODE: + case ACPI_AEST_VENDOR_ERROR_NODE: + case ACPI_AEST_GIC_ERROR_NODE: + case ACPI_AEST_PCIE_ERROR_NODE: + case ACPI_AEST_PROXY_ERROR_NODE: + name = devm_kasprintf(node->adev->dev, GFP_KERNEL, "%s.%llx", + aest_node_name[node->type], + node->info->interface_hdr->address); + break; + default: + name = devm_kasprintf(node->adev->dev, GFP_KERNEL, "Unknown"); + } + + return name; +} + +static int aest_node_set_errgsr(struct aest_device *adev, + struct aest_node *node) +{ + struct acpi_aest_node *anode = node->info; + u64 errgsr_base = anode->common->error_group_register_base; + + if (anode->interface_hdr->type != ACPI_AEST_NODE_MEMORY_MAPPED) + return 0; + + if (!node->base) + return 0; + + if (!(anode->interface_hdr->flags & AEST_XFACE_FLAG_ERROR_GROUP)) { + node->errgsr = node->base + ERXGROUP; + return 0; + } + + if (!errgsr_base) + return -EINVAL; + + node->errgsr = devm_ioremap(adev->dev, errgsr_base, PAGE_SIZE); + if (!node->errgsr) + return -ENOMEM; + + return 0; +} + +static int aest_init_node(struct aest_device *adev, struct aest_node *node, + struct acpi_aest_node *anode) +{ + int i, ret; + u64 address; + + node->adev = adev; + node->info = anode; + node->type = anode->type; + node->name = alloc_aest_node_name(node); + if (!node->name) + return -ENOMEM; + node->record_implemented = anode->record_implemented; + node->status_reporting = anode->status_reporting; + + address = anode->interface_hdr->address; + if (address) { + node->base = devm_ioremap(adev->dev, address, PAGE_SIZE); + if (!node->base) + return -ENOMEM; + } + + ret = aest_node_set_errgsr(adev, node); + if (ret) + return ret; + + node->record_count = anode->interface_hdr->error_record_count; + node->records = devm_kcalloc(adev->dev, node->record_count, + sizeof(struct aest_record), GFP_KERNEL); + if (!node->records) + return -ENOMEM; + + for (i = 0; i < 
node->record_count; i++) { + ret = aest_init_record(&node->records[i], i, node); + if (ret) + return ret; + } + aest_node_dbg(node, "%d records, base: %llx, errgsr: %llx\n", + node->record_count, (u64)node->base, (u64)node->errgsr); + return 0; +} + +static int aest_init_nodes(struct aest_device *adev, struct aest_hnode *ahnode) +{ + struct acpi_aest_node *anode; + struct aest_node *node; + int ret, i = 0; + + adev->node_cnt = ahnode->count; + adev->nodes = devm_kcalloc(adev->dev, adev->node_cnt, + sizeof(struct aest_node), GFP_KERNEL); + if (!adev->nodes) + return -ENOMEM; + + list_for_each_entry(anode, &ahnode->list, list) { + adev->type = anode->type; + + node = &adev->nodes[i++]; + ret = aest_init_node(adev, node, anode); + if (ret) + return ret; + } + + return 0; +} + +static int aest_device_probe(struct platform_device *pdev) +{ + int ret; + struct aest_device *adev; + struct aest_hnode *ahnode; + + ahnode = *((struct aest_hnode **)pdev->dev.platform_data); + if (!ahnode) + return -ENODEV; + + adev = devm_kzalloc(&pdev->dev, sizeof(*adev), GFP_KERNEL); + if (!adev) + return -ENOMEM; + + adev->dev = &pdev->dev; + adev->id = pdev->id; + aest_set_name(adev, ahnode); + ret = aest_init_nodes(adev, ahnode); + if (ret) + return ret; + + platform_set_drvdata(pdev, adev); + + aest_dev_dbg(adev, "Node cnt: %x, id: %x\n", adev->node_cnt, adev->id); + + return 0; +} + +static struct platform_driver aest_driver = { + .driver = { + .name = "AEST", + }, + .probe = aest_device_probe, + .remove = aest_device_remove, +}; + +static int __init aest_init(void) +{ + return platform_driver_register(&aest_driver); +} +module_init(aest_init); + +static void __exit aest_exit(void) +{ + platform_driver_unregister(&aest_driver); +} +module_exit(aest_exit); + +MODULE_DESCRIPTION("ARM AEST Driver"); +MODULE_AUTHOR("Ruidong Tian "); +MODULE_LICENSE("GPL"); diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h new file mode 100644 index 0000000000000..d918240c3f57e --- /dev/null 
+++ b/drivers/ras/aest/aest.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Error Source Table Support + * + * Copyright (c) 2025, Alibaba Group. + */ + +#include +#include + +#define MAX_GSI_PER_NODE 2 + +#define aest_dev_err(__adev, format, ...) \ + dev_err((__adev)->dev, format, ##__VA_ARGS__) +#define aest_dev_info(__adev, format, ...) \ + dev_info((__adev)->dev, format, ##__VA_ARGS__) +#define aest_dev_dbg(__adev, format, ...) \ + dev_dbg((__adev)->dev, format, ##__VA_ARGS__) + +#define aest_node_err(__node, format, ...) \ + dev_err((__node)->adev->dev, "%s: " format, (__node)->name, \ + ##__VA_ARGS__) +#define aest_node_info(__node, format, ...) \ + dev_info((__node)->adev->dev, "%s: " format, (__node)->name, \ + ##__VA_ARGS__) +#define aest_node_dbg(__node, format, ...) \ + dev_dbg((__node)->adev->dev, "%s: " format, (__node)->name, \ + ##__VA_ARGS__) + +#define aest_record_err(__record, format, ...) \ + dev_err((__record)->node->adev->dev, "%s: %s: " format, \ + (__record)->node->name, (__record)->name, ##__VA_ARGS__) +#define aest_record_info(__record, format, ...) \ + dev_info((__record)->node->adev->dev, "%s: %s: " format, \ + (__record)->node->name, (__record)->name, ##__VA_ARGS__) +#define aest_record_dbg(__record, format, ...) \ + dev_dbg((__record)->node->adev->dev, "%s: %s: " format, \ + (__record)->node->name, (__record)->name, ##__VA_ARGS__) + +#define ERXGROUP 0xE00 + +struct aest_record { + char *name; + int index; + void __iomem *regs_base; + + /* + * This bit specifies the addressing mode to populate the ERR_ADDR + * register: + * 0b: Error record reports System Physical Addresses (SPA) in + * the ERR_ADDR register. + * 1b: Error record reports error node-specific Logical Addresses(LA) + * in the ERR_ADD register. 
OS must use other means to translate + * the reported LA into SPA + */ + int addressing_mode; + struct aest_node *node; +}; + +struct aest_node { + char *name; + u8 type; + void *errgsr; + void *base; + + /* + * This bitmap indicates which of the error records within this error + * node must be polled for error status. + * Bit[n] of this field pertains to error record corresponding to + * index n in this error group. + * Bit[n] = 0b: Error record at index n needs to be polled. + * Bit[n] = 1b: Error record at index n do not needs to be polled. + */ + unsigned long *record_implemented; + /* + * This bitmap indicates which of the error records within this error + * node support error status reporting using ERRGSR register. + * Bit[n] of this field pertains to error record corresponding to + * index n in this error group. + * Bit[n] = 0b: Error record at index n supports error status reporting + * through ERRGSR.S. + * Bit[n] = 1b: Error record at index n does not support error reporting + * through the ERRGSR.S bit If this error record is + * implemented, then it must be polled explicitly for + * error events. 
+ */ + unsigned long *status_reporting; + + struct aest_device *adev; + struct acpi_aest_node *info; + + int record_count; + struct aest_record *records; +}; + +struct aest_device { + struct device *dev; + u32 type; + int node_cnt; + struct aest_node *nodes; + u32 id; +}; + +static const char *const aest_node_name[] = { + [ACPI_AEST_PROCESSOR_ERROR_NODE] = "processor", + [ACPI_AEST_MEMORY_ERROR_NODE] = "memory", + [ACPI_AEST_SMMU_ERROR_NODE] = "smmu", + [ACPI_AEST_VENDOR_ERROR_NODE] = "vendor", + [ACPI_AEST_GIC_ERROR_NODE] = "gic", + [ACPI_AEST_PCIE_ERROR_NODE] = "pcie", + [ACPI_AEST_PROXY_ERROR_NODE] = "proxy", +}; + +static inline int aest_set_name(struct aest_device *adev, + struct aest_hnode *ahnode) +{ + adev->dev->init_name = devm_kasprintf(adev->dev, GFP_KERNEL, "%s%d", + aest_node_name[ahnode->type], + adev->id); + if (!adev->dev->init_name) + return -ENOMEM; + + return 0; +} diff --git a/include/linux/acpi_aest.h b/include/linux/acpi_aest.h index 53c1970e7583b..77187ce43d44b 100644 --- a/include/linux/acpi_aest.h +++ b/include/linux/acpi_aest.h @@ -15,6 +15,15 @@ #define AEST_MAX_INTERRUPT_PER_NODE 2 +/* AEST interface */ +#define AEST_XFACE_FLAG_SHARED (1 << 0) +#define AEST_XFACE_FLAG_CLEAR_MISC (1 << 1) +#define AEST_XFACE_FLAG_ERROR_DEVICE (1 << 2) +#define AEST_XFACE_FLAG_AFFINITY (1 << 3) +#define AEST_XFACE_FLAG_ERROR_GROUP (1 << 4) +#define AEST_XFACE_FLAG_FAULT_INJECT (1 << 5) +#define AEST_XFACE_FLAG_INT_CONFIG (1 << 6) + #define KB 1024 #define MB (1024 * KB) #define GB (1024 * MB) From b9b9c07ef428fa548e657964dbfb637058cf9d6d Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:43 +0800 Subject: [PATCH 03/24] FROMLIST: ras: AEST: support different group format Support for various AEST group formats allows for flexible configuration of AEST node address space sizes and maximum record counts per group. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-4-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 6 ++++-- drivers/ras/aest/aest.h | 39 +++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index c7ef6c13fd440..acebb293ac75a 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -84,7 +84,7 @@ static int aest_node_set_errgsr(struct aest_device *adev, return 0; if (!(anode->interface_hdr->flags & AEST_XFACE_FLAG_ERROR_GROUP)) { - node->errgsr = node->base + ERXGROUP; + node->errgsr = node->base + node->group->errgsr_offset; return 0; } @@ -112,10 +112,12 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, return -ENOMEM; node->record_implemented = anode->record_implemented; node->status_reporting = anode->status_reporting; + node->group = &aest_group_config[anode->interface_hdr->group_format]; address = anode->interface_hdr->address; if (address) { - node->base = devm_ioremap(adev->dev, address, PAGE_SIZE); + node->base = + devm_ioremap(adev->dev, address, node->group->size); if (!node->base) return -ENOMEM; } diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index d918240c3f57e..3250675e99b77 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -37,7 +37,15 @@ dev_dbg((__record)->node->adev->dev, "%s: %s: " format, \ (__record)->node->name, (__record)->name, ##__VA_ARGS__) -#define ERXGROUP 0xE00 +#define ERXGROUP_4K_OFFSET 0xE00 +#define ERXGROUP_16K_OFFSET 0x3800 +#define ERXGROUP_64K_OFFSET 0xE000 +#define ERXGROUP_4K_SIZE (4 * KB) +#define ERXGROUP_16K_SIZE (16 * KB) +#define ERXGROUP_64K_SIZE (64 * KB) +#define ERXGROUP_4K_ERRGSR_NUM 1 +#define ERXGROUP_16K_ERRGSR_NUM 4 +#define ERXGROUP_64K_ERRGSR_NUM 14 struct aest_record { char *name; @@ -57,6 +65,34 @@ struct aest_record { struct aest_node *node; 
+}; + +struct aest_group { + int type; + int errgsr_num; + size_t size; + u64 errgsr_offset; +}; + +static const struct aest_group aest_group_config[] = { + [ACPI_AEST_NODE_GROUP_FORMAT_4K] = { + .type = ACPI_AEST_NODE_GROUP_FORMAT_4K, + .errgsr_num = ERXGROUP_4K_ERRGSR_NUM, + .size = ERXGROUP_4K_SIZE, + .errgsr_offset = ERXGROUP_4K_OFFSET, + }, + [ACPI_AEST_NODE_GROUP_FORMAT_16K] = { + .type = ACPI_AEST_NODE_GROUP_FORMAT_16K, + .errgsr_num = ERXGROUP_16K_ERRGSR_NUM, + .size = ERXGROUP_16K_SIZE, + .errgsr_offset = ERXGROUP_16K_OFFSET, + }, + [ACPI_AEST_NODE_GROUP_FORMAT_64K] = { + .type = ACPI_AEST_NODE_GROUP_FORMAT_64K, + .errgsr_num = ERXGROUP_64K_ERRGSR_NUM, + .size = ERXGROUP_64K_SIZE, + .errgsr_offset = ERXGROUP_64K_OFFSET, + }, +}; + struct aest_node { char *name; u8 type; @@ -86,6 +122,7 @@ struct aest_node { */ unsigned long *status_reporting; + const struct aest_group *group; struct aest_device *adev; struct acpi_aest_node *info; From 6b6b980289701cd2e249c8979c900ed4b0479a6e Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:44 +0800 Subject: [PATCH 04/24] FROMLIST: ras: AEST: Unify the read/write interface for system and MMIO register Use record_read()/record_write() as a single interface for accessing both system registers and MMIO registers, keeping the code concise. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-5-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 1 + drivers/ras/aest/aest.h | 94 ++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index acebb293ac75a..f4a5119dc513b 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -29,6 +29,7 @@ static int aest_init_record(struct aest_record *record, int i, record->regs_base = node->base + sizeof(struct ras_ext_regs) * i; + record->access = &aest_access[node->info->interface_hdr->type]; record->addressing_mode = test_bit(i, node->info->addressing_mode); record->index = i; record->node = node; diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 3250675e99b77..31131cce99281 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -10,6 +10,11 @@ #define MAX_GSI_PER_NODE 2 +#define record_read(record, offset) \ + record->access->read(record->regs_base, offset) +#define record_write(record, offset, val) \ + record->access->write(record->regs_base, offset, val) + #define aest_dev_err(__adev, format, ...) \ dev_err((__adev)->dev, format, ##__VA_ARGS__) #define aest_dev_info(__adev, format, ...) 
\ @@ -47,6 +52,20 @@ #define ERXGROUP_16K_ERRGSR_NUM 4 #define ERXGROUP_64K_ERRGSR_NUM 14 +#define ERXFR 0x0 +#define ERXCTLR 0x8 +#define ERXSTATUS 0x10 +#define ERXADDR 0x18 +#define ERXMISC0 0x20 +#define ERXMISC1 0x28 +#define ERXMISC2 0x30 +#define ERXMISC3 0x38 + +struct aest_access { + u64 (*read)(void *base, u32 offset); + void (*write)(void *base, u32 offset, u64 val); +}; + struct aest_record { char *name; int index; @@ -63,6 +82,7 @@ struct aest_record { */ int addressing_mode; struct aest_node *node; + const struct aest_access *access; }; struct aest_group { @@ -159,3 +179,77 @@ static inline int aest_set_name(struct aest_device *adev, return 0; } + +#define CASE_READ(res, x) \ + case (x): { \ + res = read_sysreg_s(SYS_##x##_EL1); \ + break; \ + } + +#define CASE_WRITE(val, x) \ + case (x): { \ + write_sysreg_s((val), SYS_##x##_EL1); \ + break; \ + } + +static inline u64 aest_sysreg_read(void *__unused, u32 offset) +{ + u64 res; + + switch (offset) { + CASE_READ(res, ERXFR) + CASE_READ(res, ERXCTLR) + CASE_READ(res, ERXSTATUS) + CASE_READ(res, ERXADDR) + CASE_READ(res, ERXMISC0) + CASE_READ(res, ERXMISC1) + CASE_READ(res, ERXMISC2) + CASE_READ(res, ERXMISC3) + default : + res = 0; + } + return res; +} + +static inline void aest_sysreg_write(void *base, u32 offset, u64 val) +{ + switch (offset) { + CASE_WRITE(val, ERXFR) + CASE_WRITE(val, ERXCTLR) + CASE_WRITE(val, ERXSTATUS) + CASE_WRITE(val, ERXADDR) + CASE_WRITE(val, ERXMISC0) + CASE_WRITE(val, ERXMISC1) + CASE_WRITE(val, ERXMISC2) + CASE_WRITE(val, ERXMISC3) + default : + return; + } +} + +static inline u64 aest_iomem_read(void *base, u32 offset) +{ + return readq_relaxed(base + offset); +} + +static inline void aest_iomem_write(void *base, u32 offset, u64 val) +{ + writeq_relaxed(val, base + offset); +} + +/* access type is decided by AEST interface type. 
*/ +static const struct aest_access aest_access[] = { + [ACPI_AEST_NODE_SYSTEM_REGISTER] = { + .read = aest_sysreg_read, + .write = aest_sysreg_write, + }, + [ACPI_AEST_NODE_MEMORY_MAPPED] = { + .read = aest_iomem_read, + .write = aest_iomem_write, + }, + [ACPI_AEST_NODE_SINGLE_RECORD_MEMORY_MAPPED] = { + .read = aest_iomem_read, + .write = aest_iomem_write, + }, + { } +}; From 7d9206edf99fc25c88b530b615ae3d600c22cb9d Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:45 +0800 Subject: [PATCH 05/24] FROMLIST: ras: AEST: Probe RAS system architecture version The RAS version of a component can be probed via its ERRDEVARCH register. In cases where a component (e.g., SMMU) does not implement an ERRDEVARCH register, the driver falls back to using the RAS version of the Processing Element (PE). Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-6-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- arch/arm64/include/asm/ras.h | 3 +++ drivers/ras/aest/aest-core.c | 22 ++++++++++++++++++++++ drivers/ras/aest/aest.h | 3 +++ 3 files changed, 28 insertions(+) diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h index b6640b9972bfb..da7c441252feb 100644 --- a/arch/arm64/include/asm/ras.h +++ b/arch/arm64/include/asm/ras.h @@ -4,6 +4,9 @@ #include +/* ERRDEVARCH */ +#define ERRDEVARCH_REV GENMASK(19, 16) + struct ras_ext_regs { u64 err_fr; u64 err_ctlr; diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index f4a5119dc513b..84b2fb8127ffc 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -16,6 +16,27 @@ DEFINE_PER_CPU(struct aest_device, percpu_adev); #undef pr_fmt #define pr_fmt(fmt) "AEST: " fmt +static int get_aest_node_ver(struct aest_node *node) +{ + u64 reg; + void *devarch_base; + + if (node->type == ACPI_AEST_GIC_ERROR_NODE) { + devarch_base = ioremap(node->info->interface_hdr->address + + GIC_ERRDEVARCH, + PAGE_SIZE); + if (!devarch_base) 
+ return 0; + + reg = readl_relaxed(devarch_base); + iounmap(devarch_base); + + return FIELD_GET(ERRDEVARCH_REV, reg); + } + + return FIELD_GET(ID_AA64PFR0_EL1_RAS_MASK, read_cpuid(ID_AA64PFR0_EL1)); +} + static int aest_init_record(struct aest_record *record, int i, struct aest_node *node) { @@ -108,6 +129,7 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, node->adev = adev; node->info = anode; node->type = anode->type; + node->version = get_aest_node_ver(node); node->name = alloc_aest_node_name(node); if (!node->name) return -ENOMEM; diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 31131cce99281..bf0b9a49fdaa2 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -61,6 +61,8 @@ #define ERXMISC2 0x30 #define ERXMISC3 0x38 +#define GIC_ERRDEVARCH 0xFFBC + struct aest_access { u64 (*read)(void *base, u32 offset); void (*write)(void *base, u32 offset, u64 val); @@ -141,6 +143,7 @@ struct aest_node { * error events. */ unsigned long *status_reporting; + int version; const struct aest_group *group; struct aest_device *adev; From 4e2fc91aac4c1982bd7a04dc1083895fae6633ad Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:46 +0800 Subject: [PATCH 06/24] FROMLIST: ras: AEST: Support RAS Common Fault Injection Model Extension Add the fault injection registers described in the Common Fault Injection Model Extension.
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-7-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 15 ++++++++++++++- drivers/ras/aest/aest.h | 10 ++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 84b2fb8127ffc..1218ae51079cf 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -124,7 +124,7 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, struct acpi_aest_node *anode) { int i, ret; - u64 address; + u64 address, flags; node->adev = adev; node->info = anode; @@ -145,6 +145,19 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, return -ENOMEM; } + flags = anode->interface_hdr->flags; + address = node->info->common->fault_inject_register_base; + if ((flags & AEST_XFACE_FLAG_FAULT_INJECT) && address) { + if (address - anode->interface_hdr->address < node->group->size) + node->inj = node->base + + (address - anode->interface_hdr->address); + else { + node->inj = devm_ioremap(adev->dev, address, PAGE_SIZE); + if (!node->inj) + return -ENOMEM; + } + } + ret = aest_node_set_errgsr(adev, node); if (ret) return ret; diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index bf0b9a49fdaa2..505ecd9635bcb 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -60,6 +60,9 @@ #define ERXMISC1 0x28 #define ERXMISC2 0x30 #define ERXMISC3 0x38 +#define ERXPFGF 0x800 +#define ERXPFGCTL 0x808 +#define ERXPFGCDN 0x810 #define GIC_ERRDEVARCH 0xFFBC @@ -120,6 +123,7 @@ struct aest_node { u8 type; void *errgsr; void *base; + void *inj; /* * This bitmap indicates which of the error records within this error @@ -208,6 +212,9 @@ static inline u64 aest_sysreg_read(void *__unused, u32 offset) CASE_READ(res, ERXMISC1) CASE_READ(res, ERXMISC2) CASE_READ(res, ERXMISC3) + CASE_READ(res, ERXPFGF) + CASE_READ(res, ERXPFGCTL) + 
CASE_READ(res, ERXPFGCDN) default : res = 0; } @@ -225,6 +232,9 @@ static inline void aest_sysreg_write(void *base, u32 offset, u64 val) CASE_WRITE(val, ERXMISC1) CASE_WRITE(val, ERXMISC2) CASE_WRITE(val, ERXMISC3) + CASE_WRITE(val, ERXPFGF) + CASE_WRITE(val, ERXPFGCTL) + CASE_WRITE(val, ERXPFGCDN) default : return; } From 1d55662eb8fba5b60cd0e95e09d37e0d52a16761 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:47 +0800 Subject: [PATCH 07/24] FROMLIST: ras: AEST: Support CE threshold of error record The CE threshold defines the number of Correctable Errors (CE) that must occur in a record before triggering an interrupt. Error records support multiple threshold configurations, including 8B, 16B, and 32B. This patch detects the supported threshold settings for error records and sets the default threshold to 1, ensuring an interrupt is generated for every CE occurrence. Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-8-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- arch/arm64/include/asm/ras.h | 41 ++++++++++++++++++++ drivers/ras/aest/aest-core.c | 74 ++++++++++++++++++++++++++++++++++++ drivers/ras/aest/aest.h | 17 +++++++++ include/linux/acpi_aest.h | 3 ++ 4 files changed, 135 insertions(+) diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h index da7c441252feb..6c51d27520c00 100644 --- a/arch/arm64/include/asm/ras.h +++ b/arch/arm64/include/asm/ras.h @@ -4,9 +4,50 @@ #include +/* ERRFR */ +#define ERR_FR_CE GENMASK_ULL(54, 53) +#define ERR_FR_RP BIT(15) +#define ERR_FR_CEC GENMASK_ULL(14, 12) + +#define ERR_FR_RP_SINGLE_COUNTER 0 +#define ERR_FR_RP_DOUBLE_COUNTER 1 + +#define ERR_FR_CEC_0B_COUNTER 0 +#define ERR_FR_CEC_8B_COUNTER BIT(1) +#define ERR_FR_CEC_16B_COUNTER BIT(2) + +/* ERRMISC0 */ + +/* ERRFR.CEC == 0b010, ERRFR.RP == 0 */ +#define ERR_MISC0_8B_OF BIT(39) +#define ERR_MISC0_8B_CEC GENMASK_ULL(38, 32) + +/* ERRFR.CEC == 0b100, ERRFR.RP == 0 */ +#define 
ERR_MISC0_16B_OF BIT(47) +#define ERR_MISC0_16B_CEC GENMASK_ULL(46, 32) + +#define ERR_MISC0_CEC_SHIFT 31 + +#define ERR_8B_CEC_MAX (ERR_MISC0_8B_CEC >> ERR_MISC0_CEC_SHIFT) +#define ERR_16B_CEC_MAX (ERR_MISC0_16B_CEC >> ERR_MISC0_CEC_SHIFT) + +/* ERRFR.CEC == 0b100, ERRFR.RP == 1 */ +#define ERR_MISC0_16B_OFO BIT(63) +#define ERR_MISC0_16B_CECO GENMASK_ULL(62, 48) +#define ERR_MISC0_16B_OFR BIT(47) +#define ERR_MISC0_16B_CECR GENMASK_ULL(46, 32) + /* ERRDEVARCH */ #define ERRDEVARCH_REV GENMASK(19, 16) +enum ras_ce_threshold { + RAS_CE_THRESHOLD_0B, + RAS_CE_THRESHOLD_8B, + RAS_CE_THRESHOLD_16B, + RAS_CE_THRESHOLD_32B, + UNKNOWN, +}; + struct ras_ext_regs { u64 err_fr; u64 err_ctlr; diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 1218ae51079cf..5cfe91a6d72a9 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -16,6 +16,79 @@ DEFINE_PER_CPU(struct aest_device, percpu_adev); #undef pr_fmt #define pr_fmt(fmt) "AEST: " fmt +static enum ras_ce_threshold aest_get_ce_threshold(struct aest_record *record) +{ + u64 err_fr, err_fr_cec, err_fr_rp = -1; + + err_fr = record_read(record, ERXFR); + err_fr_cec = FIELD_GET(ERR_FR_CEC, err_fr); + err_fr_rp = FIELD_GET(ERR_FR_RP, err_fr); + + if (err_fr_cec == ERR_FR_CEC_0B_COUNTER) + return RAS_CE_THRESHOLD_0B; + else if (err_fr_rp == ERR_FR_RP_DOUBLE_COUNTER) + return RAS_CE_THRESHOLD_32B; + else if (err_fr_cec == ERR_FR_CEC_8B_COUNTER) + return RAS_CE_THRESHOLD_8B; + else if (err_fr_cec == ERR_FR_CEC_16B_COUNTER) + return RAS_CE_THRESHOLD_16B; + else + return UNKNOWN; +} + +static const struct ce_threshold_info ce_info[] = { + [RAS_CE_THRESHOLD_0B] = { 0 }, + [RAS_CE_THRESHOLD_8B] = { + .max_count = ERR_8B_CEC_MAX, + .mask = ERR_MISC0_8B_CEC, + .shift = ERR_MISC0_CEC_SHIFT, + }, + [RAS_CE_THRESHOLD_16B] = { + .max_count = ERR_16B_CEC_MAX, + .mask = ERR_MISC0_16B_CEC, + .shift = ERR_MISC0_CEC_SHIFT, + }, +}; + +static void aest_set_ce_threshold(struct aest_record *record) 
+{ + u64 err_misc0; + struct ce_threshold *ce = &record->ce; + const struct ce_threshold_info *info; + + record->threshold_type = aest_get_ce_threshold(record); + + switch (record->threshold_type) { + case RAS_CE_THRESHOLD_0B: + aest_record_dbg(record, "do not support CE threshold!\n"); + return; + case RAS_CE_THRESHOLD_8B: + aest_record_dbg(record, "support 8 bit CE threshold!\n"); + break; + case RAS_CE_THRESHOLD_16B: + aest_record_dbg(record, "support 16 bit CE threshold!\n"); + break; + case RAS_CE_THRESHOLD_32B: + aest_record_dbg(record, "not support 32 bit CE threshold!\n"); + break; + default: + aest_record_dbg(record, "Unknown misc0 ce threshold!\n"); + } + + err_misc0 = record_read(record, ERXMISC0); + info = &ce_info[record->threshold_type]; + ce->info = info; + + // Default CE threshold is 1. + ce->count = info->max_count; + ce->threshold = DEFAULT_CE_THRESHOLD; + ce->reg_val = err_misc0 | info->mask; + + record_write(record, ERXMISC0, ce->reg_val); + aest_record_dbg(record, "CE threshold is %llx, controlled by Kernel", + ce->threshold); +} + static int get_aest_node_ver(struct aest_node *node) { u64 reg; @@ -54,6 +127,7 @@ static int aest_init_record(struct aest_record *record, int i, record->addressing_mode = test_bit(i, node->info->addressing_mode); record->index = i; record->node = node; + aest_set_ce_threshold(record); aest_record_dbg(record, "base: %p, index: %d, address mode: %x\n", record->regs_base, record->index, diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 505ecd9635bcb..85eeed79bcbee 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -9,6 +9,7 @@ #include #define MAX_GSI_PER_NODE 2 +#define DEFAULT_CE_THRESHOLD 1 #define record_read(record, offset) \ record->access->read(record->regs_base, offset) @@ -71,6 +72,19 @@ struct aest_access { void (*write)(void *base, u32 offset, u64 val); }; +struct ce_threshold_info { + const u64 max_count; + const u64 mask; + const u64 shift; +}; + +struct ce_threshold 
{ + const struct ce_threshold_info *info; + u64 count; + u64 threshold; + u64 reg_val; +}; + struct aest_record { char *name; int index; @@ -88,6 +102,9 @@ struct aest_record { int addressing_mode; struct aest_node *node; const struct aest_access *access; + + struct ce_threshold ce; + enum ras_ce_threshold threshold_type; }; struct aest_group { diff --git a/include/linux/acpi_aest.h b/include/linux/acpi_aest.h index 77187ce43d44b..a7898c643896e 100644 --- a/include/linux/acpi_aest.h +++ b/include/linux/acpi_aest.h @@ -13,6 +13,9 @@ /* AEST interrupt */ #define AEST_INTERRUPT_MODE BIT(0) +#define AEST_INTERRUPT_FHI_UE_SUPPORT BIT(0) +#define AEST_INTERRUPT_FHI_UE_NO_SUPPORT BIT(1) + #define AEST_MAX_INTERRUPT_PER_NODE 2 /* AEST interface */ From 5baf7ce2ef31e99f4c17d38acb642573c8697475 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:48 +0800 Subject: [PATCH 08/24] FROMLIST: ras: AEST: Enable and register IRQs The interrupt numbers for certain error records may be explicitly programmed into their configuration register. For PPIs, each core maintains its own copy of the aest_device structure. Given that handling RAS errors entails complex processes such as EDAC and memory_failure, all handling is deferred to and handled within a bottom-half context.
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-9-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- arch/arm64/include/asm/ras.h | 36 +++ drivers/ras/aest/aest-core.c | 531 ++++++++++++++++++++++++++++++++++- drivers/ras/aest/aest.h | 56 ++++ include/linux/acpi_aest.h | 7 + include/linux/ras.h | 8 + 5 files changed, 637 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h index 6c51d27520c00..02cf15278d9ff 100644 --- a/arch/arm64/include/asm/ras.h +++ b/arch/arm64/include/asm/ras.h @@ -2,6 +2,7 @@ #ifndef __ASM_RAS_H #define __ASM_RAS_H +#include #include /* ERRFR */ @@ -37,6 +38,41 @@ #define ERR_MISC0_16B_OFR BIT(47) #define ERR_MISC0_16B_CECR GENMASK_ULL(46, 32) +/* ERRSTATUS */ +#define ERR_STATUS_AV BIT(31) +#define ERR_STATUS_V BIT(30) +#define ERR_STATUS_UE BIT(29) +#define ERR_STATUS_ER BIT(28) +#define ERR_STATUS_OF BIT(27) +#define ERR_STATUS_MV BIT(26) +#define ERR_STATUS_CE (BIT(25) | BIT(24)) +#define ERR_STATUS_DE BIT(23) +#define ERR_STATUS_PN BIT(22) +#define ERR_STATUS_UET (BIT(21) | BIT(20)) +#define ERR_STATUS_CI BIT(19) +#define ERR_STATUS_IERR GENMASK_ULL(15, 8) +#define ERR_STATUS_SERR GENMASK_ULL(7, 0) + +/* These bits are write-one-to-clear */ +#define ERR_STATUS_W1TC \ + (ERR_STATUS_AV | ERR_STATUS_V | ERR_STATUS_UE | ERR_STATUS_ER | \ + ERR_STATUS_OF | ERR_STATUS_MV | ERR_STATUS_CE | ERR_STATUS_DE | \ + ERR_STATUS_PN | ERR_STATUS_UET | ERR_STATUS_CI) + +#define ERR_STATUS_UET_UC 0 +#define ERR_STATUS_UET_UEU 1 +#define ERR_STATUS_UET_UEO 2 +#define ERR_STATUS_UET_UER 3 + +/* ERRADDR */ +#define ERR_ADDR_AI BIT(61) +#define ERR_ADDR_PADDR GENMASK_ULL(55, 0) + +/* ERRCTLR */ +#define ERR_CTLR_CFI BIT(8) +#define ERR_CTLR_FI BIT(3) +#define ERR_CTLR_UI BIT(2) + /* ERRDEVARCH */ #define ERRDEVARCH_REV GENMASK(19, 16) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 5cfe91a6d72a9..5ec0ba38f51b4 100644 ---
a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -5,8 +5,11 @@ * Copyright (c) 2025, Alibaba Group. */ +#include +#include #include #include +#include #include #include "aest.h" @@ -16,6 +19,439 @@ DEFINE_PER_CPU(struct aest_device, percpu_adev); #undef pr_fmt #define pr_fmt(fmt) "AEST: " fmt +/* + * This memory pool is only to be used to save AEST node in AEST irq context. + * There can be 500 AEST node at most. + */ +#define AEST_NODE_ALLOCED_MAX 500 + +#define AEST_LOG_PREFIX_BUFFER 64 + +BLOCKING_NOTIFIER_HEAD(aest_decoder_chain); + +static void aest_print(struct aest_event *event) +{ + static atomic_t seqno = { 0 }; + unsigned int curr_seqno; + char pfx_seq[AEST_LOG_PREFIX_BUFFER]; + int index; + struct ras_ext_regs *regs; + + curr_seqno = atomic_inc_return(&seqno); + snprintf(pfx_seq, sizeof(pfx_seq), "{%u}" HW_ERR, curr_seqno); + pr_info("%sHardware error from AEST %s\n", pfx_seq, event->node_name); + + switch (event->type) { + case ACPI_AEST_PROCESSOR_ERROR_NODE: + pr_err("%s Error from CPU%d\n", pfx_seq, event->id0); + break; + case ACPI_AEST_MEMORY_ERROR_NODE: + pr_err("%s Error from memory at SRAT proximity domain %#x\n", + pfx_seq, event->id0); + break; + case ACPI_AEST_SMMU_ERROR_NODE: + pr_err("%s Error from SMMU IORT node %#x subcomponent %#x\n", + pfx_seq, event->id0, event->id1); + break; + case ACPI_AEST_VENDOR_ERROR_NODE: + pr_err("%s Error from vendor hid %8.8s uid %#x\n", pfx_seq, + event->hid, event->id1); + break; + case ACPI_AEST_GIC_ERROR_NODE: + pr_err("%s Error from GIC type %#x instance %#x\n", pfx_seq, + event->id0, event->id1); + break; + default: + pr_err("%s Unknown AEST node type\n", pfx_seq); + return; + } + + index = event->index; + regs = &event->regs; + + pr_err("%s ERR%dFR: 0x%llx\n", pfx_seq, index, regs->err_fr); + pr_err("%s ERR%dCTRL: 0x%llx\n", pfx_seq, index, regs->err_ctlr); + pr_err("%s ERR%dSTATUS: 0x%llx\n", pfx_seq, index, regs->err_status); + if (regs->err_status & ERR_STATUS_AV) + pr_err("%s 
ERR%dADDR: 0x%llx\n", pfx_seq, index, + regs->err_addr); + + if (regs->err_status & ERR_STATUS_MV) { + pr_err("%s ERR%dMISC0: 0x%llx\n", pfx_seq, index, + regs->err_misc[0]); + pr_err("%s ERR%dMISC1: 0x%llx\n", pfx_seq, index, + regs->err_misc[1]); + pr_err("%s ERR%dMISC2: 0x%llx\n", pfx_seq, index, + regs->err_misc[2]); + pr_err("%s ERR%dMISC3: 0x%llx\n", pfx_seq, index, + regs->err_misc[3]); + } +} + +static void aest_handle_memory_failure(u64 addr) +{ + unsigned long pfn; + + pfn = PHYS_PFN(addr); + + if (!pfn_valid(pfn)) { + pr_warn(HW_ERR "Invalid physical address: %#llx\n", addr); + return; + } + +#ifdef CONFIG_MEMORY_FAILURE + memory_failure(pfn, 0); +#endif +} + +static void init_aest_event(struct aest_event *event, + struct aest_record *record, + struct ras_ext_regs *regs) +{ + struct aest_node *node = record->node; + struct acpi_aest_node *info = node->info; + + event->type = node->type; + event->node_name = node->name; + switch (node->type) { + case ACPI_AEST_PROCESSOR_ERROR_NODE: + if (info->processor->flags & + (ACPI_AEST_PROC_FLAG_SHARED | ACPI_AEST_PROC_FLAG_GLOBAL)) + event->id0 = smp_processor_id(); + else + event->id0 = get_cpu_for_acpi_id( + info->processor->processor_id); + + event->id1 = info->processor->resource_type; + break; + case ACPI_AEST_MEMORY_ERROR_NODE: + event->id0 = info->memory->srat_proximity_domain; + break; + case ACPI_AEST_SMMU_ERROR_NODE: + event->id0 = info->smmu->iort_node_reference; + event->id1 = info->smmu->subcomponent_reference; + break; + case ACPI_AEST_VENDOR_ERROR_NODE: + event->id0 = 0; + event->id1 = info->vendor->acpi_uid; + event->hid = info->vendor->acpi_hid; + break; + case ACPI_AEST_GIC_ERROR_NODE: + event->id0 = info->gic->interface_type; + event->id1 = info->gic->instance_id; + break; + default: + event->id0 = 0; + event->id1 = 0; + } + + memcpy(&event->regs, regs, sizeof(*regs)); + event->index = record->index; + event->addressing_mode = record->addressing_mode; +} + +static int 
aest_node_gen_pool_add(struct aest_device *adev, + struct aest_record *record, + struct ras_ext_regs *regs) +{ + struct aest_event *event; + + if (!adev->pool) + return -EINVAL; + + event = (void *)gen_pool_alloc(adev->pool, sizeof(*event)); + if (!event) + return -ENOMEM; + + init_aest_event(event, record, regs); + llist_add(&event->llnode, &adev->event_list); + + return 0; +} + +static void aest_log(struct aest_record *record, struct ras_ext_regs *regs) +{ + struct aest_device *adev = record->node->adev; + + if (!aest_node_gen_pool_add(adev, record, regs)) + schedule_work(&adev->aest_work); +} + +void aest_register_decode_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&aest_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(aest_register_decode_chain); + +void aest_unregister_decode_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&aest_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(aest_unregister_decode_chain); + +static void aest_node_pool_process(struct work_struct *work) +{ + struct llist_node *head; + struct aest_event *event; + struct aest_device *adev = + container_of(work, struct aest_device, aest_work); + u64 status, addr; + + head = llist_del_all(&adev->event_list); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry(event, head, llnode) { + aest_print(event); + + status = event->regs.err_status; + if (!(event->regs.err_addr & ERR_ADDR_AI) && + (status & (ERR_STATUS_UE | ERR_STATUS_DE))) { + if (event->addressing_mode == AEST_ADDREESS_SPA) + addr = event->regs.err_addr & PHYS_MASK; + aest_handle_memory_failure(addr); + } + + blocking_notifier_call_chain(&aest_decoder_chain, 0, event); + gen_pool_free(adev->pool, (unsigned long)event, sizeof(*event)); + } +} + +static int aest_node_pool_init(struct aest_device *adev) +{ + unsigned long addr, size; + + size = ilog2(sizeof(struct aest_event)); + adev->pool = + devm_gen_pool_create(adev->dev, size, -1, dev_name(adev->dev)); + if 
(!adev->pool) + return -ENOMEM; + + size = PAGE_ALIGN(size * AEST_NODE_ALLOCED_MAX); + addr = (unsigned long)devm_kzalloc(adev->dev, size, GFP_KERNEL); + if (!addr) + return -ENOMEM; + + return gen_pool_add(adev->pool, addr, size, -1); +} + +static void aest_panic(struct aest_record *record, struct ras_ext_regs *regs, + char *msg) +{ + struct aest_event event = { 0 }; + + init_aest_event(&event, record, regs); + + aest_print(&event); + + panic(msg); +} + +static void aest_proc_record(struct aest_record *record, void *data) +{ + struct ras_ext_regs regs = { 0 }; + int *count = data; + u64 ue; + + regs.err_status = record_read(record, ERXSTATUS); + if (!(regs.err_status & ERR_STATUS_V)) + return; + + (*count)++; + + if (regs.err_status & ERR_STATUS_AV) + regs.err_addr = record_read(record, ERXADDR); + + regs.err_fr = record_read(record, ERXFR); + regs.err_ctlr = record_read(record, ERXCTLR); + + if (regs.err_status & ERR_STATUS_MV) { + regs.err_misc[0] = record_read(record, ERXMISC0); + regs.err_misc[1] = record_read(record, ERXMISC1); + if (record->node->version >= ID_AA64PFR0_EL1_RAS_V1P1) { + regs.err_misc[2] = record_read(record, ERXMISC2); + regs.err_misc[3] = record_read(record, ERXMISC3); + } + + if (record->node->info->interface_hdr->flags & + AEST_XFACE_FLAG_CLEAR_MISC) { + record_write(record, ERXMISC0, 0); + record_write(record, ERXMISC1, 0); + if (record->node->version >= ID_AA64PFR0_EL1_RAS_V1P1) { + record_write(record, ERXMISC2, 0); + record_write(record, ERXMISC3, 0); + } + /* ce count is 0 if record do not support ce */ + } else if (record->ce.count > 0) + record_write(record, ERXMISC0, record->ce.reg_val); + } + + /* panic if unrecoverable and uncontainable error encountered */ + ue = FIELD_GET(ERR_STATUS_UET, regs.err_status); + if ((regs.err_status & ERR_STATUS_UE) && + (ue == ERR_STATUS_UET_UC || ue == ERR_STATUS_UET_UEU)) + aest_panic(record, &regs, + "AEST: unrecoverable error encountered"); + + aest_log(record, &regs); + + /* Write-one-to-clear the
bits we've seen */ + regs.err_status &= ERR_STATUS_W1TC; + + /* A multi-bit field must be written with all-ones to clear it. */ + if (regs.err_status & ERR_STATUS_CE) + regs.err_status |= ERR_STATUS_CE; + + /* A multi-bit field must be written with all-ones to clear it. */ + if (regs.err_status & ERR_STATUS_UET) + regs.err_status |= ERR_STATUS_UET; + + record_write(record, ERXSTATUS, regs.err_status); +} + +static void aest_node_foreach_record(void (*func)(struct aest_record *, void *), + struct aest_node *node, void *data, + unsigned long *bitmap) +{ + int i; + + for_each_clear_bit(i, bitmap, node->record_count) { + aest_select_record(node, i); + + func(&node->records[i], data); + + aest_sync(node); + } +} + +static int aest_proc(struct aest_node *node) +{ + int count = 0, i, j, size = node->record_count; + u64 err_group = 0; + + aest_node_dbg(node, "Poll bitmap %*pb\n", size, + node->record_implemented); + aest_node_foreach_record(aest_proc_record, node, &count, + node->record_implemented); + + if (!node->errgsr) + return count; + + aest_node_dbg(node, "Report bitmap %*pb\n", size, + node->status_reporting); + for (i = 0; i < BITS_TO_U64(size); i++) { + err_group = readq_relaxed((void *)node->errgsr + i * 8); + aest_node_dbg(node, "errgsr[%d]: 0x%llx\n", i, err_group); + + for_each_set_bit(j, (unsigned long *)&err_group, + BITS_PER_LONG) { + /* + * Error group base is only valid in Memory Map node, + * so driver do not need to write select register and + * sync.
+ */ + if (test_bit(i * BITS_PER_LONG + j, + node->status_reporting)) + continue; + aest_proc_record(&node->records[j], &count); + } + } + + return count; +} + +static irqreturn_t aest_irq_func(int irq, void *input) +{ + struct aest_device *adev = input; + int i; + + for (i = 0; i < adev->node_cnt; i++) + aest_proc(&adev->nodes[i]); + + return IRQ_HANDLED; +} + +static int aest_register_irq(struct aest_device *adev) +{ + int i, irq, ret; + char *irq_desc; + + irq_desc = devm_kasprintf(adev->dev, GFP_KERNEL, "%s.%s.", + dev_driver_string(adev->dev), + dev_name(adev->dev)); + if (!irq_desc) + return -ENOMEM; + + for (i = 0; i < MAX_GSI_PER_NODE; i++) { + irq = adev->irq[i]; + + if (!irq) + continue; + + if (irq_is_percpu_devid(irq)) { + ret = request_percpu_irq(irq, aest_irq_func, irq_desc, + adev->adev_oncore); + if (ret) + goto free; + } else { + ret = devm_request_irq(adev->dev, irq, aest_irq_func, 0, + irq_desc, adev); + if (ret) + return ret; + } + } + return 0; + +free: + for (; i >= 0; i--) { + irq = adev->irq[i]; + + if (irq_is_percpu_devid(irq)) + free_percpu_irq(irq, adev->adev_oncore); + } + + return ret; +} + +static void aest_enable_irq(struct aest_record *record) +{ + u64 err_ctlr; + struct aest_device *adev = record->node->adev; + + err_ctlr = record_read(record, ERXCTLR); + + if (adev->irq[ACPI_AEST_NODE_FAULT_HANDLING]) + err_ctlr |= (ERR_CTLR_FI | ERR_CTLR_CFI); + if (adev->irq[ACPI_AEST_NODE_ERROR_RECOVERY]) + err_ctlr |= ERR_CTLR_UI; + + record_write(record, ERXCTLR, err_ctlr); +} + +static void aest_config_irq(struct aest_node *node) +{ + int i; + struct acpi_aest_node_interrupt_v2 *interrupt; + + if (!node->irq_config) + return; + + for (i = 0; i < node->info->interrupt_count; i++) { + interrupt = &node->info->interrupt[i]; + + if (interrupt->type == ACPI_AEST_NODE_FAULT_HANDLING) + writeq_relaxed(interrupt->gsiv, node->irq_config); + + if (interrupt->type == ACPI_AEST_NODE_ERROR_RECOVERY) + writeq_relaxed(interrupt->gsiv, node->irq_config + 8); 
+ + aest_node_dbg(node, "config irq type %d gsiv %d at %llx", + interrupt->type, interrupt->gsiv, + (u64)node->irq_config); + } +} + static enum ras_ce_threshold aest_get_ce_threshold(struct aest_record *record) { u64 err_fr, err_fr_cec, err_fr_rp = -1; @@ -128,6 +564,7 @@ static int aest_init_record(struct aest_record *record, int i, record->index = i; record->node = node; aest_set_ce_threshold(record); + aest_enable_irq(record); aest_record_dbg(record, "base: %p, index: %d, address mode: %x\n", record->regs_base, record->index, @@ -232,6 +669,21 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, } } + address = node->info->common->interrupt_config_register_base; + if ((flags & AEST_XFACE_FLAG_INT_CONFIG) && address) { + if (address - anode->interface_hdr->address < node->group->size) + node->irq_config = + node->base + + (address - anode->interface_hdr->address); + else { + node->irq_config = + devm_ioremap(adev->dev, address, PAGE_SIZE); + if (!node->irq_config) + return -ENOMEM; + } + } + aest_config_irq(node); + ret = aest_node_set_errgsr(adev, node); if (ret) return ret; @@ -276,6 +728,66 @@ static int aest_init_nodes(struct aest_device *adev, struct aest_hnode *ahnode) return 0; } +static int __setup_ppi(struct aest_device *adev) +{ + int cpu, i; + struct aest_device *oncore_adev; + struct aest_node *oncore_node; + size_t size; + + adev->adev_oncore = &percpu_adev; + for_each_possible_cpu(cpu) { + oncore_adev = per_cpu_ptr(&percpu_adev, cpu); + memcpy(oncore_adev, adev, sizeof(struct aest_device)); + + oncore_adev->nodes = + devm_kcalloc(adev->dev, oncore_adev->node_cnt, + sizeof(struct aest_node), GFP_KERNEL); + if (!oncore_adev->nodes) + return -ENOMEM; + + size = adev->node_cnt * sizeof(struct aest_node); + memcpy(oncore_adev->nodes, adev->nodes, size); + for (i = 0; i < oncore_adev->node_cnt; i++) { + oncore_node = &oncore_adev->nodes[i]; + oncore_node->records = devm_kcalloc( + adev->dev, oncore_node->record_count, + 
sizeof(struct aest_record), GFP_KERNEL); + if (!oncore_node->records) + return -ENOMEM; + + size = oncore_node->record_count * + sizeof(struct aest_record); + memcpy(oncore_node->records, adev->nodes[i].records, + size); + } + + aest_dev_dbg(adev, "Init device on CPU%d.\n", cpu); + } + + return 0; +} + +static int aest_setup_irq(struct platform_device *pdev, + struct aest_device *adev) +{ + int fhi_irq, eri_irq; + + fhi_irq = platform_get_irq_byname_optional(pdev, AEST_FHI_NAME); + if (fhi_irq > 0) + adev->irq[0] = fhi_irq; + + eri_irq = platform_get_irq_byname_optional(pdev, AEST_ERI_NAME); + if (eri_irq > 0) + adev->irq[1] = eri_irq; + + /* Allocate and initialise the percpu device pointer for PPI */ + if (irq_is_percpu(fhi_irq) || irq_is_percpu(eri_irq)) + return __setup_ppi(adev); + + return 0; +} + static int aest_device_probe(struct platform_device *pdev) { int ret; @@ -289,14 +801,31 @@ static int aest_device_probe(struct platform_device *pdev) adev = devm_kzalloc(&pdev->dev, sizeof(*adev), GFP_KERNEL); if (!adev) return -ENOMEM; - adev->dev = &pdev->dev; adev->id = pdev->id; aest_set_name(adev, ahnode); + + INIT_WORK(&adev->aest_work, aest_node_pool_process); + ret = aest_node_pool_init(adev); + if (ret) { + aest_dev_err(adev, "Failed init aest node pool.\n"); + return ret; + } + init_llist_head(&adev->event_list); + ret = aest_init_nodes(adev, ahnode); if (ret) return ret; + ret = aest_setup_irq(pdev, adev); + if (ret) + return ret; + + ret = aest_register_irq(adev); + if (ret) { + aest_dev_err(adev, "register irq failed\n"); + return ret; + } platform_set_drvdata(pdev, adev); aest_dev_dbg(adev, "Node cnt: %x, id: %x\n", adev->node_cnt, adev->id); diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 85eeed79bcbee..a5e43b2a2e906 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -67,6 +67,34 @@ #define GIC_ERRDEVARCH 0xFFBC +struct aest_event { + struct llist_node llnode; + char *node_name; + u32 type; + /* + * Different 
nodes have different meanings: + * - Processor node : processor number. + * - Memory node : SRAT proximity domain. + * - SMMU node : IORT node reference. + * - GIC node : interface type. + */ + u32 id0; + /* + * Different nodes have different meanings: + * - Processor node : processor resource type. + * - Memory node : None. + * - SMMU node : subcomponent reference. + * - Vendor node : Unique ID. + * - GIC node : instance identifier. + */ + u32 id1; + /* Vendor node : hardware ID. */ + char *hid; + u32 index; + int addressing_mode; + struct ras_ext_regs regs; +}; + struct aest_access { u64 (*read)(void *base, u32 offset); void (*write)(void *base, u32 offset, u64 val); @@ -141,6 +169,7 @@ struct aest_node { void *errgsr; void *base; void *inj; + void *irq_config; /* * This bitmap indicates which of the error records within this error @@ -172,6 +201,7 @@ struct aest_node { int record_count; struct aest_record *records; + struct aest_node __percpu *oncore_node; }; struct aest_device { @@ -180,6 +210,12 @@ struct aest_device { int node_cnt; struct aest_node *nodes; u32 id; + int irq[MAX_GSI_PER_NODE]; + + struct work_struct aest_work; + struct gen_pool *pool; + struct llist_head event_list; + struct aest_device __percpu *adev_oncore; }; static const char *const aest_node_name[] = { @@ -283,3 +319,23 @@ static const struct aest_access aest_access[] = { }, { } }; + +/* + * Each PE may have multiple error records; one must be selected + * before it can be accessed through the Error Record System + * registers. + */ +static inline void aest_select_record(struct aest_node *node, int index) +{ + if (node->type == ACPI_AEST_PROCESSOR_ERROR_NODE) { + write_sysreg_s(index, SYS_ERRSELR_EL1); + isb(); + } +} + +/* Ensure all writes have taken effect.
*/ +static inline void aest_sync(struct aest_node *node) +{ + if (node->type == ACPI_AEST_PROCESSOR_ERROR_NODE) + isb(); +} diff --git a/include/linux/acpi_aest.h b/include/linux/acpi_aest.h index a7898c643896e..3a899f57f92fb 100644 --- a/include/linux/acpi_aest.h +++ b/include/linux/acpi_aest.h @@ -10,6 +10,13 @@ #define AEST_FHI_NAME "AEST:FHI" #define AEST_ERI_NAME "AEST:ERI" +/* AEST component */ +#define ACPI_AEST_PROC_FLAG_GLOBAL (1<<0) +#define ACPI_AEST_PROC_FLAG_SHARED (1<<1) + +#define AEST_ADDREESS_SPA 0 +#define AEST_ADDREESS_LA 1 + /* AEST interrupt */ #define AEST_INTERRUPT_MODE BIT(0) diff --git a/include/linux/ras.h b/include/linux/ras.h index 468941bfe855f..05096f049dacb 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -63,4 +63,12 @@ amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #define GET_LOGICAL_INDEX(mpidr) -EINVAL #endif /* CONFIG_ARM || CONFIG_ARM64 */ +#if IS_ENABLED(CONFIG_AEST) +void aest_register_decode_chain(struct notifier_block *nb); +void aest_unregister_decode_chain(struct notifier_block *nb); +#else +static inline void aest_register_decode_chain(struct notifier_block *nb) {} +static inline void aest_unregister_decode_chain(struct notifier_block *nb) {} +#endif /* CONFIG_AEST */ + #endif /* __RAS_H__ */ From af52959f6b8672a4151094c3d407b3c217b13c6f Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:49 +0800 Subject: [PATCH 09/24] FROMLIST: ras: AEST: Add cpuhp callback Move the configuration of interrupts and CE thresholds into the CPU hotplug callbacks for the per-CPU AEST node. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-10-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 118 ++++++++++++++++++++++++++++++++++- drivers/ras/aest/aest.h | 5 ++ include/linux/cpuhotplug.h | 1 + 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 5ec0ba38f51b4..686dde6f2e680 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -5,6 +5,7 @@ * Copyright (c) 2025, Alibaba Group. */ +#include #include #include #include @@ -563,8 +564,6 @@ static int aest_init_record(struct aest_record *record, int i, record->addressing_mode = test_bit(i, node->info->addressing_mode); record->index = i; record->node = node; - aest_set_ce_threshold(record); - aest_enable_irq(record); aest_record_dbg(record, "base: %p, index: %d, address mode: %x\n", record->regs_base, record->index, @@ -572,9 +571,113 @@ static int aest_init_record(struct aest_record *record, int i, return 0; } +static void aest_online_record(struct aest_record *record, void *data) +{ + if (record_read(record, ERXFR) & ERR_FR_CE) + aest_set_ce_threshold(record); + + aest_enable_irq(record); +} + +static void aest_online_oncore_node(struct aest_node *node) +{ + int count; + + count = aest_proc(node); + aest_node_dbg(node, "Find %d error on CPU%d before AEST probe\n", count, + smp_processor_id()); + + aest_node_foreach_record(aest_online_record, node, NULL, + node->record_implemented); + + aest_node_foreach_record(aest_online_record, node, NULL, + node->status_reporting); +} + +static void aest_online_oncore_dev(void *data) +{ + int fhi_irq, eri_irq, i; + struct aest_device *adev = this_cpu_ptr(data); + + for (i = 0; i < adev->node_cnt; i++) + aest_online_oncore_node(&adev->nodes[i]); + + fhi_irq = adev->irq[ACPI_AEST_NODE_FAULT_HANDLING]; + if (fhi_irq > 0) + enable_percpu_irq(fhi_irq, IRQ_TYPE_NONE); + eri_irq = 
adev->irq[ACPI_AEST_NODE_ERROR_RECOVERY]; + if (eri_irq > 0) + enable_percpu_irq(eri_irq, IRQ_TYPE_NONE); +} + +static void aest_offline_oncore_dev(void *data) +{ + int fhi_irq, eri_irq; + struct aest_device *adev = this_cpu_ptr(data); + + fhi_irq = adev->irq[ACPI_AEST_NODE_FAULT_HANDLING]; + if (fhi_irq > 0) + disable_percpu_irq(fhi_irq); + eri_irq = adev->irq[ACPI_AEST_NODE_ERROR_RECOVERY]; + if (eri_irq > 0) + disable_percpu_irq(eri_irq); +} + +static void aest_online_dev(struct aest_device *adev) +{ + int count, i; + struct aest_node *node; + + for (i = 0; i < adev->node_cnt; i++) { + node = &adev->nodes[i]; + + if (!node->name) + continue; + + count = aest_proc(node); + aest_node_dbg(node, "Find %d error before AEST probe\n", count); + + aest_config_irq(node); + + aest_node_foreach_record(aest_online_record, node, NULL, + node->record_implemented); + aest_node_foreach_record(aest_online_record, node, NULL, + node->status_reporting); + } +} + +static int aest_starting_cpu(unsigned int cpu) +{ + pr_debug("CPU%d starting\n", cpu); + aest_online_oncore_dev(&percpu_adev); + + return 0; +} + +static int aest_dying_cpu(unsigned int cpu) +{ + pr_debug("CPU%d dying\n", cpu); + aest_offline_oncore_dev(&percpu_adev); + + return 0; +} + static void aest_device_remove(struct platform_device *pdev) { + struct aest_device *adev = platform_get_drvdata(pdev); + int i; + platform_set_drvdata(pdev, NULL); + + if (adev->type != ACPI_AEST_PROCESSOR_ERROR_NODE) + return; + + on_each_cpu(aest_offline_oncore_dev, adev->adev_oncore, 1); + + for (i = 0; i < MAX_GSI_PER_NODE; i++) { + if (adev->irq[i]) + free_percpu_irq(adev->irq[i], adev->adev_oncore); + } } static char *alloc_aest_node_name(struct aest_node *node) @@ -682,7 +785,6 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, return -ENOMEM; } } - aest_config_irq(node); ret = aest_node_set_errgsr(adev, node); if (ret) @@ -826,6 +928,16 @@ static int aest_device_probe(struct platform_device *pdev) 
aest_dev_err(adev, "register irq failed\n"); return ret; } + + if (aest_dev_is_oncore(adev)) + ret = cpuhp_setup_state(CPUHP_AP_ARM_AEST_STARTING, + "drivers/acpi/arm64/aest:starting", + aest_starting_cpu, aest_dying_cpu); + else + aest_online_dev(adev); + if (ret) + return ret; + platform_set_drvdata(pdev, adev); aest_dev_dbg(adev, "Node cnt: %x, id: %x\n", adev->node_cnt, adev->id); diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index a5e43b2a2e906..f85e81ff35a6e 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -339,3 +339,8 @@ static inline void aest_sync(struct aest_node *node) if (node->type == ACPI_AEST_PROCESSOR_ERROR_NODE) isb(); } + +static inline bool aest_dev_is_oncore(struct aest_device *adev) +{ + return adev->type == ACPI_AEST_PROCESSOR_ERROR_NODE; +} diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 22ba327ec2278..e7b553241b305 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -178,6 +178,7 @@ enum cpuhp_state { CPUHP_AP_HYPERV_TIMER_STARTING, /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, + CPUHP_AP_ARM_AEST_STARTING, CPUHP_AP_ARM_XEN_STARTING, CPUHP_AP_ARM_XEN_RUNSTATE_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, From 04d6b011eb7e29b09a35d8814b682dba5fdc7cd1 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:50 +0800 Subject: [PATCH 10/24] FROMLIST: ras: AEST: Introduce AEST driver sysfs interface Exposes certain AEST driver information to userspace. Only ROOT can access these interface because it includes hardware-sensitive information: ls /sys/kernel/debug/aest/ memory smmu ... ls /sys/kernel/debug/aest/memory/ record0 record1 ... 
All details at: Documentation/ABI/testing/debugfs-aest Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-11-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- Documentation/ABI/testing/debugfs-aest | 32 +++++++ MAINTAINERS | 1 + drivers/ras/aest/Makefile | 1 + drivers/ras/aest/aest-core.c | 13 +++ drivers/ras/aest/aest-sysfs.c | 118 +++++++++++++++++++++++++ drivers/ras/aest/aest.h | 8 ++ 6 files changed, 173 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-aest create mode 100644 drivers/ras/aest/aest-sysfs.c diff --git a/Documentation/ABI/testing/debugfs-aest b/Documentation/ABI/testing/debugfs-aest new file mode 100644 index 0000000000000..8bacc6bb20b6d --- /dev/null +++ b/Documentation/ABI/testing/debugfs-aest @@ -0,0 +1,32 @@ +What: /sys/kernel/debug/aest/./ +Date: Dec 2025 +KernelVersion: 6.19 +Contact: Ruidong Tian +Description: + Directory represented a AEST device, means device type, + like: + + - processor + - memory + - smmu + - ... + + is the unique ID for this device. + +What: /sys/kernel/debug/aest/.//* +Date: Dec 2025 +KernelVersion: 6.19 +Contact: Ruidong Tian +Description: + Attibute for aest node which belong this device, the format + of node name is: - + + See more at: + https://developer.arm.com/documentation/den0085/latest/ + +What: /sys/kernel/debug/aest/.//record/err_* +Date: Dec 2025 +KernelVersion: 6.19 +Contact: Ruidong Tian +Description: + (RO) Read err_* register and return val. 
diff --git a/MAINTAINERS b/MAINTAINERS index b272af3770dac..d8993970d0f4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -338,6 +338,7 @@ M: Ruidong Tian L: linux-acpi@vger.kernel.org L: linux-arm-kernel@lists.infradead.org S: Supported +F: Documentation/ABI/testing/debugfs-aest F: arch/arm64/include/asm/ras.h F: drivers/acpi/arm64/aest.c F: drivers/ras/aest/ diff --git a/drivers/ras/aest/Makefile b/drivers/ras/aest/Makefile index a6ba7e36fb432..75495413d2b6e 100644 --- a/drivers/ras/aest/Makefile +++ b/drivers/ras/aest/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_AEST) += aest.o aest-y := aest-core.o +aest-y += aest-sysfs.o diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 686dde6f2e680..3bcc635cf8e4d 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -20,6 +20,9 @@ DEFINE_PER_CPU(struct aest_device, percpu_adev); #undef pr_fmt #define pr_fmt(fmt) "AEST: " fmt +#ifdef CONFIG_DEBUG_FS +struct dentry *aest_debugfs; +#endif /* * This memory pool is only to be used to save AEST node in AEST irq context. * There can be 500 AEST node at most. 
@@ -940,6 +943,8 @@ static int aest_device_probe(struct platform_device *pdev) platform_set_drvdata(pdev, adev); + aest_dev_init_debugfs(adev); + aest_dev_dbg(adev, "Node cnt: %x, id: %x\n", adev->node_cnt, adev->id); return 0; @@ -955,12 +960,20 @@ static struct platform_driver aest_driver = { static int __init aest_init(void) { +#ifdef CONFIG_DEBUG_FS + aest_debugfs = debugfs_create_dir("aest", NULL); +#endif + return platform_driver_register(&aest_driver); } module_init(aest_init); static void __exit aest_exit(void) { +#ifdef CONFIG_DEBUG_FS + debugfs_remove(aest_debugfs); +#endif + platform_driver_unregister(&aest_driver); } module_exit(aest_exit); diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c new file mode 100644 index 0000000000000..f3b5427ff4f0f --- /dev/null +++ b/drivers/ras/aest/aest-sysfs.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Error Source Table Support + * + * Copyright (c) 2025, Alibaba Group. + */ + +#include "aest.h" + +/******************************************************************************* + * + * Attribute for AEST record + * + ******************************************************************************/ + +#define DEFINE_AEST_DEBUGFS_ATTR(name, offset) \ +static int name##_get(void *data, u64 *val) \ +{ \ + struct aest_record *record = data; \ + *val = record_read(record, offset); \ + return 0; \ +} \ +static int name##_set(void *data, u64 val) \ +{ \ + struct aest_record *record = data; \ + record_write(record, offset, val); \ + return 0; \ +} \ +DEFINE_DEBUGFS_ATTRIBUTE(name##_ops, name##_get, name##_set, "%#llx\n") + +DEFINE_AEST_DEBUGFS_ATTR(err_fr, ERXFR); +DEFINE_AEST_DEBUGFS_ATTR(err_ctrl, ERXCTLR); +DEFINE_AEST_DEBUGFS_ATTR(err_status, ERXSTATUS); +DEFINE_AEST_DEBUGFS_ATTR(err_addr, ERXADDR); +DEFINE_AEST_DEBUGFS_ATTR(err_misc0, ERXMISC0); +DEFINE_AEST_DEBUGFS_ATTR(err_misc1, ERXMISC1); +DEFINE_AEST_DEBUGFS_ATTR(err_misc2, ERXMISC2); +DEFINE_AEST_DEBUGFS_ATTR(err_misc3, 
ERXMISC3); + +static void aest_record_init_debugfs(struct aest_record *record) +{ + debugfs_create_file("err_fr", 0600, record->debugfs, record, + &err_fr_ops); + debugfs_create_file("err_ctrl", 0600, record->debugfs, record, + &err_ctrl_ops); + debugfs_create_file("err_status", 0600, record->debugfs, record, + &err_status_ops); + debugfs_create_file("err_addr", 0600, record->debugfs, record, + &err_addr_ops); + debugfs_create_file("err_misc0", 0600, record->debugfs, record, + &err_misc0_ops); + debugfs_create_file("err_misc1", 0600, record->debugfs, record, + &err_misc1_ops); + debugfs_create_file("err_misc2", 0600, record->debugfs, record, + &err_misc2_ops); + debugfs_create_file("err_misc3", 0600, record->debugfs, record, + &err_misc3_ops); +} + +static void +aest_node_init_debugfs(struct aest_node *node) +{ + int i; + struct aest_record *record; + + for (i = 0; i < node->record_count; i++) { + record = &node->records[i]; + if (!record->name) + continue; + record->debugfs = debugfs_create_dir(record->name, + node->debugfs); + aest_record_init_debugfs(record); + } +} + +static void +aest_oncore_dev_init_debugfs(struct aest_device *adev) +{ + int cpu, i; + struct aest_node *node; + struct aest_device *percpu_dev; + char name[16]; + + for_each_possible_cpu(cpu) { + percpu_dev = this_cpu_ptr(adev->adev_oncore); + + snprintf(name, sizeof(name), "processor%u", cpu); + percpu_dev->debugfs = debugfs_create_dir(name, aest_debugfs); + + for (i = 0; i < adev->node_cnt; i++) { + node = &adev->nodes[i]; + + node->debugfs = debugfs_create_dir(node->name, + percpu_dev->debugfs); + aest_node_init_debugfs(node); + } + } +} + +void aest_dev_init_debugfs(struct aest_device *adev) +{ + int i; + struct aest_node *node; + + adev->debugfs = debugfs_create_dir(dev_name(adev->dev), aest_debugfs); + if (aest_dev_is_oncore(adev)) { + aest_oncore_dev_init_debugfs(adev); + return; + } + + for (i = 0; i < adev->node_cnt; i++) { + node = &adev->nodes[i]; + if (!node->name) + continue; + 
node->debugfs = debugfs_create_dir(node->name, adev->debugfs); + aest_node_init_debugfs(node); + } +} diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index f85e81ff35a6e..ceb9e32bcee3c 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -7,6 +7,7 @@ #include #include +#include #define MAX_GSI_PER_NODE 2 #define DEFAULT_CE_THRESHOLD 1 @@ -67,6 +68,8 @@ #define GIC_ERRDEVARCH 0xFFBC +extern struct dentry *aest_debugfs; + struct aest_event { struct llist_node llnode; char *node_name; @@ -133,6 +136,7 @@ struct aest_record { struct ce_threshold ce; enum ras_ce_threshold threshold_type; + struct dentry *debugfs; }; struct aest_group { @@ -201,6 +205,7 @@ struct aest_node { int record_count; struct aest_record *records; + struct dentry *debugfs; struct aest_node __percpu *oncore_node; }; @@ -215,6 +220,7 @@ struct aest_device { struct work_struct aest_work; struct gen_pool *pool; struct llist_head event_list; + struct dentry *debugfs; struct aest_device __percpu *adev_oncore; }; @@ -344,3 +350,5 @@ static inline bool aest_dev_is_oncore(struct aest_device *adev) { return adev->type == ACPI_AEST_PROCESSOR_ERROR_NODE; } + +void aest_dev_init_debugfs(struct aest_device *adev); From c777a202e7f4e613453b5b1b14d731f2ed74488c Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:51 +0800 Subject: [PATCH 11/24] FROMLIST: ras: AEST: Add error count tracking and debugfs interface This commit introduces error counting functionality for AEST records. Previously, error statistics were not directly available for individual error records or AEST nodes. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-12-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- Documentation/ABI/testing/debugfs-aest | 14 ++++++ drivers/ras/aest/aest-core.c | 21 +++++++++ drivers/ras/aest/aest-sysfs.c | 64 ++++++++++++++++++++++++++ drivers/ras/aest/aest.h | 10 ++++ 4 files changed, 109 insertions(+) diff --git a/Documentation/ABI/testing/debugfs-aest b/Documentation/ABI/testing/debugfs-aest index 8bacc6bb20b6d..295df9e9b4558 100644 --- a/Documentation/ABI/testing/debugfs-aest +++ b/Documentation/ABI/testing/debugfs-aest @@ -24,9 +24,23 @@ Description: See more at: https://developer.arm.com/documentation/den0085/latest/ +What: /sys/kernel/debug/aest/.//err_count +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (RO) Outputs error statistics for all error records of this node. + What: /sys/kernel/debug/aest/.//record/err_* Date: Dec 2025 KernelVersion: 6.19 Contact: Ruidong Tian Description: (RO) Read err_* register and return val. + +What: /sys/kernel/debug/aest/.//record/err_count +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (RO) Outputs error statistics for all this records. 
diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 3bcc635cf8e4d..75cca98024ad7 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -170,6 +170,27 @@ static int aest_node_gen_pool_add(struct aest_device *adev, init_aest_event(event, record, regs); llist_add(&event->llnode, &adev->event_list); + if (regs->err_status & ERR_STATUS_CE) + record->count.ce++; + if (regs->err_status & ERR_STATUS_DE) + record->count.de++; + if (regs->err_status & ERR_STATUS_UE) { + switch (regs->err_status & ERR_STATUS_UET) { + case ERR_STATUS_UET_UC: + record->count.uc++; + break; + case ERR_STATUS_UET_UEU: + record->count.ueu++; + break; + case ERR_STATUS_UET_UER: + record->count.uer++; + break; + case ERR_STATUS_UET_UEO: + record->count.ueo++; + break; + } + } + return 0; } diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c index f3b5427ff4f0f..b54e879506aa9 100644 --- a/drivers/ras/aest/aest-sysfs.c +++ b/drivers/ras/aest/aest-sysfs.c @@ -7,6 +7,46 @@ #include "aest.h" +static void +aest_error_count(struct aest_record *record, void *data) +{ + struct record_count *count = data; + + count->ce += record->count.ce; + count->de += record->count.de; + count->uc += record->count.uc; + count->ueu += record->count.ueu; + count->uer += record->count.uer; + count->ueo += record->count.ueo; +} + +/******************************************************************************* + * + * Debugfs for AEST node + * + ******************************************************************************/ + +static int aest_node_err_count_show(struct seq_file *m, void *data) +{ + struct aest_node *node = m->private; + struct record_count count = { 0 }; + int i; + + for (i = 0; i < node->record_count; i++) + aest_error_count(&node->records[i], &count); + + seq_printf(m, "CE: %llu\n" + "DE: %llu\n" + "UC: %llu\n" + "UEU: %llu\n" + "UEO: %llu\n" + "UER: %llu\n", + count.ce, count.de, count.uc, count.ueu, + count.uer, count.ueo); + 
return 0; +} +DEFINE_SHOW_ATTRIBUTE(aest_node_err_count); + /******************************************************************************* * * Attribute for AEST record @@ -37,6 +77,25 @@ DEFINE_AEST_DEBUGFS_ATTR(err_misc1, ERXMISC1); DEFINE_AEST_DEBUGFS_ATTR(err_misc2, ERXMISC2); DEFINE_AEST_DEBUGFS_ATTR(err_misc3, ERXMISC3); +static int aest_record_err_count_show(struct seq_file *m, void *data) +{ + struct aest_record *record = m->private; + struct record_count count = { 0 }; + + aest_error_count(record, &count); + + seq_printf(m, "CE: %llu\n" + "DE: %llu\n" + "UC: %llu\n" + "UEU: %llu\n" + "UEO: %llu\n" + "UER: %llu\n", + count.ce, count.de, count.uc, count.ueu, + count.uer, count.ueo); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(aest_record_err_count); + static void aest_record_init_debugfs(struct aest_record *record) { debugfs_create_file("err_fr", 0600, record->debugfs, record, @@ -55,6 +114,8 @@ static void aest_record_init_debugfs(struct aest_record *record) &err_misc2_ops); debugfs_create_file("err_misc3", 0600, record->debugfs, record, &err_misc3_ops); + debugfs_create_file("err_count", 0400, record->debugfs, record, + &aest_record_err_count_fops); } static void @@ -63,6 +124,9 @@ aest_node_init_debugfs(struct aest_node *node) int i; struct aest_record *record; + debugfs_create_file("err_count", 0400, node->debugfs, node, + &aest_node_err_count_fops); + for (i = 0; i < node->record_count; i++) { record = &node->records[i]; if (!record->name) diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index ceb9e32bcee3c..802430857dc49 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -116,6 +116,15 @@ struct ce_threshold { u64 reg_val; }; +struct record_count { + u64 ce; + u64 de; + u64 uc; + u64 uer; + u64 ueo; + u64 ueu; +}; + struct aest_record { char *name; int index; @@ -136,6 +145,7 @@ struct aest_record { struct ce_threshold ce; enum ras_ce_threshold threshold_type; + struct record_count count; struct dentry *debugfs; }; From 
aa5ffba3de060d266ea812f7c7d03407b850eb4e Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:52 +0800 Subject: [PATCH 12/24] FROMLIST: ras: AEST: Allow configuring CE threshold via debugfs This commit introduces the ability to configure the Corrected Error (CE) threshold for AEST records through debugfs. This allows administrators to dynamically adjust the CE threshold for error reporting. Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-13-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- Documentation/ABI/testing/debugfs-aest | 16 ++++++++++ drivers/ras/aest/aest-sysfs.c | 42 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/Documentation/ABI/testing/debugfs-aest b/Documentation/ABI/testing/debugfs-aest index 295df9e9b4558..4d160072d37a7 100644 --- a/Documentation/ABI/testing/debugfs-aest +++ b/Documentation/ABI/testing/debugfs-aest @@ -24,6 +24,14 @@ Description: See more at: https://developer.arm.com/documentation/den0085/latest/ +What: /sys/kernel/debug/aest/.//ce_threshold +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (WO) Write the ce threshold to all records of this node. Failed + if input exceeded the maximum threshold + What: /sys/kernel/debug/aest/.//err_count Date: Dec 2025 KernelVersion 6.19 @@ -38,6 +46,14 @@ Contact: Ruidong Tian Description: (RO) Read err_* register and return val. +What: /sys/kernel/debug/aest/.//record/ce_threshold +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (RW) Read and write the ce threshold to this record. 
Failed + if input exceeded the maximum threshold + What: /sys/kernel/debug/aest/.//record/err_count Date: Dec 2025 KernelVersion 6.19 diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c index b54e879506aa9..392e7ad8328ed 100644 --- a/drivers/ras/aest/aest-sysfs.c +++ b/drivers/ras/aest/aest-sysfs.c @@ -7,6 +7,25 @@ #include "aest.h" +static void +aest_store_threshold(struct aest_record *record, void *data) +{ + u64 err_misc0, *threshold = data; + struct ce_threshold *ce = &record->ce; + + if (*threshold > ce->info->max_count) + return; + + ce->threshold = *threshold; + ce->count = ce->info->max_count - ce->threshold + 1; + + err_misc0 = record_read(record, ERXMISC0); + ce->reg_val = (err_misc0 & ~ce->info->mask) | + (ce->count << ce->info->shift); + + record_write(record, ERXMISC0, ce->reg_val); +} + static void aest_error_count(struct aest_record *record, void *data) { @@ -77,6 +96,27 @@ DEFINE_AEST_DEBUGFS_ATTR(err_misc1, ERXMISC1); DEFINE_AEST_DEBUGFS_ATTR(err_misc2, ERXMISC2); DEFINE_AEST_DEBUGFS_ATTR(err_misc3, ERXMISC3); +static int record_ce_threshold_get(void *data, u64 *val) +{ + struct aest_record *record = data; + + *val = record->ce.threshold; + return 0; +} + +static int record_ce_threshold_set(void *data, u64 val) +{ + u64 threshold = val; + struct aest_record *record = data; + + aest_store_threshold(record, &threshold); + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(record_ce_threshold_ops, record_ce_threshold_get, + record_ce_threshold_set, "%llu\n"); + static int aest_record_err_count_show(struct seq_file *m, void *data) { struct aest_record *record = m->private; @@ -116,6 +156,8 @@ static void aest_record_init_debugfs(struct aest_record *record) &err_misc3_ops); debugfs_create_file("err_count", 0400, record->debugfs, record, &aest_record_err_count_fops); + debugfs_create_file("ce_threshold", 0600, record->debugfs, record, + &record_ce_threshold_ops); } static void From 164311dadcbebc9f2fe04ef9d8e9d3025be4908b Mon Sep 17 
00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:53 +0800 Subject: [PATCH 13/24] FROMLIST: ras: AEST: Introduce AEST inject interface to test AEST driver AEST offers both soft and hard injection. Soft injection simulates errors in software, providing flexibility to define the error register content. Hard injection, on the other hand, utilizes error injection registers to introduce hardware faults, strictly requiring values that adhere to their specifications. Read Documentation/ABI/testing/debugfs-aest to learn how to use them. Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-14-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- Documentation/ABI/testing/debugfs-aest | 37 +++++++ drivers/ras/aest/Makefile | 1 + drivers/ras/aest/aest-core.c | 24 +++-- drivers/ras/aest/aest-inject.c | 131 +++++++++++++++++++++++++ drivers/ras/aest/aest-sysfs.c | 8 +- drivers/ras/aest/aest.h | 2 + 6 files changed, 193 insertions(+), 10 deletions(-) create mode 100644 drivers/ras/aest/aest-inject.c diff --git a/Documentation/ABI/testing/debugfs-aest b/Documentation/ABI/testing/debugfs-aest index 4d160072d37a7..cc41ea7032c72 100644 --- a/Documentation/ABI/testing/debugfs-aest +++ b/Documentation/ABI/testing/debugfs-aest @@ -60,3 +60,40 @@ KernelVersion 6.19 Contact: Ruidong Tian Description: (RO) Outputs error statistics for all this records. + +What: /sys/kernel/debug/aest/.//record/inject/err_* +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (RW) These registers are used to simulate soft injection errors + by holding error register values. You can write any values + to them. To trigger the injection, you need to write soft_inject + at last. The validity of the injected error depends on the + value written to err_status. + + Accepts values - any. 
+ +What: /sys/kernel/debug/aest/.//record/inject/soft_inject +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (WO) Write any value to this file to trigger the error + injection. Make sure you have specified all necessary error + parameters, i.e. this write should be the last step when + injecting errors. + + Accepts values - any. + +What: /sys/kernel/debug/aest/.//record/inject/hard_inject +Date: Dec 2025 +KernelVersion 6.19 +Contact: Ruidong Tian +Description: + (WO) If the AEST table provides error injection registers, + you can write to them via this interface. For instance, + values can be written to the ERXPFGCTL register. The post-injection + behavior is then determined by the hardware specification. + + Accepts values - any. diff --git a/drivers/ras/aest/Makefile b/drivers/ras/aest/Makefile index 75495413d2b6e..5ee10fc8b2e9d 100644 --- a/drivers/ras/aest/Makefile +++ b/drivers/ras/aest/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_AEST) += aest.o aest-y := aest-core.o aest-y += aest-sysfs.o +aest-y += aest-inject.o diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 75cca98024ad7..a290b482bf8b7 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -273,7 +273,7 @@ static void aest_panic(struct aest_record *record, struct ras_ext_regs *regs, panic(msg); } -static void aest_proc_record(struct aest_record *record, void *data) +void aest_proc_record(struct aest_record *record, void *data, bool fake) { struct ras_ext_regs regs = { 0 }; int *count = data; @@ -315,9 +315,15 @@ static void aest_proc_record(struct aest_record *record, void *data) /* panic if unrecoverable and uncontainable error encountered */ ue = FIELD_GET(ERR_STATUS_UET, regs.err_status); if ((regs.err_status & ERR_STATUS_UE) && - (ue == ERR_STATUS_UET_UC || ue == ERR_STATUS_UET_UEU)) - aest_panic(record, &regs, - "AEST: unrecoverable error encountered"); + (ue == ERR_STATUS_UET_UC || ue == ERR_STATUS_UET_UEU)) { + if (fake) +
aest_record_info( + record, + "Simulated error! Skip panic due to fault injection\n"); + else + aest_panic(record, &regs, + "AEST: unrecoverable error encountered"); + } aest_log(record, &regs); @@ -335,7 +341,8 @@ static void aest_proc_record(struct aest_record *record, void *data) record_write(record, ERXSTATUS, regs.err_status); } -static void aest_node_foreach_record(void (*func)(struct aest_record *, void *), +static void aest_node_foreach_record(void (*func)(struct aest_record *, void *, + bool), struct aest_node *node, void *data, unsigned long *bitmap) { @@ -344,7 +351,7 @@ static void aest_node_foreach_record(void (*func)(struct aest_record *, void *), for_each_clear_bit(i, bitmap, node->record_count) { aest_select_record(node, i); - func(&node->records[i], data); + func(&node->records[i], data, false); aest_sync(node); } @@ -379,7 +386,7 @@ static int aest_proc(struct aest_node *node) if (test_bit(i * BITS_PER_LONG + j, node->status_reporting)) continue; - aest_proc_record(&node->records[j], &count); + aest_proc_record(&node->records[j], &count, false); } } @@ -595,7 +602,8 @@ static int aest_init_record(struct aest_record *record, int i, return 0; } -static void aest_online_record(struct aest_record *record, void *data) +static void aest_online_record(struct aest_record *record, void *data, + bool __unused) { if (record_read(record, ERXFR) & ERR_FR_CE) aest_set_ce_threshold(record); diff --git a/drivers/ras/aest/aest-inject.c b/drivers/ras/aest/aest-inject.c new file mode 100644 index 0000000000000..fe6ccac8338e4 --- /dev/null +++ b/drivers/ras/aest/aest-inject.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Error Source Table Support + * + * Copyright (c) 2024, Alibaba Group.
+ */ + +#include "aest.h" + +static struct ras_ext_regs regs_inj; + +struct inj_attr { + struct attribute attr; + ssize_t (*show)(struct aest_node *n, struct inj_attr *a, char *b); + ssize_t (*store)(struct aest_node *n, struct inj_attr *a, const char *b, + size_t c); +}; + +struct aest_inject { + struct aest_node *node; + struct kobject kobj; +}; + +#define to_inj(k) container_of(k, struct aest_inject, kobj) +#define to_inj_attr(a) container_of(a, struct inj_attr, attr) + +static u64 aest_sysreg_read_inject(void *__unused, u32 offset) +{ + u64 *p = (u64 *)&regs_inj; + + return p[offset/8]; +} + +static void aest_sysreg_write_inject(void *base, u32 offset, u64 val) +{ + u64 *p = (u64 *)&regs_inj; + + p[offset/8] = val; +} + +static u64 aest_iomem_read_inject(void *base, u32 offset) +{ + u64 *p = (u64 *)&regs_inj; + + return p[offset/8]; +} + +static void aest_iomem_write_inject(void *base, u32 offset, u64 val) +{ + u64 *p = (u64 *)&regs_inj; + + p[offset/8] = val; +} + +static struct aest_access aest_access_inject[] = { + [ACPI_AEST_NODE_SYSTEM_REGISTER] = { + .read = aest_sysreg_read_inject, + .write = aest_sysreg_write_inject, + }, + + [ACPI_AEST_NODE_MEMORY_MAPPED] = { + .read = aest_iomem_read_inject, + .write = aest_iomem_write_inject, + }, + [ACPI_AEST_NODE_SINGLE_RECORD_MEMORY_MAPPED] = { + .read = aest_iomem_read_inject, + .write = aest_iomem_write_inject, + }, + { } +}; + +static int soft_inject_store(void *data, u64 val) +{ + int count = 0; + struct aest_record record_inj, *record = data; + struct aest_node node_inj, *node = record->node; + + memcpy(&node_inj, node, sizeof(*node)); + node_inj.name = "AEST-injection"; + + record_inj.access = &aest_access_inject[node->info->interface_hdr->type]; + record_inj.node = &node_inj; + record_inj.index = record->index; + + regs_inj.err_status |= ERR_STATUS_V; + + aest_proc_record(&record_inj, &count, true); + + if (count != 1) + return -EIO; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(soft_inject_ops, NULL, soft_inject_store,
"%llu\n"); + +static int hard_inject_store(void *data, u64 val) +{ + struct aest_record *record = data; + struct aest_node *node = record->node; + + if (!node->inj) + return -EPERM; + + aest_select_record(node, record->index); + record_write(record, ERXPFGCTL, val); + record_write(record, ERXPFGCDN, 0x100); + aest_sync(node); + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(hard_inject_ops, NULL, hard_inject_store, "%llu\n"); + +void aest_inject_init_debugfs(struct aest_record *record) +{ + struct dentry *inj; + + inj = debugfs_create_dir("inject", record->debugfs); + + debugfs_create_u64("err_fr", 0600, inj, &regs_inj.err_fr); + debugfs_create_u64("err_ctrl", 0600, inj, &regs_inj.err_ctlr); + debugfs_create_u64("err_status", 0600, inj, &regs_inj.err_status); + debugfs_create_u64("err_addr", 0600, inj, &regs_inj.err_addr); + debugfs_create_u64("err_misc0", 0600, inj, &regs_inj.err_misc[0]); + debugfs_create_u64("err_misc1", 0600, inj, &regs_inj.err_misc[1]); + debugfs_create_u64("err_misc2", 0600, inj, &regs_inj.err_misc[2]); + debugfs_create_u64("err_misc3", 0600, inj, &regs_inj.err_misc[3]); + debugfs_create_file("soft_inject", 0400, inj, record, &soft_inject_ops); + + if (record->node->inj) + debugfs_create_file("hard_inject", 0400, inj, record, &hard_inject_ops); +} diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c index 392e7ad8328ed..66e9c1103f996 100644 --- a/drivers/ras/aest/aest-sysfs.c +++ b/drivers/ras/aest/aest-sysfs.c @@ -158,6 +158,7 @@ static void aest_record_init_debugfs(struct aest_record *record) &aest_record_err_count_fops); debugfs_create_file("ce_threshold", 0600, record->debugfs, record, &record_ce_threshold_ops); + aest_inject_init_debugfs(record); } static void @@ -190,8 +191,8 @@ aest_oncore_dev_init_debugfs(struct aest_device *adev) for_each_possible_cpu(cpu) { percpu_dev = this_cpu_ptr(adev->adev_oncore); - snprintf(name, sizeof(name), "processor%u", cpu); - percpu_dev->debugfs = debugfs_create_dir(name, aest_debugfs); + snprintf(name,
sizeof(name), "processor%u%u", cpu); + percpu_dev->debugfs = debugfs_create_dir(name, adev->debugfs); for (i = 0; i < adev->node_cnt; i++) { node = &adev->nodes[i]; @@ -208,6 +209,9 @@ void aest_dev_init_debugfs(struct aest_device *adev) int i; struct aest_node *node; + if (!aest_debugfs) + dev_err(adev->dev, "debugfs not enabled\n"); + adev->debugfs = debugfs_create_dir(dev_name(adev->dev), aest_debugfs); if (aest_dev_is_oncore(adev)) { aest_oncore_dev_init_debugfs(adev); diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 802430857dc49..2f6a7b9ca4efd 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -362,3 +362,5 @@ static inline bool aest_dev_is_oncore(struct aest_device *adev) } void aest_dev_init_debugfs(struct aest_device *adev); +void aest_inject_init_debugfs(struct aest_record *record); +void aest_proc_record(struct aest_record *record, void *data, bool fake); From f4ff84065ba05247fee98b9518f4b578f794f4f4 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:54 +0800 Subject: [PATCH 14/24] FROMLIST: ras: AEST: Add framework to process AEST vendor node The AEST table includes vendor error nodes to support components that do not implement the standard Arm RAS architecture[1]. Each vendor node may have its own initialization and interrupt handling functions. This patch supplies a framework to process vendor error nodes; the vendor processing function is bound to the vendor HID.
[1]: https://developer.arm.com/documentation/ddi0587/latest/ Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-15-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 28 +++++++++++++++++++++++++++- drivers/ras/aest/aest.h | 5 +++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index a290b482bf8b7..047c9a8cffe40 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -922,6 +922,29 @@ static int aest_setup_irq(struct platform_device *pdev, return 0; } +static struct aest_vendor_match vendor_match[] = { + { }, +}; + +static int +aest_vendor_probe(struct aest_device *adev, struct aest_hnode *ahnode) +{ + int i; + struct acpi_aest_node *anode; + + anode = list_first_entry(&ahnode->list, struct acpi_aest_node, list); + if (!anode) + return -ENODEV; + + aest_dev_dbg(adev, "Try to probe vendor node %s\n", anode->vendor->acpi_hid); + for (i = 0; i < ARRAY_SIZE(vendor_match); i++) { + if (!strncmp(vendor_match[i].hid, anode->vendor->acpi_hid, 8)) + return vendor_match[i].probe(adev, ahnode); + } + + return -ENODEV; +} + static int aest_device_probe(struct platform_device *pdev) { int ret; @@ -947,7 +970,10 @@ static int aest_device_probe(struct platform_device *pdev) } init_llist_head(&adev->event_list); - ret = aest_init_nodes(adev, ahnode); + if (ahnode->type == ACPI_AEST_VENDOR_ERROR_NODE) + ret = aest_vendor_probe(adev, ahnode); + else + ret = aest_init_nodes(adev, ahnode); if (ret) return ret; diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 2f6a7b9ca4efd..304c03839d31f 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -244,6 +244,11 @@ static const char *const aest_node_name[] = { [ACPI_AEST_PROXY_ERROR_NODE] = "proxy", }; +struct aest_vendor_match { + char hid[ACPI_ID_LEN]; + int (*probe)(struct aest_device *adev, struct aest_hnode *anode); +}; + static inline 
int aest_set_name(struct aest_device *adev, struct aest_hnode *ahnode) { From c0c87da42a9fab728fa3f7cb5be9f60370d3868e Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:55 +0800 Subject: [PATCH 15/24] FROMLIST: ras: AEST: support vendor node CMN700 The CMN (Coherent Mesh Network) architecture incorporates five distinct device types. Each device type is associated with an error group register set. The struct aest_cmn_700 models a single CMN instance, while struct aest_cmn_700_child represents an individual CMN device. CMN's error records utilize a memory-mapped single error record view [1]. Critically, one error record corresponds to one AEST node, implying that a single CMN instance can generate hundreds of AEST nodes. To manage this scale, this driver introduces a virtual AEST node, which represents an entire CMN device, such as an HNI or HNF. This allows an HNF AEST node, for instance, to leverage its errgsr register to pinpoint which specific error record has reported an error. During the AEST probe phase, the CMN AEST driver identifies the CMN node type using the cmn_node_info register. It then reorganizes all AEST nodes belonging to the same CMN node type into a cohesive CMN AEST node structure. To locate the relevant CMN register addresses, the CMN's presence in the DSDT is required, along with the CMN node offset specified in the AEST vendor specification data [1]. 
[1]: https://developer.arm.com/documentation/102308/latest/ Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-16-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- arch/arm64/include/asm/arm-cmn.h | 47 +++++ drivers/perf/arm-cmn.c | 37 +--- drivers/ras/aest/Makefile | 1 + drivers/ras/aest/aest-cmn.c | 330 +++++++++++++++++++++++++++++++ drivers/ras/aest/aest-core.c | 42 ++-- drivers/ras/aest/aest.h | 39 ++++ 6 files changed, 444 insertions(+), 52 deletions(-) create mode 100644 arch/arm64/include/asm/arm-cmn.h create mode 100644 drivers/ras/aest/aest-cmn.c diff --git a/arch/arm64/include/asm/arm-cmn.h b/arch/arm64/include/asm/arm-cmn.h new file mode 100644 index 0000000000000..1b9f506797944 --- /dev/null +++ b/arch/arm64/include/asm/arm-cmn.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015 ARM Ltd. + */ +#ifndef __ASM_ARM_CMN_H +#define __ASM_ARM_CMN_H + +#include + +/* Common register stuff */ +#define CMN_NODE_INFO 0x0000 +#define CMN_NI_NODE_TYPE GENMASK_ULL(15, 0) +#define CMN_NI_NODE_ID GENMASK_ULL(31, 16) +#define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) + +enum cmn_node_type { + CMN_TYPE_INVALID, + CMN_TYPE_DVM, + CMN_TYPE_CFG, + CMN_TYPE_DTC, + CMN_TYPE_HNI, + CMN_TYPE_HNF, + CMN_TYPE_XP, + CMN_TYPE_SBSX, + CMN_TYPE_MPAM_S, + CMN_TYPE_MPAM_NS, + CMN_TYPE_RNI, + CMN_TYPE_RND = 0xd, + CMN_TYPE_RNSAM = 0xf, + CMN_TYPE_MTSX, + CMN_TYPE_HNP, + CMN_TYPE_CXRA = 0x100, + CMN_TYPE_CXHA, + CMN_TYPE_CXLA, + CMN_TYPE_CCRA, + CMN_TYPE_CCHA, + CMN_TYPE_CCLA, + CMN_TYPE_CCLA_RNI, + CMN_TYPE_HNS = 0x200, + CMN_TYPE_HNS_MPAM_S, + CMN_TYPE_HNS_MPAM_NS, + CMN_TYPE_APB = 0x1000, + /* Not a real node type */ + CMN_TYPE_WP = 0x7770 +}; + +#endif /* __ASM_ARM_CMN_H */ diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index f5305c8fdca43..4d0702f16a0ff 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2,6 +2,7 @@ // Copyright (C) 2016-2020 Arm Limited // ARM CMN/CI 
interconnect PMU driver +#include #include #include #include @@ -19,11 +20,6 @@ #include /* Common register stuff */ -#define CMN_NODE_INFO 0x0000 -#define CMN_NI_NODE_TYPE GENMASK_ULL(15, 0) -#define CMN_NI_NODE_ID GENMASK_ULL(31, 16) -#define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) - #define CMN_CHILD_INFO 0x0080 #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) @@ -242,37 +238,6 @@ enum cmn_revision { REV_CI700_R2P0, }; -enum cmn_node_type { - CMN_TYPE_INVALID, - CMN_TYPE_DVM, - CMN_TYPE_CFG, - CMN_TYPE_DTC, - CMN_TYPE_HNI, - CMN_TYPE_HNF, - CMN_TYPE_XP, - CMN_TYPE_SBSX, - CMN_TYPE_MPAM_S, - CMN_TYPE_MPAM_NS, - CMN_TYPE_RNI, - CMN_TYPE_RND = 0xd, - CMN_TYPE_RNSAM = 0xf, - CMN_TYPE_MTSX, - CMN_TYPE_HNP, - CMN_TYPE_CXRA = 0x100, - CMN_TYPE_CXHA, - CMN_TYPE_CXLA, - CMN_TYPE_CCRA, - CMN_TYPE_CCHA, - CMN_TYPE_CCLA, - CMN_TYPE_CCLA_RNI, - CMN_TYPE_HNS = 0x200, - CMN_TYPE_HNS_MPAM_S, - CMN_TYPE_HNS_MPAM_NS, - CMN_TYPE_APB = 0x1000, - /* Not a real node type */ - CMN_TYPE_WP = 0x7770 -}; - enum cmn_filter_select { SEL_NONE = -1, SEL_OCCUP1ID, diff --git a/drivers/ras/aest/Makefile b/drivers/ras/aest/Makefile index 5ee10fc8b2e9d..e5a45fde6d362 100644 --- a/drivers/ras/aest/Makefile +++ b/drivers/ras/aest/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_AEST) += aest.o aest-y := aest-core.o aest-y += aest-sysfs.o aest-y += aest-inject.o +aest-y += aest-cmn.o diff --git a/drivers/ras/aest/aest-cmn.c b/drivers/ras/aest/aest-cmn.c new file mode 100644 index 0000000000000..ad82ed163a8c5 --- /dev/null +++ b/drivers/ras/aest/aest-cmn.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Error Source Table CMN700 Support + * + * Copyright (c) 2025, Alibaba Inc + */ + +#include + +#include "aest.h" + +/* + * CMN include 5 device types, each device type has an error group register set + * which contains a set of error records. 
The struct aest_cmn_700 represents + * one CMN Instance, and struct aest_cmn_700_child represents one CMN device. + * CMN error records use the memory-mapped single error record view, so one + * record corresponds to one AEST node, which means there can be hundreds of + * AEST nodes per CMN. As described in chapter 2.6.3.4 of Arm ACPI Spec[1], we + * use vendor-defined data to recognize the device type of an AEST node. So AEST + * driver can enumerate all CMN AEST nodes to initialize struct aest_cmn_700 and + * aest_cmn_700_child with HID, UID and other CMN info described in AEST or CMN + * register. + * + * Each CMN Instance has its own error interrupt and the struct aest_cmn_700 + * is passed to interrupt context. The OS checks the error group register set + * to locate the record which reported the error. The procedure is similar to + * chapter 3.8 of the Arm CMN Spec[2]. + * + * The CMN RAS architecture is shown below: + * + * +----+ + * -->|XP | ...... + * | +----+ + * | + * | +----+ ...... + * | |HNI | +----------------+ + * | +----+ ->|record/AEST node| + * | | +----------------+ + * +------------+ | +----+ | . + * |CMN Instance|--| |HNF |---| . + * +------------+ | +----+ | . + * | | +----------------+ + * | +----+ ->|record/AEST node| + * | |SBSX| +----------------+ + * | +----+ ...... + * | + * | +----+ + * -->|CCG | ......
+ * +----+ + * + * [1]: https://developer.arm.com/documentation/den0093/latest + * [2]: https://developer.arm.com/documentation/102308/latest + */ + +#define CMN_RAS_DEV_NUM 6 +#define CMN700_ERRGSR_NUM 8 +#define CMN_MAX_UID 8 +#define CMN_ERRDEVARCH 0x3FB8 +#define CMN_ERRDEVARCH_REV GENMASK(19, 16) +#define CMN_ERRGSR_OFFSET 0x3000 + +struct cmn_vendor_data { + int node_type; + int node_id; + int logic_id; +}; + +struct cmn_config { + int errgsr_num; + int dev_num; + int ras_ver; + const int *node_id_map; + const char *const *node_name; + int (*errgsr_mapping)(int errgsr_bit); + u64 (*errgsr_offset)(u64 hnd_ofset, int node_idx); +}; + +static const char *const cmn700_node_name[] = { + [CMN_TYPE_HNI] = "HNI", [CMN_TYPE_HNF] = "HNF", + [CMN_TYPE_XP] = "XP", [CMN_TYPE_SBSX] = "SBSX", + [CMN_TYPE_CXRA] = "RND", [CMN_TYPE_MTSX] = "MTSX", +}; + +static const int cmn700_node_id_map[] = { + [CMN_TYPE_HNI] = 1, [CMN_TYPE_HNF] = 2, [CMN_TYPE_XP] = 0, + [CMN_TYPE_SBSX] = 3, [CMN_TYPE_CXRA] = 4, [CMN_TYPE_MTSX] = 5, +}; + +static u64 cmn_dev_array[CMN_MAX_UID]; +static struct cmn_config *cmn_config; + +static u64 cmn700_errgsr_offset(u64 hnd_offset, int node_idx) +{ + return hnd_offset + CMN_ERRGSR_OFFSET + + (node_idx * 2) * CMN700_ERRGSR_NUM * 8; +} + +static struct cmn_config cmn700_config = { + .errgsr_num = CMN700_ERRGSR_NUM, + .dev_num = CMN_RAS_DEV_NUM, + .ras_ver = 1, + .node_name = cmn700_node_name, + .node_id_map = cmn700_node_id_map, + .errgsr_mapping = cmn700_errgsr_mapping, + .errgsr_offset = cmn700_errgsr_offset, +}; + +static acpi_status aest_cmn_700_resource_ioremap(struct acpi_resource *res, + void *data) +{ + struct acpi_resource_address64 addr64; + u32 *uid = data; + acpi_status status; + + status = acpi_resource_to_address64(res, &addr64); + if (ACPI_FAILURE(status) || (addr64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + cmn_dev_array[*uid] = (u64)ioremap(addr64.address.minimum, + addr64.address.address_length); + + pr_debug("CMN device 
resource [%llx-%llx] ioremap to %llx\n", + addr64.address.minimum, addr64.address.maximum, + cmn_dev_array[*uid]); + + return AE_CTRL_TERMINATE; +} + +static acpi_status aest_cmn_get_dev_by_uid(acpi_handle handle, u32 level, + void *data, void **return_value) +{ + u32 *match_uid = data; + acpi_status status; + unsigned long long uid; + + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, NULL, &uid); + if (ACPI_FAILURE(status)) { + pr_err("Failed to evaluate _UID\n"); + return status; + } + + if (uid != *match_uid) + return AE_OK; + + pr_debug("CMN device instance %llx, walk through resource\n", uid); + + status = acpi_walk_resources(handle, METHOD_NAME__CRS, + aest_cmn_700_resource_ioremap, data); + + if (ACPI_FAILURE(status)) { + pr_err("Device does not have resource\n"); + return status; + } + + return AE_CTRL_TERMINATE; +} + +static inline int aest_cmn_node_ver(void *base) +{ + return FIELD_GET(CMN_ERRDEVARCH_REV, + readl_relaxed(base + CMN_ERRDEVARCH)); +} + +static int aest_cmn_init_node(struct aest_device *adev, + struct aest_node *cmn_node, + struct acpi_aest_node *anode, u64 type, + u64 errgsr_addr) +{ + cmn_node->info = anode; + cmn_node->name = devm_kasprintf(adev->dev, GFP_KERNEL, "%s", + cmn_config->node_name[type]); + if (!cmn_node->name) + return -ENOMEM; + cmn_node->errgsr = (void *)errgsr_addr; + cmn_node->type = anode->type; + cmn_node->adev = adev; + cmn_node->version = cmn_config->ras_ver; + cmn_node->errgsr_num = cmn_config->errgsr_num; + cmn_node->errgsr_mapping = cmn_config->errgsr_mapping; + cmn_node->record_count = cmn_node->errgsr_num * BITS_PER_LONG / 2; + cmn_node->record_implemented = devm_bitmap_zalloc( + adev->dev, cmn_node->record_count, GFP_KERNEL); + if (!cmn_node->record_implemented) + return -ENOMEM; + bitmap_set(cmn_node->record_implemented, 0, cmn_node->record_count); + + cmn_node->status_reporting = devm_bitmap_zalloc( + adev->dev, cmn_node->record_count, GFP_KERNEL); + if (!cmn_node->status_reporting)
+ return -ENOMEM; + bitmap_set(cmn_node->status_reporting, 0, cmn_node->record_count); + + cmn_node->records = devm_kcalloc(adev->dev, cmn_node->record_count, + sizeof(struct aest_record), + GFP_KERNEL); + if (!cmn_node->records) + return -ENOMEM; + + aest_node_dbg(cmn_node, "Node init with errgsr %llx\n", errgsr_addr); + + return 0; +} + +static int aest_cmn_reorgnize_node(struct aest_device *adev, + struct acpi_aest_node *anode, u64 base) +{ + struct aest_node *cmn_node; + u64 hnd_offset, cmn_node_offset, reg, logic_id, type, node_id; + u64 errgsr_addr, hnd_base; + struct aest_record *record; + int ret, node_index; + struct cmn_vendor_data *vendor_data; + + if (anode->interface_hdr->type != + ACPI_AEST_NODE_SINGLE_RECORD_MEMORY_MAPPED) { + aest_dev_err(adev, "CMN just use single memory mapping\n"); + return -ENODEV; + } + + hnd_offset = *((u64 *)anode->vendor->vendor_specific_data); + cmn_node_offset = *((u64 *)&anode->vendor->vendor_specific_data[8]); + + reg = readq_relaxed((void *)base + cmn_node_offset + CMN_NODE_INFO); + + logic_id = FIELD_GET(CMN_NI_LOGICAL_ID, reg); + type = FIELD_GET(CMN_NI_NODE_TYPE, reg); + node_id = FIELD_GET(CMN_NI_NODE_ID, reg); + + hnd_base = base + hnd_offset; + node_index = cmn_config->node_id_map[type]; + errgsr_addr = base + cmn_config->errgsr_offset(hnd_offset, node_index); + + // node not register, create it + cmn_node = &adev->nodes[node_index]; + if (!cmn_node->errgsr) { + ret = aest_cmn_init_node(adev, cmn_node, anode, type, + errgsr_addr); + if (ret) + return -ENOMEM; + } + + aest_dev_dbg(adev, "node type %llx, id %llx, offset %llx\n", type, + logic_id, cmn_node_offset); + + if (!test_bit(0, anode->record_implemented)) + clear_bit(logic_id, cmn_node->record_implemented); + + if (!test_bit(0, anode->status_reporting)) + clear_bit(logic_id, cmn_node->status_reporting); + + record = &cmn_node->records[logic_id]; + record->name = + devm_kasprintf(adev->dev, GFP_KERNEL, "record%lld", logic_id); + if (!record->name) + return 
-ENOMEM; + record->regs_base = devm_ioremap( + adev->dev, (resource_size_t)anode->interface_hdr->address, + sizeof(struct ras_ext_regs)); + if (!record->regs_base) + return -ENOMEM; + record->addressing_mode = test_bit(0, anode->addressing_mode); + record->node = cmn_node; + record->index = logic_id; + record->access = &aest_access[anode->interface_hdr->type]; + + vendor_data = devm_kzalloc(adev->dev, sizeof(*vendor_data), GFP_KERNEL); + if (!vendor_data) + return -ENOMEM; + vendor_data->node_type = type; + vendor_data->node_id = node_id; + vendor_data->logic_id = logic_id; + record->vendor_data = vendor_data; + record->vendor_data_size = sizeof(struct cmn_vendor_data); + + aest_record_dbg(record, "base %llx\n", anode->interface_hdr->address); + + return 0; +} + +// reorganize cmn node +static int aest_cmn_probe(struct aest_device *adev, struct aest_hnode *ahnode) +{ + acpi_status status; + u64 base; + int ret = 0; + struct acpi_aest_node *anode; + char name[9]; + + anode = list_first_entry_or_null(&ahnode->list, struct acpi_aest_node, list); + if (!anode || anode->vendor->acpi_uid >= CMN_MAX_UID) + return -ENODEV; + + if (!cmn_dev_array[anode->vendor->acpi_uid]) { + snprintf(name, 9, "%s", anode->vendor->acpi_hid); + status = acpi_get_devices(name, aest_cmn_get_dev_by_uid, + &anode->vendor->acpi_uid, NULL); + if (ACPI_FAILURE(status)) { + aest_dev_err(adev, "Can not find base\n"); + return -ENODEV; + } + } + base = cmn_dev_array[anode->vendor->acpi_uid]; + if (!base) { + aest_dev_err(adev, "Device base invalid\n"); + return -ENODEV; + } + + adev->type = anode->type; + adev->node_cnt = cmn_config->dev_num; + adev->nodes = devm_kcalloc(adev->dev, adev->node_cnt, + sizeof(struct aest_node), GFP_KERNEL); + if (!adev->nodes) + return -ENOMEM; + aest_set_name(adev, ahnode); + + list_for_each_entry(anode, &ahnode->list, list) { + ret = aest_cmn_reorgnize_node(adev, anode, base); + if (ret) + return ret; + } + + return 0; +} + +int aest_cmn700_probe(struct aest_device *adev, struct aest_hnode *ahnode) +{ + cmn_config =
&cmn700_config; + + return aest_cmn_probe(adev, ahnode); +} diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 047c9a8cffe40..bbf8b1142be75 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -152,6 +152,8 @@ static void init_aest_event(struct aest_event *event, memcpy(&event->regs, regs, sizeof(*regs)); event->index = record->index; event->addressing_mode = record->addressing_mode; + event->vendor_data_size = record->vendor_data_size; + event->vendor_data = record->vendor_data; } static int aest_node_gen_pool_add(struct aest_device *adev, @@ -341,10 +343,9 @@ void aest_proc_record(struct aest_record *record, void *data, bool fake) record_write(record, ERXSTATUS, regs.err_status); } -static void aest_node_foreach_record(void (*func)(struct aest_record *, void *, - bool), - struct aest_node *node, void *data, - unsigned long *bitmap) +void aest_node_foreach_record(void (*func)(struct aest_record *, void *, bool), + struct aest_node *node, void *data, + unsigned long *bitmap) { int i; @@ -359,7 +360,7 @@ static void aest_node_foreach_record(void (*func)(struct aest_record *, void *, static int aest_proc(struct aest_node *node) { - int count = 0, i, j, size = node->record_count; + int count = 0, i, j, size = node->record_count, record_idx; u64 err_group = 0; aest_node_dbg(node, "Poll bitmap %*pb\n", size, @@ -374,19 +375,21 @@ static int aest_proc(struct aest_node *node) node->status_reporting); for (i = 0; i < BITS_TO_U64(size); i++) { err_group = readq_relaxed((void *)node->errgsr + i * 8); - aest_node_dbg(node, "errgsr[%d]: 0x%llx\n", i, err_group); - for_each_set_bit(j, (unsigned long *)&err_group, BITS_PER_LONG) { + record_idx = + node->errgsr_mapping(i * BITS_PER_LONG + j); + aest_node_dbg(node, "errgsr[%d]: bit %d occur error\n", + i, record_idx); /* * Error group base is only valid in Memory Map node, * so driver do not need to write select register and * sync. 
*/ - if (test_bit(i * BITS_PER_LONG + j, - node->status_reporting)) + if (test_bit(record_idx, node->status_reporting)) continue; - aest_proc_record(&node->records[j], &count, false); + aest_proc_record(&node->records[record_idx], &count, + false); } } @@ -398,8 +401,11 @@ static irqreturn_t aest_irq_func(int irq, void *input) struct aest_device *adev = input; int i; - for (i = 0; i < adev->node_cnt; i++) + for (i = 0; i < adev->node_cnt; i++) { + if (!adev->nodes[i].record_count) + continue; aest_proc(&adev->nodes[i]); + } return IRQ_HANDLED; } @@ -776,6 +782,7 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, node->info = anode; node->type = anode->type; node->version = get_aest_node_ver(node); + node->errgsr_mapping = default_errgsr_mapping; node->name = alloc_aest_node_name(node); if (!node->name) return -ENOMEM; @@ -828,6 +835,7 @@ static int aest_init_node(struct aest_device *adev, struct aest_node *node, if (!node->records) return -ENOMEM; + node->errgsr_num = DIV_ROUND_UP(node->record_count, BITS_PER_LONG); for (i = 0; i < node->record_count; i++) { ret = aest_init_record(&node->records[i], i, node); if (ret) @@ -923,11 +931,12 @@ static int aest_setup_irq(struct platform_device *pdev, } static struct aest_vendor_match vendor_match[] = { - { }, + { "ARMHC700", &aest_cmn700_probe }, + {}, }; -static int -aest_vendor_probe(struct aest_device *adev, struct aest_hnode *ahnode) +static int aest_vendor_probe(struct aest_device *adev, + struct aest_hnode *ahnode) { int i; struct acpi_aest_node *anode; @@ -936,13 +945,14 @@ aest_vendor_probe(struct aest_device *adev, struct aest_hnode *ahnode) if (!anode) return -ENODEV; - aest_dev_dbg(adev, "Try to probe vendor node %s\n", anode->vendor->acpi_hid); + aest_dev_dbg(adev, "Try to probe vendor node %s\n", + anode->vendor->acpi_hid); for (i = 0; i < ARRAY_SIZE(vendor_match); i++) { if (!strncmp(vendor_match[i].hid, anode->vendor->acpi_hid, 8)) return vendor_match[i].probe(adev, ahnode); } - 
return -ENODEV; + return 0; } static int aest_device_probe(struct platform_device *pdev) diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 304c03839d31f..9d67d79eb4a2c 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -94,8 +94,16 @@ struct aest_event { /* Vendor node : hardware ID. */ char *hid; u32 index; + u64 ce_threshold; int addressing_mode; struct ras_ext_regs regs; + + /* + * This field is used to store vendor specific data for decoding error + * record by EDAC driver. + */ + void *vendor_data; + size_t vendor_data_size; }; struct aest_access { @@ -147,6 +155,9 @@ struct aest_record { enum ras_ce_threshold threshold_type; struct record_count count; struct dentry *debugfs; + + void *vendor_data; + size_t vendor_data_size; }; struct aest_group { @@ -208,6 +219,19 @@ struct aest_node { */ unsigned long *status_reporting; int version; + /* + * Usually bit[n] in errgsr means that the [n]th error record of this + * node reports an error, but some components use different rules. + * For example, CMN700 TRM 4.3.5.12 says: + * ``` Error occurs when the index is even and Fault + * occurs when the index is odd. ``` + * Bit[2n]: record[n] reports ERROR. + * Bit[2n + 1]: record[n] reports FAULT. + * errgsr_mapping function is used to map errgsr bit to record index + * for various components.
+ */ + int (*errgsr_mapping)(int errgsr_bit); + int errgsr_num; const struct aest_group *group; struct aest_device *adev; @@ -366,6 +390,21 @@ static inline bool aest_dev_is_oncore(struct aest_device *adev) return adev->type == ACPI_AEST_PROCESSOR_ERROR_NODE; } +static inline int default_errgsr_mapping(int errgsr_bit) +{ + return errgsr_bit; +} + +static inline int cmn700_errgsr_mapping(int errgsr_bit) +{ + return errgsr_bit / 2; +} + void aest_dev_init_debugfs(struct aest_device *adev); void aest_inject_init_debugfs(struct aest_record *record); void aest_proc_record(struct aest_record *record, void *data, bool fake); +void aest_node_foreach_record(void (*func)(struct aest_record *, void *, bool), + struct aest_node *node, void *data, + unsigned long *bitmap); + +int aest_cmn700_probe(struct aest_device *adev, struct aest_hnode *ahnode); From 79f71ed1fdc2dba283ed5dfb255e6e738cbb538b Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 22 Jan 2026 17:46:56 +0800 Subject: [PATCH 16/24] FROMLIST: trace, ras: add ARM RAS extension trace event Add a trace event for hardware errors reported by the ARMv8 RAS extension registers. userspace app can monitor this trace event and decode error information. 
Signed-off-by: Ruidong Tian Link: https://patch.msgid.link/20260122094656.73399-17-tianruidong@linux.alibaba.com Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 6 +++ drivers/ras/ras.c | 3 ++ include/ras/ras_event.h | 71 ++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index bbf8b1142be75..6a2d84b47721b 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -13,6 +13,8 @@ #include #include +#include + #include "aest.h" DEFINE_PER_CPU(struct aest_device, percpu_adev); @@ -90,6 +92,10 @@ static void aest_print(struct aest_event *event) pr_err("%s ERR%dMISC3: 0x%llx\n", pfx_seq, index, regs->err_misc[3]); } + + trace_arm_ras_ext_event(event->type, event->id0, event->id1, + event->index, event->hid, &event->regs, + event->vendor_data, event->vendor_data_size); } static void aest_handle_memory_failure(u64 addr) diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 03df3db623346..c8858b745021c 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -115,6 +115,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); EXPORT_TRACEPOINT_SYMBOL_GPL(non_standard_event); EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event); +#ifdef CONFIG_ARM64_RAS_EXTN +EXPORT_TRACEPOINT_SYMBOL_GPL(arm_ras_ext_event); +#endif static int __init parse_ras_param(char *str) { diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index fdb785fa4613a..c4063f7ad7342 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -381,6 +381,77 @@ TRACE_EVENT(aer_event, "Not available") ); #endif /* CONFIG_PCIEAER */ + +/* + * ARM RAS Extension Events Report + * + * This event is generated when an error reported by the ARM RAS extension + * hardware is detected. 
+ */ + +#ifdef CONFIG_ARM64_RAS_EXTN +#include +TRACE_EVENT(arm_ras_ext_event, + + TP_PROTO(const u8 type, + const u32 id0, + const u32 id1, + const u32 index, + char *hid, + struct ras_ext_regs *regs, + const u8 *data, + const u32 len), + + TP_ARGS(type, id0, id1, index, hid, regs, data, len), + + TP_STRUCT__entry( + __field(u8, type) + __field(u32, id0) + __field(u32, id1) + __field(u32, index) + __string(hid, hid) + __field(u64, err_fr) + __field(u64, err_ctlr) + __field(u64, err_status) + __field(u64, err_addr) + __field(u64, err_misc0) + __field(u64, err_misc1) + __field(u64, err_misc2) + __field(u64, err_misc3) + __field(u32, len) + __dynamic_array(u8, buf, len) + ), + + TP_fast_assign( + __entry->type = type; + __entry->id0 = id0; + __entry->id1 = id1; + __entry->index = index; + __assign_str(hid); + __entry->err_fr = regs->err_fr; + __entry->err_ctlr = regs->err_ctlr; + __entry->err_status = regs->err_status; + __entry->err_addr = regs->err_addr; + __entry->err_misc0 = regs->err_misc[0]; + __entry->err_misc1 = regs->err_misc[1]; + __entry->err_misc2 = regs->err_misc[2]; + __entry->err_misc3 = regs->err_misc[3]; + __entry->len = len; + memcpy(__get_dynamic_array(buf), data, len); + ), + + TP_printk("type: %d; id0: %d; id1: %d; index: %d; hid: %s; " + "ERR_FR: %llx; ERR_CTLR: %llx; ERR_STATUS: %llx; " + "ERR_ADDR: %llx; ERR_MISC0: %llx; ERR_MISC1: %llx; " + "ERR_MISC2: %llx; ERR_MISC3: %llx; data len:%d; raw data:%s", + __entry->type, __entry->id0, __entry->id1, __entry->index, + __get_str(hid), __entry->err_fr, __entry->err_ctlr, + __entry->err_status, __entry->err_addr, __entry->err_misc0, + __entry->err_misc1, __entry->err_misc2, __entry->err_misc3, + __entry->len, + __print_hex(__get_dynamic_array(buf), __entry->len)) +); +#endif /* CONFIG_ARM64_RAS_EXTN */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ From 514ff1f855ac5bb0dbea72faddeac71434b5522c Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:45
+0530 Subject: [PATCH 17/24] FROMLIST: ras: aest: Fix shared processor node handling and error log messages Two related fixes for processor nodes with ACPI_AEST_PROC_FLAG_SHARED or ACPI_AEST_PROC_FLAG_GLOBAL set (e.g. cluster L3 cache, DSU): 1. aest_dev_is_oncore() returns true for any PROCESSOR_ERROR_NODE, causing shared processor nodes (which use an SPI) to take the cpuhp/PPI path. cpuhp_setup_state() is called instead of aest_online_dev(), so aest_config_irq() is never called and the hardware IRQ-config register is never programmed. Fix aest_dev_is_oncore() to check irq_is_percpu() on the registered IRQ. Only nodes whose FHI or ERI is a per-CPU PPI take the oncore path, nodes with an SPI take aest_online_dev(). 2. alloc_aest_node_name() uses processor_id for the node name of all processor nodes. Shared/global nodes have processor_id=0 (the field is unused when SHARED/GLOBAL is set), so every shared node and the per-PE node for CPU 0 both got the name "processor.0", making error logs ambiguous. For shared/global nodes, build the name as "processor.." (e.g. "processor.cache.1") so each node has a unique, meaningful identifier. Per-PE nodes keep the original "processor." form. Also add proc_flags to struct aest_event so aest_print() can distinguish shared from per-PE nodes and print an appropriate message. 
Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-1-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 54 +++++++++++++++++++++++++++++++++--- drivers/ras/aest/aest.h | 15 +++++++++- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index 6a2d84b47721b..b4f4c975da1da 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -49,7 +49,19 @@ static void aest_print(struct aest_event *event) switch (event->type) { case ACPI_AEST_PROCESSOR_ERROR_NODE: - pr_err("%s Error from CPU%d\n", pfx_seq, event->id0); + /* + * For shared/global nodes (e.g. cluster L3 cache, DSU), + * id0 is the CPU that handled the interrupt — not the error + * source itself. The node_name already identifies the resource + * (e.g. "processor.cache.1"). Print a distinct message so the + * log is not confused with a per-PE CPU error. + */ + if (event->proc_flags & + (ACPI_AEST_PROC_FLAG_SHARED | ACPI_AEST_PROC_FLAG_GLOBAL)) + pr_err("%s Error from shared processor resource (interrupt handled on CPU%d)\n", + pfx_seq, event->id0); + else + pr_err("%s Error from CPU%d\n", pfx_seq, event->id0); break; case ACPI_AEST_MEMORY_ERROR_NODE: pr_err("%s Error from memory at SRAT proximity domain %#x\n", @@ -133,6 +145,7 @@ static void init_aest_event(struct aest_event *event, info->processor->processor_id); event->id1 = info->processor->resource_type; + event->proc_flags = info->processor->flags; break; case ACPI_AEST_MEMORY_ERROR_NODE: event->id0 = info->memory->srat_proximity_domain; @@ -175,6 +188,7 @@ static int aest_node_gen_pool_add(struct aest_device *adev, if (!event) return -ENOMEM; + memset(event, 0, sizeof(*event)); init_aest_event(event, record, regs); llist_add(&event->llnode, &adev->event_list); @@ -730,9 +744,41 @@ static char *alloc_aest_node_name(struct aest_node *node) switch (node->type) { case ACPI_AEST_PROCESSOR_ERROR_NODE: - name = 
devm_kasprintf(node->adev->dev, GFP_KERNEL, "%s.%d", - aest_node_name[node->type], - node->info->processor->processor_id); + /* + * Shared/global processor nodes (e.g. cluster L3 cache, DSU) + * have processor_id=0 and use smp_processor_id() at error-log + * time — using processor_id in the name would produce the same + * "processor.0" string for every shared node and every CPU0 + * per-PE node, making logs ambiguous. + * + * For shared/global nodes, build the name from the resource + * type and the device id so each node gets a unique, meaningful + * name (e.g. "processor.cache.1", "processor.tlb.2"). + * + * For per-PE nodes, keep the original "processor." form. + */ + if (node->info->processor->flags & + (ACPI_AEST_PROC_FLAG_SHARED | ACPI_AEST_PROC_FLAG_GLOBAL)) { + static const char *const res_name[] = { + [ACPI_AEST_CACHE_RESOURCE] = "cache", + [ACPI_AEST_TLB_RESOURCE] = "tlb", + [ACPI_AEST_GENERIC_RESOURCE] = "generic", + }; + u8 rtype = node->info->processor->resource_type; + const char *rstr = (rtype < ARRAY_SIZE(res_name) && + res_name[rtype]) ? res_name[rtype] : "unknown"; + + name = devm_kasprintf(node->adev->dev, GFP_KERNEL, + "%s.%s.%d", + aest_node_name[node->type], + rstr, + node->adev->id); + } else { + name = devm_kasprintf(node->adev->dev, GFP_KERNEL, + "%s.%d", + aest_node_name[node->type], + node->info->processor->processor_id); + } break; case ACPI_AEST_MEMORY_ERROR_NODE: case ACPI_AEST_SMMU_ERROR_NODE: diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h index 9d67d79eb4a2c..9704af97fee89 100644 --- a/drivers/ras/aest/aest.h +++ b/drivers/ras/aest/aest.h @@ -8,6 +8,7 @@ #include #include #include +#include #define MAX_GSI_PER_NODE 2 #define DEFAULT_CE_THRESHOLD 1 @@ -94,6 +95,8 @@ struct aest_event { /* Vendor node : hardware ID. 
*/ char *hid; u32 index; + /* Processor node: ACPI_AEST_PROC_FLAG_* bitmask (SHARED/GLOBAL) */ + u8 proc_flags; u64 ce_threshold; int addressing_mode; struct ras_ext_regs regs; @@ -387,7 +390,17 @@ static inline void aest_sync(struct aest_node *node) static inline bool aest_dev_is_oncore(struct aest_device *adev) { - return adev->type == ACPI_AEST_PROCESSOR_ERROR_NODE; + /* + * A processor node is "on-core" (uses PPI + cpuhp) only when its + * interrupt is a per-CPU PPI. A shared processor node (e.g. cluster + * L3 cache, DSU) uses an SPI and must follow the non-oncore path + * (aest_online_dev) so that aest_config_irq and aest_online_dev are + * called instead of cpuhp_setup_state. + */ + if (adev->type != ACPI_AEST_PROCESSOR_ERROR_NODE) + return false; + return irq_is_percpu(adev->irq[ACPI_AEST_NODE_FAULT_HANDLING]) || + irq_is_percpu(adev->irq[ACPI_AEST_NODE_ERROR_RECOVERY]); } static inline int default_errgsr_mapping(int errgsr_bit) From 7a56094ebd6416067f2475a76011bc645b88f23a Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:46 +0530 Subject: [PATCH 18/24] FROMLIST: ras: aest: Fix CE/UE error counts not incrementing in debugfs The error counts visible under: /sys/kernel/debug/aest//processor//err_count always reported zero, even though corrected errors (CEs) were being serviced by the interrupt handler. aest_oncore_dev_init_debugfs() sets up per CPU debugfs entries but wired them up incorrectly in two places: - this_cpu_ptr(adev->adev_oncore) was used inside for_each_possible_cpu(). This always selects the slot for the CPU executing the init code, so all debugfs files ended up referencing the same per CPU aest_device instance instead of the CPU indicated by the loop variable. - The code referenced adev->nodes[i], i.e. the template nodes allocated before __setup_ppi, rather than the per-CPU copies at percpu_dev->nodes[i]. 
The IRQ handler updates CE counters in the per-CPU records created by __setup_ppi, the template records are never touched at runtime, so err_count always read as zero. Fix this by: - Using per_cpu_ptr(adev->adev_oncore, cpu) when iterating over CPUs. Wiring debugfs files to percpu_dev->nodes[i] so counters reflect the data updated by the IRQ handler. - Using adev->nodes[i].name for debugfs directory names. The per-CPU node receives name via a shallow memcpy and is not the authoritative source. Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-2-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-sysfs.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c index 66e9c1103f996..f710503e4d74a 100644 --- a/drivers/ras/aest/aest-sysfs.c +++ b/drivers/ras/aest/aest-sysfs.c @@ -189,16 +189,23 @@ aest_oncore_dev_init_debugfs(struct aest_device *adev) char name[16]; for_each_possible_cpu(cpu) { - percpu_dev = this_cpu_ptr(adev->adev_oncore); + percpu_dev = per_cpu_ptr(adev->adev_oncore, cpu); - snprintf(name, sizeof(name), "processor%u%u", cpu); + snprintf(name, sizeof(name), "processor%u", cpu); percpu_dev->debugfs = debugfs_create_dir(name, adev->debugfs); for (i = 0; i < adev->node_cnt; i++) { - node = &adev->nodes[i]; - - node->debugfs = debugfs_create_dir(node->name, - percpu_dev->debugfs); + node = &percpu_dev->nodes[i]; + + /* + * Use adev->nodes[i].name (the original) rather than + * node->name from the per-CPU copy. The per-CPU copy + * receives node->name via shallow memcpy in __setup_ppi; + * the original is the authoritative, guaranteed-valid + * string. 
+ */ + node->debugfs = debugfs_create_dir(adev->nodes[i].name, + percpu_dev->debugfs); aest_node_init_debugfs(node); } } From 598c17b9608539f22712b4fa1f30813493223756 Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:47 +0530 Subject: [PATCH 19/24] FROMLIST: ras: aest: Skip unimplemented records in debugfs The record_implemented bitmap uses the same semantics as the rest of the driver: a SET bit means the record is NOT implemented (skip it), a CLEAR bit means the record IS implemented (process it). aest_node_init_debugfs() and aest_node_err_count_show() were iterating all record_count records unconditionally, creating debugfs entries and accumulating error counts for unimplemented records too. Fix both functions to skip records where the corresponding bit is set in node->record_implemented, consistent with how aest_node_foreach_record() handles the same bitmap. Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-3-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/ras/aest/aest-sysfs.c b/drivers/ras/aest/aest-sysfs.c index f710503e4d74a..b36190bb3b3e4 100644 --- a/drivers/ras/aest/aest-sysfs.c +++ b/drivers/ras/aest/aest-sysfs.c @@ -52,7 +52,8 @@ static int aest_node_err_count_show(struct seq_file *m, void *data) int i; for (i = 0; i < node->record_count; i++) - aest_error_count(&node->records[i], &count); + if (!test_bit(i, node->record_implemented)) + aest_error_count(&node->records[i], &count); seq_printf(m, "CE: %llu\n" "DE: %llu\n" @@ -174,8 +175,11 @@ aest_node_init_debugfs(struct aest_node *node) record = &node->records[i]; if (!record->name) continue; + /* Skip records not implemented on this node. 
*/ + if (test_bit(i, node->record_implemented)) + continue; record->debugfs = debugfs_create_dir(record->name, - node->debugfs); + node->debugfs); aest_record_init_debugfs(record); } } From 8367438562ef1b01b63199c69b61f3a869e49ff1 Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:48 +0530 Subject: [PATCH 20/24] FROMLIST: ras: aest: Add panic_on_ue module parameter The driver unconditionally calls panic() whenever an unrecoverable, uncontainable UE (UET_UC or UET_UEU) is detected. There is no way for the user to suppress this behaviour, which makes it difficult to test UE injection or to run in environments where a kernel panic on every UE is undesirable. Add a module parameter `aest_panic_on_ue`. When set to 0 the driver logs the UE and continues instead of panicking; the default (1) keeps the existing panic behaviour. Usage: # Boot time (kernel cmdline) aest.aest_panic_on_ue=0 # Runtime echo 0 > /sys/module/aest/parameters/aest_panic_on_ue Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-4-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- drivers/ras/aest/aest-core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c index b4f4c975da1da..9ce782a66edfc 100644 --- a/drivers/ras/aest/aest-core.c +++ b/drivers/ras/aest/aest-core.c @@ -22,6 +22,11 @@ DEFINE_PER_CPU(struct aest_device, percpu_adev); #undef pr_fmt #define pr_fmt(fmt) "AEST: " fmt +static bool aest_panic_on_ue = true; +module_param(aest_panic_on_ue, bool, 0644); +MODULE_PARM_DESC(aest_panic_on_ue, + "Panic on unrecoverable error: 0=off 1=on (default: 1)"); + #ifdef CONFIG_DEBUG_FS struct dentry *aest_debugfs; #endif @@ -342,9 +347,11 @@ void aest_proc_record(struct aest_record *record, void *data, bool fake) aest_record_info( record, "Simulated error! 
Skip panic due to fault injection\n"); - else + else if (aest_panic_on_ue) aest_panic(record, &regs, "AEST: unrecoverable error encountered"); + else + aest_record_err(record, "UE detected, panic suppressed\n"); } aest_log(record, &regs); From 7f5012d410b4919321a7f91100fed00ba7cbc7a9 Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:49 +0530 Subject: [PATCH 21/24] FROMLIST: dt-bindings: arm: ras: Introduce bindings for ARM AEST The Arm Error Source Table (AEST) specification describes how firmware exposes RAS error source topology to the operating system. On ACPI systems this information is provided via the AEST ACPI table. Introduce Device Tree bindings that provide an equivalent description of AEST error sources for DT-based platforms. Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-5-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- .../devicetree/bindings/arm/arm,aest.yaml | 406 ++++++++++++++++++ include/dt-bindings/arm/aest.h | 43 ++ 2 files changed, 449 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/arm,aest.yaml create mode 100644 include/dt-bindings/arm/aest.h diff --git a/Documentation/devicetree/bindings/arm/arm,aest.yaml b/Documentation/devicetree/bindings/arm/arm,aest.yaml new file mode 100644 index 0000000000000..7809a0d382703 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/arm,aest.yaml @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/arm/arm,aest.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Arm Error Source Table (AEST) + +maintainers: + - Umang Chheda + +description: + The Arm Error Source Table (AEST) describes RAS error sources and their + register interfaces. Each error source exposes one or more error records + through either system registers or a memory-mapped register window, and + may signal errors via interrupts. 
The top-level node acts as a container + for one or more child nodes, each describing a single AEST error source. + Refer to the Arm AEST specification (DEN0085 / DDI 0587B) for details. + Flag bit constants for use in DT source files are defined in + <dt-bindings/arm/aest.h>. + +properties: + compatible: + const: arm,aest + + "#address-cells": + const: 2 + + "#size-cells": + const: 2 + + ranges: true + +required: + - compatible + +additionalProperties: false + +patternProperties: + "^aest-[a-z0-9-]+(@[0-9a-f]+)?$": + type: object + description: + An AEST error source node describing one error source defined by + the Arm AEST specification. + + properties: + compatible: + description: + Identifies the type of AEST error source. Each value corresponds to + a distinct error source class defined by the Arm AEST specification. + arm,aest-proxy represents a proxy error source that forwards errors + from another error source. + enum: + - arm,aest-processor + - arm,aest-memory + - arm,aest-smmu + - arm,aest-gic + - arm,aest-pcie + - arm,aest-vendor + - arm,aest-proxy + + reg: + description: + Register ranges for the error source. Absence of reg implies + system-register access (interface type 0). A single range implies + memory-mapped access (interface type 1). Two ranges imply + single-record memory-mapped access (interface type 2). + minItems: 1 + maxItems: 4 + + reg-names: + description: + Names for the register ranges. The base error-record window is + unnamed (or first entry). Optional named ranges provide access to + the fault-injection, error-group, and interrupt-config register + windows defined by the AEST specification. + minItems: 1 + maxItems: 4 + items: + enum: + - fault-inject + - err-group + - irq-config + + interrupts: + description: Interrupts associated with the error source. + minItems: 1 + maxItems: 2 + + interrupt-names: + description: Names of the interrupts associated with the error source. 
+ minItems: 1 + maxItems: 2 + items: + enum: + - fhi + - eri + + arm,fhi-flags: + description: + Bitmask of flags for the fault-handling interrupt (FHI), as defined + in the AEST node interrupt structure flags field. Constants are + defined in <dt-bindings/arm/aest.h> - AEST_IRQ_MODE_LEVEL (0), + AEST_IRQ_MODE_EDGE (1). + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,eri-flags: + description: + Bitmask of flags for the error-recovery interrupt (ERI), as defined + in the AEST node interrupt structure flags field. Constants are + defined in <dt-bindings/arm/aest.h>. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,interface-flags: + description: | + Bitmask of interface flags for the error source, as defined in the + AEST node interface flags field. Constants are defined in + <dt-bindings/arm/aest.h>: + AEST_XFACE_SHARED (bit 0) - shared error source, + AEST_XFACE_CLEAR_MISC (bit 1) - clear MISC registers on error, + AEST_XFACE_ERROR_DEVICE (bit 2) - error node device present, + AEST_XFACE_AFFINITY (bit 3) - affinity information valid, + AEST_XFACE_ERROR_GROUP (bit 4) - error group register window present, + AEST_XFACE_FAULT_INJECT (bit 5) - fault injection register window present, + AEST_XFACE_INT_CONFIG (bit 6) - interrupt config register window present. + For system-register interface nodes (no reg property), only + AEST_XFACE_CLEAR_MISC is meaningful; the MMIO window flags + (AEST_XFACE_ERROR_GROUP, AEST_XFACE_FAULT_INJECT, + AEST_XFACE_INT_CONFIG) have no effect without a base address. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,group-format: + description: | + Page-granularity of the error record group register window, which + determines the MMIO mapping size, the number of ERRGSR registers, + and the width of the record-implemented and status-reporting bitmaps. + Constants are defined in <dt-bindings/arm/aest.h>: + AEST_GROUP_FORMAT_4K (0) - 4K window, 1 ERRGSR, up to 64 records, + AEST_GROUP_FORMAT_16K (1) - 16K window, 4 ERRGSRs, up to 256 records, + AEST_GROUP_FORMAT_64K (2) - 64K window, 14 ERRGSRs, up to 896 records. 
+ Required for memory-mapped nodes (reg present) where it controls + the ioremap size and ERRGSR layout. For system-register nodes + (no reg property) this property is optional and defaults to + AEST_GROUP_FORMAT_4K. + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2] + + arm,num-records: + description: Number of error records implemented by this error source. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,record-impl: + description: + Bitmap of implemented error records within this error source. Bit N + set to 0 means error record N is implemented and must be polled. + $ref: /schemas/types.yaml#/definitions/uint64-array + + arm,status-reporting: + description: + Bitmap indicating which error records support status reporting via + the ERRGSR register. Bit N set to 1 means record N does not report + through ERRGSR and must be polled explicitly. + $ref: /schemas/types.yaml#/definitions/uint64-array + + arm,addressing-mode: + description: + Bitmap indicating the address type reported in ERR_ADDR for each + error record. Bit N set to 0 means record N reports System Physical + Addresses (SPA); bit N set to 1 means record N reports node-specific + Logical Addresses (LA) that require OS translation to SPA. + $ref: /schemas/types.yaml#/definitions/uint64-array + + arm,processor-flags: + description: + Bitmask indicating the scope of a processor error source, as defined + in the AEST processor node flags field. Constants are defined in + <dt-bindings/arm/aest.h> - AEST_PROC_GLOBAL (bit 0), + AEST_PROC_SHARED (bit 1). + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,resource-type: + description: | + Type of processor resource associated with this error source. + Constants are defined in <dt-bindings/arm/aest.h>: + AEST_RESOURCE_CACHE (0), + AEST_RESOURCE_TLB (1), + AEST_RESOURCE_GENERIC (2). + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2] + + arm,cache-ref: + description: + Phandle to the cache node associated with this processor error source. 
+ $ref: /schemas/types.yaml#/definitions/phandle + + arm,tlb-level: + description: TLB level identifier for this processor TLB error source. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,resource-ref: + description: + Generic resource reference identifier for this processor error source. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,proximity-domain: + description: + SRAT proximity domain of the memory node associated with this error + source. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,smmu-ref: + description: + Phandle to the SMMU node in the IORT associated with this error + source. + $ref: /schemas/types.yaml#/definitions/phandle + + arm,smmu-subcomponent: + description: + SMMU subcomponent reference identifier for this error source, as + defined in the AEST SMMU node structure. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,gic-type: + description: | + GIC component type for this error source, as defined in the AEST GIC + node structure. Constants are defined in <dt-bindings/arm/aest.h>: + AEST_GIC_CPU (0), + AEST_GIC_DISTRIBUTOR (1), + AEST_GIC_REDISTRIBUTOR (2), + AEST_GIC_ITS (3). + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + + arm,gic-instance: + description: + GIC instance identifier for this error source, used to distinguish + multiple instances of the same GIC component type. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,pcie-segment: + description: + PCI segment number of the PCIe root port associated with this error + source, corresponding to the IORT node reference. + $ref: /schemas/types.yaml#/definitions/uint32 + + arm,vendor-hid: + description: + 8-character ACPI Hardware ID string identifying the vendor error + source, as defined in the AEST vendor node structure. + $ref: /schemas/types.yaml#/definitions/string + + arm,vendor-uid: + description: + ACPI unique instance identifier for this vendor error source, used + to distinguish multiple instances with the same hardware ID. 
+ $ref: /schemas/types.yaml#/definitions/uint32 + + required: + - compatible + - arm,num-records + + allOf: + - if: + required: + - reg + then: + required: + - arm,group-format + - if: + properties: + compatible: + contains: + const: arm,aest-processor + then: + properties: + arm,processor-flags: {} + arm,resource-type: {} + arm,cache-ref: {} + arm,tlb-level: {} + arm,resource-ref: {} + else: + properties: + arm,processor-flags: false + arm,resource-type: false + arm,cache-ref: false + arm,tlb-level: false + arm,resource-ref: false + + - if: + properties: + compatible: + contains: + const: arm,aest-memory + then: + required: + - arm,proximity-domain + properties: + arm,proximity-domain: {} + else: + properties: + arm,proximity-domain: false + + - if: + properties: + compatible: + contains: + const: arm,aest-smmu + then: + required: + - arm,smmu-ref + properties: + arm,smmu-ref: {} + arm,smmu-subcomponent: {} + else: + properties: + arm,smmu-ref: false + arm,smmu-subcomponent: false + + - if: + properties: + compatible: + contains: + const: arm,aest-gic + then: + properties: + arm,gic-type: {} + arm,gic-instance: {} + else: + properties: + arm,gic-type: false + arm,gic-instance: false + + - if: + properties: + compatible: + contains: + const: arm,aest-pcie + then: + required: + - arm,pcie-segment + properties: + arm,pcie-segment: {} + else: + properties: + arm,pcie-segment: false + + - if: + properties: + compatible: + contains: + const: arm,aest-vendor + then: + required: + - arm,vendor-hid + properties: + arm,vendor-hid: {} + arm,vendor-uid: {} + else: + properties: + arm,vendor-hid: false + arm,vendor-uid: false + + unevaluatedProperties: false + +examples: + - | + #include + #include + + aest { + compatible = "arm,aest"; + #address-cells = <2>; + #size-cells = <2>; + + /* System-register based processor error source (no reg property) */ + aest-processor-0 { + compatible = "arm,aest-processor"; + arm,num-records = <2>; + arm,record-impl = /bits/ 64 <0x3>; + 
arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + arm,resource-type = ; + interrupts = ; + interrupt-names = "fhi"; + }; + + /* Memory-mapped memory controller error source */ + aest-memory-0@50010000 { + compatible = "arm,aest-memory"; + reg = <0x0 0x50010000 0x0 0x1000>, + <0x0 0x50011000 0x0 0x1000>, + <0x0 0x50012000 0x0 0x1000>; + reg-names = "err-group", "fault-inject", "irq-config"; + arm,group-format = ; + arm,num-records = <4>; + arm,record-impl = /bits/ 64 <0xf>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,interface-flags = ; + arm,proximity-domain = <0>; + interrupts = , + ; + interrupt-names = "fhi", "eri"; + }; + }; diff --git a/include/dt-bindings/arm/aest.h b/include/dt-bindings/arm/aest.h new file mode 100644 index 0000000000000..43679314e98e8 --- /dev/null +++ b/include/dt-bindings/arm/aest.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for the Arm Error Source Table (AEST) + * DT binding (Documentation/devicetree/bindings/arm/arm,aest.yaml). 
+ */ + +#ifndef _DT_BINDINGS_ARM_AEST_H +#define _DT_BINDINGS_ARM_AEST_H + +/* arm,interface-flags - AEST node interface flags field */ +#define AEST_XFACE_SHARED 1 +#define AEST_XFACE_CLEAR_MISC 2 +#define AEST_XFACE_ERROR_DEVICE 4 +#define AEST_XFACE_AFFINITY 8 +#define AEST_XFACE_ERROR_GROUP 16 +#define AEST_XFACE_FAULT_INJECT 32 +#define AEST_XFACE_INT_CONFIG 64 + +/* arm,fhi-flags / arm,eri-flags - AEST node interrupt flags field */ +#define AEST_IRQ_MODE_LEVEL 0 +#define AEST_IRQ_MODE_EDGE 1 + +/* arm,processor-flags - AEST processor node flags field */ +#define AEST_PROC_GLOBAL 1 +#define AEST_PROC_SHARED 2 + +/* arm,group-format - error record group register window page size */ +#define AEST_GROUP_FORMAT_4K 0 +#define AEST_GROUP_FORMAT_16K 1 +#define AEST_GROUP_FORMAT_64K 2 + +/* arm,resource-type - processor resource type */ +#define AEST_RESOURCE_CACHE 0 +#define AEST_RESOURCE_TLB 1 +#define AEST_RESOURCE_GENERIC 2 + +/* arm,gic-type - GIC component type */ +#define AEST_GIC_CPU 0 +#define AEST_GIC_DISTRIBUTOR 1 +#define AEST_GIC_REDISTRIBUTOR 2 +#define AEST_GIC_ITS 3 + +#endif /* _DT_BINDINGS_ARM_AEST_H */ From 7d4f1febd38130ac50897f7b0c7b9e03dfcfa034 Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:50 +0530 Subject: [PATCH 22/24] FROMLIST: ras: aest: Add DT frontend for ARM AEST RAS error sources Add a Device Tree frontend for the Arm AEST RAS framework, allowing the existing AEST core driver to be used on DT-only systems. The DT frontend parses the "arm,aest" Device Tree hierarchy and populates the same internal structures as the ACPI-based implementation. It is initialized at the same layer as ACPI and is mutually exclusive with it, ensuring identical behaviour regardless of the firmware interface in use. 
Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-6-d5d6ffacf0a5@oss.qualcomm.com/ Signed-off-by: Umang Chheda --- drivers/ras/aest/Kconfig | 15 +- drivers/ras/aest/Makefile | 2 + drivers/ras/aest/aest-of.c | 673 +++++++++++++++++++++++++++++++++++++ 3 files changed, 688 insertions(+), 2 deletions(-) create mode 100644 drivers/ras/aest/aest-of.c diff --git a/drivers/ras/aest/Kconfig b/drivers/ras/aest/Kconfig index 0b09a5d5acce3..ca034255faddf 100644 --- a/drivers/ras/aest/Kconfig +++ b/drivers/ras/aest/Kconfig @@ -7,11 +7,22 @@ config AEST tristate "ARM AEST Driver" - depends on ACPI_AEST && RAS - + depends on ACPI_AEST || OF_AEST + depends on RAS help The Arm Error Source Table (AEST) provides details on ACPI extensions that enable kernel-first handling of errors in a system that supports the Armv8 RAS extensions. If set, the kernel will report and log hardware errors. + +config OF_AEST + bool "ARM Error Source Table DT Support" + depends on ARM64_RAS_EXTN && OF + help + Enable support for discovering ARM RAS error sources using the + Device Tree based Arm Error Source Table (AEST) specification. + This allows the kernel to enumerate and manage hardware error + reporting blocks described in firmware for ARMv8 and later + systems. Select this option if your platform describes AEST + nodes in Device Tree and relies on RAS error handling. diff --git a/drivers/ras/aest/Makefile b/drivers/ras/aest/Makefile index e5a45fde6d362..2997952901c05 100644 --- a/drivers/ras/aest/Makefile +++ b/drivers/ras/aest/Makefile @@ -6,3 +6,5 @@ aest-y := aest-core.o aest-y += aest-sysfs.o aest-y += aest-inject.o aest-y += aest-cmn.o + +obj-$(CONFIG_OF_AEST) += aest-of.o diff --git a/drivers/ras/aest/aest-of.c b/drivers/ras/aest/aest-of.c new file mode 100644 index 0000000000000..939db2c417427 --- /dev/null +++ b/drivers/ras/aest/aest-of.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. 
and/or its subsidiaries. + */ + +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "DT AEST: " fmt + +struct dt_aest_priv { + struct xarray aest_array; + u32 node_id; +}; + +static const struct of_device_id dt_aest_child_match[] = { + { .compatible = "arm,aest-processor", .data = (void *)ACPI_AEST_PROCESSOR_ERROR_NODE }, + { .compatible = "arm,aest-memory", .data = (void *)ACPI_AEST_MEMORY_ERROR_NODE }, + { .compatible = "arm,aest-smmu", .data = (void *)ACPI_AEST_SMMU_ERROR_NODE }, + { .compatible = "arm,aest-vendor", .data = (void *)ACPI_AEST_VENDOR_ERROR_NODE }, + { .compatible = "arm,aest-gic", .data = (void *)ACPI_AEST_GIC_ERROR_NODE }, + { .compatible = "arm,aest-pcie", .data = (void *)ACPI_AEST_PCIE_ERROR_NODE }, + { .compatible = "arm,aest-proxy", .data = (void *)ACPI_AEST_PROXY_ERROR_NODE }, + { } +}; + +static int dt_aest_node_type(struct device_node *np) +{ + const struct of_device_id *match; + + match = of_match_node(dt_aest_child_match, np); + if (!match) { + pr_warn("unknown compatible for %pOF\n", np); + return -EINVAL; + } + return (int)(uintptr_t)match->data; +} + +static struct aest_hnode *dt_aest_alloc_hnode(int node_type, u32 id) +{ + struct aest_hnode *ahnode; + + ahnode = kzalloc_obj(*ahnode, GFP_KERNEL); + if (!ahnode) + return NULL; + + INIT_LIST_HEAD(&ahnode->list); + ahnode->count = 0; + ahnode->id = id; + ahnode->type = node_type; + return ahnode; +} + +static int dt_aest_build_interface(struct device_node *np, + struct acpi_aest_node *anode) +{ + struct acpi_aest_node_interface_header *hdr; + struct acpi_aest_node_interface_common *common; + struct resource res; + struct resource named_res; + u32 gfmt = 0, flags = 0, nrec = 1; + u32 itype; + int ret; + size_t body_sz; + + /* + * Deduce interface type from the presence and count of reg entries: + * no reg -> system-register access (type 0) + * 1 range -> memory-mapped access (type 1) + * 2+ ranges -> single-record MMIO (type 2) + */ 
+ if (!of_property_present(np, "reg")) + itype = ACPI_AEST_NODE_SYSTEM_REGISTER; + else if (of_property_count_elems_of_size(np, "reg", sizeof(u32)) <= + (of_n_addr_cells(np) + of_n_size_cells(np))) + itype = ACPI_AEST_NODE_MEMORY_MAPPED; + else + itype = ACPI_AEST_NODE_SINGLE_RECORD_MEMORY_MAPPED; + + of_property_read_u32(np, "arm,group-format", &gfmt); + of_property_read_u32(np, "arm,interface-flags", &flags); + of_property_read_u32(np, "arm,num-records", &nrec); + + switch (gfmt) { + case ACPI_AEST_NODE_GROUP_FORMAT_16K: + body_sz = sizeof(struct acpi_aest_node_interface_16k); + break; + case ACPI_AEST_NODE_GROUP_FORMAT_64K: + body_sz = sizeof(struct acpi_aest_node_interface_64k); + break; + default: + body_sz = sizeof(struct acpi_aest_node_interface_4k); + break; + } + + hdr = kzalloc(sizeof(*hdr) + body_sz, GFP_KERNEL); + if (!hdr) + return -ENOMEM; + + /* Fill header */ + hdr->type = (u8)itype; + hdr->group_format = (u8)gfmt; + hdr->flags = flags; + hdr->error_record_count = nrec; + hdr->error_record_index = 0; + + if (itype != ACPI_AEST_NODE_SYSTEM_REGISTER) { + ret = of_address_to_resource(np, 0, &res); + if (ret) { + pr_err("node %pOF: missing 'reg' for MMIO interface\n", np); + kfree(hdr); + return ret; + } + hdr->address = res.start; + } + + switch (gfmt) { + case ACPI_AEST_NODE_GROUP_FORMAT_4K: { + struct acpi_aest_node_interface_4k *b = + (struct acpi_aest_node_interface_4k *)(hdr + 1); + of_property_read_u64(np, "arm,record-impl", + &b->error_record_implemented); + of_property_read_u64(np, "arm,status-reporting", + &b->error_status_reporting); + of_property_read_u64(np, "arm,addressing-mode", + &b->addressing_mode); + common = &b->common; + anode->record_implemented = + (unsigned long *)&b->error_record_implemented; + anode->status_reporting = + (unsigned long *)&b->error_status_reporting; + anode->addressing_mode = + (unsigned long *)&b->addressing_mode; + break; + } + case ACPI_AEST_NODE_GROUP_FORMAT_16K: { + struct acpi_aest_node_interface_16k *b = 
+ (struct acpi_aest_node_interface_16k *)(hdr + 1); + of_property_read_u64_array(np, "arm,record-impl", + b->error_record_implemented, 4); + of_property_read_u64_array(np, "arm,status-reporting", + b->error_status_reporting, 4); + of_property_read_u64_array(np, "arm,addressing-mode", + b->addressing_mode, 4); + common = &b->common; + anode->record_implemented = + (unsigned long *)b->error_record_implemented; + anode->status_reporting = + (unsigned long *)b->error_status_reporting; + anode->addressing_mode = + (unsigned long *)b->addressing_mode; + break; + } + case ACPI_AEST_NODE_GROUP_FORMAT_64K: { + struct acpi_aest_node_interface_64k *b = + (struct acpi_aest_node_interface_64k *)(hdr + 1); + of_property_read_u64_array(np, "arm,record-impl", + b->error_record_implemented, 14); + of_property_read_u64_array(np, "arm,status-reporting", + b->error_status_reporting, 14); + of_property_read_u64_array(np, "arm,addressing-mode", + b->addressing_mode, 14); + common = &b->common; + anode->record_implemented = + (unsigned long *)b->error_record_implemented; + anode->status_reporting = + (unsigned long *)b->error_status_reporting; + anode->addressing_mode = + (unsigned long *)b->addressing_mode; + break; + } + default: + pr_err("node %pOF: unsupported group-format %u\n", np, gfmt); + kfree(hdr); + return -EINVAL; + } + + if (!of_address_to_resource(np, of_property_match_string( + np, "reg-names", "fault-inject"), &named_res)) + common->fault_inject_register_base = named_res.start; + + if (!of_address_to_resource(np, of_property_match_string( + np, "reg-names", "err-group"), &named_res)) + common->error_group_register_base = named_res.start; + + if (!of_address_to_resource(np, of_property_match_string( + np, "reg-names", "irq-config"), &named_res)) + common->interrupt_config_register_base = named_res.start; + + anode->interface_hdr = hdr; + anode->common = common; + + return 0; +} + +static int dt_aest_build_interrupt(struct device_node *np, + struct acpi_aest_node *anode) +{ 
+ struct acpi_aest_node_interrupt_v2 *irq_arr; + int fhi_irq, eri_irq, count = 0; + u32 fhi_flags = 0, eri_flags = 0; + + of_property_read_u32(np, "arm,fhi-flags", &fhi_flags); + of_property_read_u32(np, "arm,eri-flags", &eri_flags); + + fhi_irq = of_irq_get_byname(np, "fhi"); + if (fhi_irq == -EPROBE_DEFER) + return -EPROBE_DEFER; + if (fhi_irq < 0 && fhi_irq != -EINVAL) { + const char *name = NULL; + + of_property_read_string(np, "interrupt-names", &name); + + pr_warn("node %pOF: failed to map FHI IRQ: %d (interrupt-names[0]=\"%s\", want \"%s\")\n", + np, fhi_irq, name ?: "", "fhi"); + } + eri_irq = of_irq_get_byname(np, "eri"); + if (eri_irq == -EPROBE_DEFER) + return -EPROBE_DEFER; + if (eri_irq < 0 && eri_irq != -EINVAL) { + const char *name = NULL; + + of_property_read_string_index(np, "interrupt-names", 1, &name); + + pr_warn("node %pOF: failed to map ERI IRQ: %d (interrupt-names[1]=\"%s\", want \"%s\")\n", + np, eri_irq, name ?: "", "eri"); + } + + if (fhi_irq > 0) + count++; + if (eri_irq > 0) + count++; + + if (!count) { + anode->interrupt = NULL; + anode->interrupt_count = 0; + return 0; + } + + irq_arr = kcalloc(count, sizeof(*irq_arr), GFP_KERNEL); + if (!irq_arr) + return -ENOMEM; + + count = 0; + if (fhi_irq > 0) { + irq_arr[count].gsiv = fhi_irq; + irq_arr[count].flags = AEST_INTERRUPT_MODE | fhi_flags; + irq_arr[count].type = ACPI_AEST_NODE_FAULT_HANDLING; + count++; + } + if (eri_irq > 0) { + irq_arr[count].gsiv = eri_irq; + irq_arr[count].flags = eri_flags; + irq_arr[count].type = ACPI_AEST_NODE_ERROR_RECOVERY; + count++; + } + + anode->interrupt = irq_arr; + anode->interrupt_count = count; + return 0; +} + +static int dt_aest_build_node_specific(struct device_node *np, + struct acpi_aest_node *anode, + int node_type) +{ + switch (node_type) { + + case ACPI_AEST_PROCESSOR_ERROR_NODE: { + struct acpi_aest_processor *proc; + u32 rtype = 0, pflags = 0; + + proc = kzalloc_obj(*proc, GFP_KERNEL); + if (!proc) + return -ENOMEM; + + 
of_property_read_u32(np, "arm,resource-type", &rtype); + of_property_read_u32(np, "arm,processor-flags", &pflags); + + proc->resource_type = (u8)rtype; + proc->flags = (u8)pflags; + + /* Processor cache/TLB/generic sub-structure */ + switch (rtype) { + case ACPI_AEST_CACHE_RESOURCE: { + struct acpi_aest_processor_cache *c; + struct device_node *cache_np; + + c = kzalloc_obj(*c, GFP_KERNEL); + if (!c) { + kfree(proc); + return -ENOMEM; + } + + cache_np = of_parse_phandle(np, "arm,cache-ref", 0); + if (cache_np) { + c->cache_reference = cache_np->phandle; + of_node_put(cache_np); + } + anode->cache = c; + break; + } + case ACPI_AEST_TLB_RESOURCE: { + struct acpi_aest_processor_tlb *t; + + t = kzalloc_obj(*t, GFP_KERNEL); + if (!t) { + kfree(proc); + return -ENOMEM; + } + of_property_read_u32(np, "arm,tlb-level", + &t->tlb_level); + anode->tlb = t; + break; + } + default: { + struct acpi_aest_processor_generic *g; + + g = kzalloc_obj(*g, GFP_KERNEL); + if (!g) { + kfree(proc); + return -ENOMEM; + } + of_property_read_u32(np, "arm,resource-ref", + &g->resource); + anode->generic = g; + break; + } + } + anode->processor = proc; + break; + } + + case ACPI_AEST_MEMORY_ERROR_NODE: { + struct acpi_aest_memory *mem; + + mem = kzalloc_obj(*mem, GFP_KERNEL); + + if (!mem) + return -ENOMEM; + of_property_read_u32(np, "arm,proximity-domain", + &mem->srat_proximity_domain); + anode->memory = mem; + break; + } + + case ACPI_AEST_SMMU_ERROR_NODE: { + struct acpi_aest_smmu *smmu; + struct device_node *smmu_np; + + smmu = kzalloc_obj(*smmu, GFP_KERNEL); + + if (!smmu) + return -ENOMEM; + smmu_np = of_parse_phandle(np, "arm,smmu-ref", 0); + if (smmu_np) { + /* Use the DT node offset as the IORT reference */ + smmu->iort_node_reference = smmu_np->phandle; + of_node_put(smmu_np); + } + of_property_read_u32(np, "arm,smmu-subcomponent", + &smmu->subcomponent_reference); + anode->smmu = smmu; + break; + } + + case ACPI_AEST_VENDOR_ERROR_NODE: { + struct acpi_aest_vendor_v2 *vendor; + const 
char *hid = "ARMHC000";
+
+		vendor = kzalloc_obj(*vendor, GFP_KERNEL);
+
+		if (!vendor)
+			return -ENOMEM;
+		of_property_read_string(np, "arm,vendor-hid", &hid);
+		strscpy(vendor->acpi_hid, hid, sizeof(vendor->acpi_hid));
+		of_property_read_u32(np, "arm,vendor-uid",
+				     &vendor->acpi_uid);
+		anode->vendor = vendor;
+		break;
+	}
+
+	case ACPI_AEST_GIC_ERROR_NODE: {
+		struct acpi_aest_gic *gic;
+
+		gic = kzalloc_obj(*gic, GFP_KERNEL);
+
+		if (!gic)
+			return -ENOMEM;
+		of_property_read_u32(np, "arm,gic-type",
+				     &gic->interface_type);
+		of_property_read_u32(np, "arm,gic-instance",
+				     &gic->instance_id);
+		anode->gic = gic;
+		break;
+	}
+
+	case ACPI_AEST_PCIE_ERROR_NODE: {
+		struct acpi_aest_pcie *pcie;
+
+		pcie = kzalloc_obj(*pcie, GFP_KERNEL);
+
+		if (!pcie)
+			return -ENOMEM;
+		of_property_read_u32(np, "arm,pcie-segment",
+				     &pcie->iort_node_reference);
+		anode->pcie = pcie;
+		break;
+	}
+
+	case ACPI_AEST_PROXY_ERROR_NODE:
+		/* No node-specific data for proxy nodes */
+		anode->spec_pointer = NULL;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Release an AEST node together with the sub-allocations that the node
+ * builders in this file hang off it: the interface header, the node-specific
+ * data and the processor resource data. kfree(NULL) is a no-op, so a
+ * partially built node may be passed in.
+ *
+ * NOTE(review): anode->interrupt is deliberately left alone; it is presumably
+ * set up by dt_aest_build_interrupt(), whose allocation scheme is not visible
+ * from here. Confirm its ownership before freeing it in this helper.
+ */
+static void dt_aest_free_anode(struct acpi_aest_node *anode)
+{
+	if (!anode)
+		return;
+
+	kfree(anode->interface_hdr);
+	kfree(anode->spec_pointer);
+	kfree(anode->processor_spec_pointer);
+	kfree(anode);
+}
+
+/*
+ * Allocate one AEST node and populate it from the device-tree node @np.
+ *
+ * The interface, node-specific and interrupt descriptions are built in that
+ * order; the first failure aborts the sequence and releases everything that
+ * was attached so far.
+ *
+ * Returns the new node, or an ERR_PTR() carrying the error of the step that
+ * failed (-ENOMEM if the node itself could not be allocated).
+ */
+static struct acpi_aest_node *
+dt_aest_alloc_anode(struct device_node *np, int node_type)
+{
+	struct acpi_aest_node *anode;
+	int ret;
+
+	anode = kzalloc_obj(*anode, GFP_KERNEL);
+	if (!anode)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&anode->list);
+	anode->type = node_type;
+
+	ret = dt_aest_build_interface(np, anode);
+	if (!ret)
+		ret = dt_aest_build_node_specific(np, anode, node_type);
+	if (!ret)
+		ret = dt_aest_build_interrupt(np, anode);
+	if (ret) {
+		dt_aest_free_anode(anode);
+		return ERR_PTR(ret);
+	}
+
+	return anode;
+}
+
+/*
+ * Turn one child of the "arm,aest" node into an hnode holding a single AEST
+ * node and store it in priv->aest_array under priv->node_id.
+ *
+ * Children whose type cannot be determined are skipped with a warning and
+ * are not treated as an error. priv->node_id is advanced only after the
+ * hnode has been stored successfully.
+ *
+ * Returns 0 on success or skip, a negative errno otherwise.
+ */
+static int dt_aest_init_one_node(struct device_node *np,
+				 struct dt_aest_priv *priv)
+{
+	struct acpi_aest_node *anode;
+	struct aest_hnode *ahnode;
+	int node_type, ret;
+
+	node_type = dt_aest_node_type(np);
+	if (node_type < 0) {
+		pr_warn("unknown node type for %pOF, skipping\n", np);
+		return 0;
+	}
+
+	ahnode = dt_aest_alloc_hnode(node_type, priv->node_id);
+	if (!ahnode)
+		return -ENOMEM;
+
+	anode = dt_aest_alloc_anode(np, node_type);
+	if (IS_ERR(anode)) {
+		kfree(ahnode);
+		return PTR_ERR(anode);
+	}
+
+	list_add_tail(&anode->list, &ahnode->list);
+	ahnode->count = 1;
+
+	ret = xa_err(xa_store(&priv->aest_array, priv->node_id,
+			      ahnode, GFP_KERNEL));
+	if (ret) {
+		/*
+		 * The node is fully built at this point, so its
+		 * sub-allocations must go as well, not just the node itself.
+		 */
+		dt_aest_free_anode(anode);
+		kfree(ahnode);
+		return ret;
+	}
+	priv->node_id++;
+	return 0;
+}
+
+/*
+ * Walk every available child of @aest_root and register it with @priv.
+ *
+ * Stops at the first child that fails. The iterator holds a reference on the
+ * current child, so it is dropped explicitly before leaving the loop early.
+ */
+static int dt_aest_init_nodes(struct device_node *aest_root,
+			      struct dt_aest_priv *priv)
+{
+	struct device_node *np;
+	int ret;
+
+	for_each_available_child_of_node(aest_root, np) {
+		ret = dt_aest_init_one_node(np, priv);
+		if (ret) {
+			pr_err("failed to init node %pOF: %d\n", np, ret);
+			of_node_put(np);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Free every hnode stored in priv->aest_array, the AEST nodes linked on it,
+ * and the xarray's own bookkeeping.
+ *
+ * Only valid while no platform device has been handed an hnode pointer yet,
+ * i.e. before dt_aest_alloc_pdevs() has run.
+ */
+static void dt_aest_free_nodes(struct dt_aest_priv *priv)
+{
+	struct acpi_aest_node *anode, *tmp;
+	struct aest_hnode *ahnode;
+	unsigned long i;
+
+	xa_for_each(&priv->aest_array, i, ahnode) {
+		list_for_each_entry_safe(anode, tmp, &ahnode->list, list) {
+			list_del(&anode->list);
+			dt_aest_free_anode(anode);
+		}
+		kfree(ahnode);
+	}
+	xa_destroy(&priv->aest_array);
+}
+
+/* Size in bytes of the register window for an interface group format. */
+static int dt_aest_group_size(int group_format)
+{
+	switch (group_format) {
+	case ACPI_AEST_NODE_GROUP_FORMAT_16K:
+		return 16 * KB;
+	case ACPI_AEST_NODE_GROUP_FORMAT_64K:
+		return 64 * KB;
+	case ACPI_AEST_NODE_GROUP_FORMAT_4K:
+	default:
+		/* Unknown formats fall back to the 4K window. */
+		return 4 * KB;
+	}
+}
+
+/*
+ * Create and register the "AEST" platform device number @index for @ahnode.
+ *
+ * Resources attached to the device:
+ *  - one IORESOURCE_MEM window (named AEST_NODE_NAME) per node whose
+ *    interface is not of the system-register type;
+ *  - one IORESOURCE_IRQ (named AEST_FHI_NAME or AEST_ERI_NAME) per
+ *    interrupt, skipping an entry that repeats the virq last recorded for
+ *    its interrupt type.
+ *
+ * The hnode pointer itself is copied into the device's platform data.
+ *
+ * Side effect: intr->gsiv of every usable interrupt entry is rewritten from
+ * the Linux virq to the hardware IRQ number (see the comment in the loop).
+ *
+ * Returns the registered device or an ERR_PTR().
+ */
+static struct platform_device *dt_aest_alloc_pdev(struct aest_hnode *ahnode,
+						  int index)
+{
+	struct platform_device *pdev;
+	struct resource *res;
+	struct acpi_aest_node *anode;
+	int ret, j = 0;
+	int irq[AEST_MAX_INTERRUPT_PER_NODE] = { 0 };
+
+	pdev = platform_device_alloc("AEST", index);
+	if (!pdev)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Each node contributes at most one MMIO window plus
+	 * AEST_MAX_INTERRUPT_PER_NODE interrupts, so size the array for that
+	 * worst case; for a single-node hnode this is the same amount as
+	 * count + AEST_MAX_INTERRUPT_PER_NODE.
+	 */
+	res = kcalloc(ahnode->count * (1 + AEST_MAX_INTERRUPT_PER_NODE),
+		      sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	list_for_each_entry(anode, &ahnode->list, list) {
+		int k;
+
+		if (anode->interface_hdr->type !=
+		    ACPI_AEST_NODE_SYSTEM_REGISTER) {
+			res[j].name = AEST_NODE_NAME;
+			res[j].start = anode->interface_hdr->address;
+			res[j].end = res[j].start +
+				dt_aest_group_size(anode->interface_hdr->group_format) - 1;
+			res[j].flags = IORESOURCE_MEM;
+			j++;
+		}
+
+		if (!anode->interrupt)
+			continue;
+
+		for (k = 0; k < anode->interrupt_count &&
+			    k < AEST_MAX_INTERRUPT_PER_NODE; k++) {
+			struct acpi_aest_node_interrupt_v2 *intr =
+				&anode->interrupt[k];
+			int itype = intr->type;
+			int virq = intr->gsiv;
+			struct irq_data *irqd;
+
+			if (!virq || itype >= AEST_MAX_INTERRUPT_PER_NODE)
+				continue;
+
+			/*
+			 * aest_config_irq() writes intr->gsiv directly
+			 * to the hardware IRQ-config register, so it
+			 * must hold the GIC hardware SPI number, not the
+			 * Linux virtual IRQ. Convert here now that we
+			 * have the virq in hand; the resource still gets
+			 * the virq so devm_request_irq() works correctly.
+			 *
+			 * This is done before the duplicate check so that
+			 * an entry which does not get a resource of its own
+			 * is converted as well.
+			 */
+			irqd = irq_get_irq_data(virq);
+			if (irqd)
+				intr->gsiv = irqd->hwirq;
+
+			/* Same virq as last recorded for this type: no new resource. */
+			if (irq[itype] == virq)
+				continue;
+			irq[itype] = virq;
+
+			res[j].name = (itype == ACPI_AEST_NODE_FAULT_HANDLING)
+				? AEST_FHI_NAME : AEST_ERI_NAME;
+			res[j].start = virq;
+			res[j].end = virq;
+			res[j].flags = IORESOURCE_IRQ;
+			j++;
+		}
+	}
+
+	/* The platform core keeps its own copy, so ours can go either way. */
+	ret = platform_device_add_resources(pdev, res, j);
+	kfree(res);
+	if (ret)
+		goto err_put;
+
+	/* Copies the pointer value, not the hnode it points to. */
+	ret = platform_device_add_data(pdev, &ahnode, sizeof(ahnode));
+	if (ret)
+		goto err_put;
+
+	ret = platform_device_add(pdev);
+	if (ret)
+		goto err_put;
+
+	return pdev;
+
+err_put:
+	platform_device_put(pdev);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Register one platform device per hnode stored in priv->aest_array.
+ *
+ * Device ids are handed out sequentially starting at 0. Stops at the first
+ * failure and returns its error; devices registered before that stay
+ * registered.
+ */
+static int dt_aest_alloc_pdevs(struct dt_aest_priv *priv)
+{
+	struct aest_hnode *ahnode;
+	unsigned long i;
+	int ret = 0, index = 0;
+
+	xa_for_each(&priv->aest_array, i, ahnode) {
+		struct platform_device *pdev =
+			dt_aest_alloc_pdev(ahnode, index++);
+		if (IS_ERR(pdev)) {
+			ret = PTR_ERR(pdev);
+			pr_err("failed to alloc pdev for node %u: %d\n",
+			       ahnode->id, ret);
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Device-tree entry point for AEST: does nothing when ACPI is in use or when
+ * no "arm,aest" node exists; otherwise parses every child node and registers
+ * the matching platform devices.
+ *
+ * The xarray lives on this function's stack and is only a staging index, so
+ * it is torn down before returning on every path.
+ */
+static int __init dt_aest_init(void)
+{
+	struct device_node *aest_root;
+	struct dt_aest_priv priv = {};
+	int ret;
+
+	if (!acpi_disabled)
+		return 0;
+
+	aest_root = of_find_compatible_node(NULL, NULL, "arm,aest");
+	if (!aest_root)
+		return 0;
+
+	xa_init(&priv.aest_array);
+
+	ret = dt_aest_init_nodes(aest_root, &priv);
+	of_node_put(aest_root);
+	if (ret) {
+		pr_err("failed to init AEST nodes: %d\n", ret);
+		/* Nothing has been published yet: drop all parsed nodes. */
+		dt_aest_free_nodes(&priv);
+		return ret;
+	}
+
+	ret = dt_aest_alloc_pdevs(&priv);
+	/*
+	 * xa_destroy() frees only the xarray's internal nodes. The hnodes
+	 * stay allocated because the registered platform devices carry
+	 * pointers to them in their platform data.
+	 *
+	 * NOTE(review): when dt_aest_alloc_pdevs() fails part-way, the hnodes
+	 * that never got a device are not reclaimed here.
+	 */
+	xa_destroy(&priv.aest_array);
+	if (ret) {
+		pr_err("failed to alloc AEST pdevs: %d\n", ret);
+		return ret;
+	}
+
+	pr_info("registered %u AEST error source(s) from DT\n", priv.node_id);
+
+	return 0;
+}
+subsys_initcall_sync(dt_aest_init); From 4db0cd878e351c86ba5d334360202c096f261d47 Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:51 +0530 Subject: [PATCH 23/24] FROMLIST: arm64: dts: qcom: lemans: add AEST error nodes Add AEST RAS error source nodes for the Lemans SoC. The DT describes a processor error source covering all CPU cores and a shared L3 cache error source for the cluster. These nodes model the hardware error reporting blocks and associated interrupts as required by the Arm AEST specification. Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-7-d5d6ffacf0a5@oss.qualcomm.com/ Co-developed-by: Faruque Ansari Signed-off-by: Faruque Ansari Signed-off-by: Umang Chheda --- arch/arm64/boot/dts/qcom/lemans.dtsi | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/lemans.dtsi b/arch/arm64/boot/dts/qcom/lemans.dtsi index fe6e763518230..199ea1f9a8d58 100644 --- a/arch/arm64/boot/dts/qcom/lemans.dtsi +++ b/arch/arm64/boot/dts/qcom/lemans.dtsi @@ -4,6 +4,7 @@ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/ +#include #include #include #include @@ -29,6 +30,46 @@ #address-cells = <2>; #size-cells = <2>; + aest { + compatible = "arm,aest"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + + aest-processor-0 { + compatible = "arm,aest-processor"; + arm,num-records = <1>; + arm,record-impl = /bits/ 64 <0x0>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + + aest-l3-cluster0 { + compatible = "arm,aest-processor"; + arm,num-records = <2>; + arm,record-impl = /bits/ 64 <0x1>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + + aest-l3-cluster1 { + compatible = "arm,aest-processor"; + arm,num-records = <2>; + arm,record-impl = /bits/ 64 <0x1>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + }; + clocks { xo_board_clk: xo-board-clk { compatible = "fixed-clock"; From fa9ef2c33e58fe027b95c40bc8eadc0a1b6678da Mon Sep 17 00:00:00 2001 From: Umang Chheda Date: Tue, 5 May 2026 17:53:52 +0530 Subject: [PATCH 24/24] FROMLIST: arm64: dts: qcom: monaco: add AEST error nodes Add AEST RAS error source nodes for the Monaco SoC. The DT describes a processor error source covering all CPU cores and a shared L3 cache error source for the cluster. These nodes model the hardware error reporting blocks and associated interrupts as required by the Arm AEST specification. 
Link: https://lore.kernel.org/lkml/20260505-aest-devicetree-support-v1-8-d5d6ffacf0a5@oss.qualcomm.com/ Co-developed-by: Faruque Ansari Signed-off-by: Faruque Ansari Signed-off-by: Umang Chheda --- arch/arm64/boot/dts/qcom/monaco.dtsi | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/monaco.dtsi b/arch/arm64/boot/dts/qcom/monaco.dtsi index 7b1d57460f1e6..8e43ceed7d84a 100644 --- a/arch/arm64/boot/dts/qcom/monaco.dtsi +++ b/arch/arm64/boot/dts/qcom/monaco.dtsi @@ -3,6 +3,7 @@ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */ +#include #include #include #include @@ -29,6 +30,46 @@ #address-cells = <2>; #size-cells = <2>; + aest { + compatible = "arm,aest"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + + aest-processor-0 { + compatible = "arm,aest-processor"; + arm,num-records = <1>; + arm,record-impl = /bits/ 64 <0x0>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + + aest-l3-cluster0 { + compatible = "arm,aest-processor"; + arm,num-records = <2>; + arm,record-impl = /bits/ 64 <0x1>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + + aest-l3-cluster1 { + compatible = "arm,aest-processor"; + arm,num-records = <2>; + arm,record-impl = /bits/ 64 <0x1>; + arm,status-reporting = /bits/ 64 <0x0>; + arm,addressing-mode = /bits/ 64 <0x0>; + arm,processor-flags = ; + interrupts = ; + interrupt-names = "fhi"; + }; + }; + clocks { xo_board_clk: xo-board-clk { compatible = "fixed-clock";