Add the "inject-error" command that can be used to inject CXL protocol errors into CXL downstream ports and poison into memory devices. The available error types can be found by using 'cxl-list' with the "-N"/"--injectable-errors" option.
The full list of supported device and error types can be found in the command's documentation. Signed-off-by: Ben Cheatham <benjamin.cheat...@amd.com> --- Documentation/cxl/cxl-inject-error.txt | 139 ++++++++++++++++ Documentation/cxl/meson.build | 1 + cxl/builtin.h | 1 + cxl/cxl.c | 1 + cxl/inject-error.c | 211 +++++++++++++++++++++++++ cxl/meson.build | 1 + 6 files changed, 354 insertions(+) create mode 100644 Documentation/cxl/cxl-inject-error.txt create mode 100644 cxl/inject-error.c diff --git a/Documentation/cxl/cxl-inject-error.txt b/Documentation/cxl/cxl-inject-error.txt new file mode 100644 index 0000000..50b25fe --- /dev/null +++ b/Documentation/cxl/cxl-inject-error.txt @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 + +cxl-inject-error(1) +=================== + +NAME +---- +cxl-inject-error - Inject CXL errors into CXL devices + +SYNOPSIS +-------- +[verse] +'cxl inject-error' <device name> [<options>] + +Inject an error into a CXL device. The types of errors supported depend on the +device specified. The types of devices supported are: + +"Downstream Ports":: A CXL RCH downstream port (dport) or a CXL VH root port. +Eligible CXL 2.0+ ports are dports of ports at depth 1 in the output of cxl-list. +Dports are specified by host name ("0000:0e:01.1"). +"memdevs":: A CXL memory device. Memory devices are specified by device name +("mem0"), device id ("0"), and/or host device name ("0000:35:00.0"). + +There are two types of errors which can be injected: CXL protocol errors +and device poison. + +CXL protocol errors can only be used with downstream ports (as defined above). +Protocol errors follow the format of "<protocol>-<severity>". For example, +a "mem-fatal" error is a CXL.mem fatal protocol error. Protocol errors can be +found with the '-N' option of 'cxl-list' under a CXL bus object. 
For example: + +---- + +# cxl list -NB +[ + { + "bus":"root0", + "provider":"ACPI.CXL", + "injectable_protocol_errors":[ + "mem-correctable", + "mem-fatal" + ] + } +] + +---- + +CXL protocol (CXL.cache/mem) error injection requires the platform to support +ACPI v6.5+ error injection (EINJ). In addition to platform support, the +CONFIG_ACPI_APEI_EINJ and CONFIG_ACPI_APEI_EINJ_CXL kernel configuration options +will need to be enabled. For more information, view the Linux kernel documentation +on EINJ. + +Device poison can only be used with CXL memory devices. A device physical address +(DPA) is required to do poison injection. DPAs range from 0 to the size of the +device's memory, which can be found using 'cxl-list'. An example injection: + +---- + +# cxl inject-error mem0 -t poison -a 0x1000 +poison injected at mem0:0x1000 +# cxl list -m mem0 -u --media-errors +{ + "memdev":"mem0", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "host":"0000:0d:00.0", + "firmware_version":"BWFW VERSION 00", + "media_errors":[ + { + "offset":"0x1000", + "length":64, + "source":"Injected" + } + ] +} + +---- + +Not all devices support poison injection. To see if a device supports poison injection +through debugfs, use 'cxl-list' with the '-N' option and look for the "poison_injectable" +attribute under the device. Example: + +---- + +# cxl list -Nu -m mem0 +{ + "memdev":"mem0", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "host":"0000:0d:00.0", + "firmware_version":"BWFW VERSION 00", + "poison_injectable":true +} + +---- + +This command depends on the kernel debug filesystem (debugfs) to do CXL protocol +error and device poison injection. If your kernel debugfs is not mounted at +the normal spot (/sys/kernel/debug) you will need to provide the path for it +using the '--debugfs' option. + + +OPTIONS +------- +-a:: +--address:: + Device physical address (DPA) to use for poison injection. Address can + be specified in hex or decimal. Required for poison injection. 
+ +-t:: +--type:: + Type of error to inject into <device name>. The type of error is restricted + by device type. The following shows the possible types under their associated + device type(s): +---- + +Downstream Ports: :: + cache-correctable, cache-uncorrectable, cache-fatal, mem-correctable, + mem-fatal + +Memdevs: :: + poison + +---- + +--clear:: + Clear poison previously injected into a device. + +--debug:: + Enable debug output + +--debugfs:: + The mount point of the Linux kernel debug filesystem (debugfs). Defaults + to "/sys/kernel/debug" if left unspecified. + +SEE ALSO +-------- +linkcxl:cxl-list[1] diff --git a/Documentation/cxl/meson.build b/Documentation/cxl/meson.build index 8085c1c..1502d25 100644 --- a/Documentation/cxl/meson.build +++ b/Documentation/cxl/meson.build @@ -50,6 +50,7 @@ cxl_manpages = [ 'cxl-update-firmware.txt', 'cxl-set-alert-config.txt', 'cxl-wait-sanitize.txt', + 'cxl-inject-error.txt', ] foreach man : cxl_manpages diff --git a/cxl/builtin.h b/cxl/builtin.h index c483f30..e82fcb5 100644 --- a/cxl/builtin.h +++ b/cxl/builtin.h @@ -25,6 +25,7 @@ int cmd_create_region(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_enable_region(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_disable_region(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_destroy_region(int argc, const char **argv, struct cxl_ctx *ctx); +int cmd_inject_error(int argc, const char **argv, struct cxl_ctx *ctx); #ifdef ENABLE_LIBTRACEFS int cmd_monitor(int argc, const char **argv, struct cxl_ctx *ctx); #else diff --git a/cxl/cxl.c b/cxl/cxl.c index 1643667..a98bd6b 100644 --- a/cxl/cxl.c +++ b/cxl/cxl.c @@ -80,6 +80,7 @@ static struct cmd_struct commands[] = { { "disable-region", .c_fn = cmd_disable_region }, { "destroy-region", .c_fn = cmd_destroy_region }, { "monitor", .c_fn = cmd_monitor }, + { "inject-error", .c_fn = cmd_inject_error }, }; int main(int argc, const char **argv) diff --git a/cxl/inject-error.c b/cxl/inject-error.c new 
file mode 100644
index 0000000..907bfc2
--- /dev/null
+++ b/cxl/inject-error.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025 AMD. All rights reserved. */
+#include <util/parse-options.h>
+#include <cxl/libcxl.h>
+#include <cxl/filter.h>
+#include <util/log.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <limits.h>
+
+/* Value of --debugfs; stays NULL when the option is not given */
+static const char *debugfs;
+/* Set by --debug, which is only registered when ENABLE_DEBUG is defined */
+static bool debug;
+
+/* Raw option values filled in by parse_options() */
+static struct inject_params {
+	const char *type;
+	const char *address;
+	bool clear;
+} param;
+
+static const struct option inject_options[] = {
+	OPT_STRING('t', "type", &param.type, "Error type",
+		   "Error type to inject into <device>"),
+	OPT_STRING('a', "address", &param.address, "Address for poison injection",
+		   "Device physical address for poison injection in hex or decimal"),
+	OPT_BOOLEAN(0, "clear", &param.clear, "Clear poison instead of inject"),
+	OPT_STRING(0, "debugfs", &debugfs, "debugfs mount point",
+		   "Mount point for debug file system, defaults to /sys/kernel/debug"),
+#ifdef ENABLE_DEBUG
+	OPT_BOOLEAN(0, "debug", &debug, "turn on debug output"),
+#endif
+	OPT_END(),
+};
+
+static struct log_ctx iel;
+
+/*
+ * Look up the protocol error whose string form equals @type.
+ * Logs an error and returns NULL when no such error type is found.
+ */
+static struct cxl_protocol_error *find_cxl_proto_err(struct cxl_ctx *ctx,
+						     const char *type)
+{
+	struct cxl_protocol_error *perror;
+
+	cxl_protocol_error_foreach(ctx, perror) {
+		if (strcmp(type, cxl_protocol_error_get_str(perror)) == 0)
+			return perror;
+	}
+
+	log_err(&iel, "Invalid CXL protocol error type: %s\n", type);
+	return NULL;
+}
+
+/*
+ * Walk each bus with cxl_port_foreach_all() looking for a downstream port
+ * whose device name equals @devname. Logs an error and returns NULL if none.
+ */
+static struct cxl_dport *find_cxl_dport(struct cxl_ctx *ctx, const char *devname)
+{
+	struct cxl_port *port, *top;
+	struct cxl_dport *dport;
+	struct cxl_bus *bus;
+
+	cxl_bus_foreach(ctx, bus) {
+		top = cxl_bus_get_port(bus);
+
+		cxl_port_foreach_all(top, port) {
+			cxl_dport_foreach(port, dport) {
+				if (!strcmp(devname,
+					    cxl_dport_get_devname(dport)))
+					return dport;
+			}
+		}
+	}
+
+	log_err(&iel, "Downstream port \"%s\" not found\n", devname);
+	return NULL;
+}
+
+/*
+ * Return the first memdev accepted by util_cxl_memdev_filter() for @filter.
+ * Logs an error and returns NULL when no memdev matches.
+ */
+static struct cxl_memdev *find_cxl_memdev(struct cxl_ctx *ctx, const char *filter)
+{
+	struct cxl_memdev *memdev;
+
+	cxl_memdev_foreach(ctx, memdev) {
+		if (util_cxl_memdev_filter(memdev, filter, NULL))
+			return memdev;
+	}
+
+	log_err(&iel, "Memdev \"%s\" not found\n", filter);
+	return NULL;
+}
+
+/*
+ * Inject protocol error @perror into the downstream port named @devname.
+ * Returns 0 on success, -EINVAL without a device name, -ENODEV when the port
+ * is not found, or the result of cxl_dport_protocol_error_inject() otherwise.
+ */
+static int inject_proto_err(struct cxl_ctx *ctx, const char *devname,
+			    struct cxl_protocol_error *perror)
+{
+	struct cxl_dport *dport;
+	int rc;
+
+	if (!devname) {
+		log_err(&iel, "No downstream port specified for injection\n");
+		return -EINVAL;
+	}
+
+	dport = find_cxl_dport(ctx, devname);
+	if (!dport)
+		return -ENODEV;
+
+	rc = cxl_dport_protocol_error_inject(dport,
+					     cxl_protocol_error_get_num(perror));
+	if (rc)
+		return rc;
+
+	printf("injected %s protocol error.\n",
+	       cxl_protocol_error_get_str(perror));
+	return 0;
+}
+
+/*
+ * Inject poison into, or with @clear set clear poison from, the memdev
+ * selected by @filter at the device physical address string @addr, which
+ * must be one complete number in decimal, hex ("0x") or octal ("0") form.
+ * Returns 0, -ENODEV/-EINVAL on bad input, or the libcxl call's result.
+ */
+static int inject_poison(struct cxl_ctx *ctx, const char *filter,
+			 const char *addr, bool clear)
+{
+	struct cxl_memdev *memdev;
+	unsigned long long dpa;
+	char *end;
+	int rc;
+
+	memdev = find_cxl_memdev(ctx, filter);
+	if (!memdev)
+		return -ENODEV;
+
+	if (!cxl_memdev_has_poison_injection(memdev)) {
+		log_err(&iel, "%s does not support error injection\n",
+			cxl_memdev_get_devname(memdev));
+		return -EINVAL;
+	}
+
+	if (!addr) {
+		log_err(&iel, "no address provided\n");
+		return -EINVAL;
+	}
+
+	/* strtoull() only sets errno on failure, so clear it first */
+	errno = 0;
+	dpa = strtoull(addr, &end, 0);
+	if (errno || end == addr || *end != '\0') {
+		log_err(&iel, "invalid address %s: %s\n", addr,
+			strerror(errno ? errno : EINVAL));
+		return -EINVAL;
+	}
+
+	if (clear)
+		rc = cxl_memdev_clear_poison(memdev, dpa);
+	else
+		rc = cxl_memdev_inject_poison(memdev, dpa);
+
+	if (rc) {
+		log_err(&iel, "failed to %s %s:%s: %s\n",
+			clear ? "clear poison at" : "inject poison at",
+			cxl_memdev_get_devname(memdev), addr, strerror(-rc));
+	} else {
+		printf("poison %s at %s:%s\n", clear ? "cleared" : "injected",
+		       cxl_memdev_get_devname(memdev), addr);
+	}
+
+	return rc;
+}
+
+/*
+ * Parse the command line and dispatch on --type: "poison" goes to
+ * inject_poison(), anything else is looked up as a CXL protocol error and
+ * injected into the downstream port named by the single positional argument.
+ * Returns 0 on success and a non-zero error code otherwise.
+ */
+static int inject_action(int argc, const char **argv, struct cxl_ctx *ctx,
+			 const struct option *options, const char *usage)
+{
+	struct cxl_protocol_error *perr;
+	const char * const u[] = {
+		usage,
+		NULL
+	};
+	int rc = -EINVAL;
+
+	log_init(&iel, "cxl inject-error", "CXL_INJECT_LOG");
+	argc = parse_options(argc, argv, options, u, 0);
+
+	if (debug) {
+		cxl_set_log_priority(ctx, LOG_DEBUG);
+		iel.log_priority = LOG_DEBUG;
+	} else {
+		iel.log_priority = LOG_INFO;
+	}
+
+	if (debugfs)
+		cxl_set_debugfs(ctx, debugfs);
+
+	if (argc != 1) {
+		usage_with_options(u, options);
+		return rc;
+	}
+
+	/* --type is optional to the parser, so it may still be NULL here */
+	if (!param.type) {
+		log_err(&iel, "No error type specified\n");
+		usage_with_options(u, options);
+		return rc;
+	}
+
+	if (strcmp(param.type, "poison") == 0) {
+		rc = inject_poison(ctx, argv[0], param.address, param.clear);
+		if (rc)
+			log_err(&iel, "Failed to %s poison on %s: %s\n",
+				param.clear ? "clear" : "inject",
+				argv[0], strerror(-rc));
+
+		return rc;
+	}
+
+	/* find_cxl_proto_err() has already reported an unknown type */
+	perr = find_cxl_proto_err(ctx, param.type);
+	if (!perr)
+		return rc;
+
+	rc = inject_proto_err(ctx, argv[0], perr);
+	if (rc)
+		log_err(&iel, "Failed to inject error: %d\n", rc);
+
+	return rc;
+}
+
+/* Entry point for 'cxl inject-error'; maps the result to an exit status */
+int cmd_inject_error(int argc, const char **argv, struct cxl_ctx *ctx)
+{
+	int rc = inject_action(argc, argv, ctx, inject_options,
+			       "inject-error <device> [<options>]");
+
+	return rc ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/cxl/meson.build b/cxl/meson.build
index e4d1683..29918e4 100644
--- a/cxl/meson.build
+++ b/cxl/meson.build
@@ -7,6 +7,7 @@ cxl_src = [
   'memdev.c',
   'json.c',
   'filter.c',
+  'inject-error.c',
   '../daxctl/json.c',
   '../daxctl/filter.c',
 ]
-- 
2.34.1