Exercise the full driver path on real Grace and Vera hardware: inject CPER sections through ACPI EINJ, then validate the decoded kernel log output. KUnit covers the parser in isolation; this test covers the path from firmware notification through GUID dispatch to decoded output. The test reports SKIP when the EINJ prerequisites or a bound NVIDIA GHES device are missing.
Signed-off-by: Kai-Heng Feng <[email protected]>
---
 tools/testing/selftests/firmware/Makefile    |   4 +-
 tools/testing/selftests/firmware/config      |   5 +
 tools/testing/selftests/firmware/einj_lib.sh | 223 ++++++++++++++++++
 .../selftests/firmware/ghes_nvidia_einj.sh   | 171 ++++++++++++++
 .../firmware/ghes_nvidia_einj_profiles.sh    |  53 +++++
 5 files changed, 454 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/firmware/einj_lib.sh
 create mode 100755 tools/testing/selftests/firmware/ghes_nvidia_einj.sh
 create mode 100755 tools/testing/selftests/firmware/ghes_nvidia_einj_profiles.sh

diff --git a/tools/testing/selftests/firmware/Makefile b/tools/testing/selftests/firmware/Makefile
index 7992969deaa2..b753dd123860 100644
--- a/tools/testing/selftests/firmware/Makefile
+++ b/tools/testing/selftests/firmware/Makefile
@@ -3,8 +3,8 @@
 CFLAGS = -Wall \
          -O2
 
-TEST_PROGS := fw_run_tests.sh
-TEST_FILES := fw_fallback.sh fw_filesystem.sh fw_upload.sh fw_lib.sh
+TEST_PROGS := fw_run_tests.sh ghes_nvidia_einj.sh
+TEST_FILES := fw_fallback.sh fw_filesystem.sh fw_upload.sh fw_lib.sh einj_lib.sh ghes_nvidia_einj_profiles.sh
 TEST_GEN_FILES := fw_namespace
 
 include ../lib.mk
diff --git a/tools/testing/selftests/firmware/config b/tools/testing/selftests/firmware/config
index 6e402519b117..1b68e638d0b7 100644
--- a/tools/testing/selftests/firmware/config
+++ b/tools/testing/selftests/firmware/config
@@ -4,3 +4,8 @@ CONFIG_FW_LOADER_USER_HELPER=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_FW_UPLOAD=y
+CONFIG_DEBUG_FS=y
+CONFIG_ACPI_APEI=y
+CONFIG_ACPI_APEI_GHES=y
+CONFIG_ACPI_APEI_EINJ=y
+CONFIG_ACPI_APEI_GHES_NVIDIA=y
diff --git a/tools/testing/selftests/firmware/einj_lib.sh b/tools/testing/selftests/firmware/einj_lib.sh
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/firmware/einj_lib.sh
@@ -0,0 +1,223 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Helpers shared by ACPI EINJ based selftests: prerequisite checks, debugfs
+# accessors, EINJ state save/restore and kernel log capture.
+
+set -e
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+EINJ_TABLE=/sys/firmware/acpi/tables/EINJ
+EINJ_DEBUGFS=/sys/kernel/debug/apei/einj
+NVIDIA_PLATFORM_GLOB=/sys/bus/platform/devices/NVDA2012:*
+NVIDIA_DRIVER_DIR=/sys/bus/platform/drivers/nvidia-ghes
+
+# Print "$0: <reason>" on stderr and exit with the kselftest SKIP code.
+einj_skip()
+{
+	echo "$0: $1" >&2
+	exit $ksft_skip
+}
+
+# Skip unless running as root.
+einj_require_root()
+{
+	[ "$(id -u)" -eq 0 ] || einj_skip "must be run as root"
+}
+
+# Skip unless debugfs is mounted at /sys/kernel/debug.
+einj_require_debugfs()
+{
+	# The mount point directory can exist without debugfs being mounted
+	# on it, so check the mount table instead of testing the path with -d.
+	grep -qs '^[^ ]* /sys/kernel/debug debugfs ' /proc/mounts ||
+		einj_skip "debugfs is not mounted at /sys/kernel/debug"
+}
+
+# Skip unless firmware provides an EINJ table and the EINJ debugfs directory
+# exists; try loading the einj module first when the directory is absent.
+einj_require_einj()
+{
+	[ -e "$EINJ_TABLE" ] || einj_skip "ACPI EINJ table is missing"
+	if [ ! -d "$EINJ_DEBUGFS" ]; then
+		# Best effort only: the directory check below decides.
+		modprobe einj 2>/dev/null || true
+	fi
+	[ -d "$EINJ_DEBUGFS" ] || einj_skip "EINJ debugfs directory is missing"
+}
+
+# Skip unless the vendor and vendor_flags EINJ files are present.
+einj_require_vendor_einj()
+{
+	[ -e "$EINJ_DEBUGFS/vendor" ] || einj_skip "NVIDIA vendor EINJ metadata is missing"
+	[ -e "$EINJ_DEBUGFS/vendor_flags" ] || einj_skip "NVIDIA vendor EINJ flags are missing"
+}
+
+# Skip when available_error_type reads back empty.
+einj_require_available_error_type()
+{
+	local available
+
+	available=$(einj_read_trimmed_value available_error_type)
+	[ -n "$available" ] || einj_skip "available_error_type is missing"
+}
+
+# Print the contents of EINJ debugfs file $1 with newlines removed.
+einj_read_trimmed_value()
+{
+	local file=$1
+
+	einj_read_value "$file" | tr -d '\n'
+}
+
+# Skip unless EINJ debugfs file $1 is writable.
+einj_require_writable_value()
+{
+	local file=$1
+
+	[ -w "$EINJ_DEBUGFS/$file" ] || einj_skip "$file is not writable"
+}
+
+# Skip unless every control written during a profile run is writable.
+einj_require_writable_profile()
+{
+	local file
+
+	for file in error_type flags vendor_flags param1 param2 param3 param4 \
+		    notrigger error_inject; do
+		einj_require_writable_value "$file"
+	done
+}
+
+# Print the first NVDA2012 platform device whose driver link resolves to
+# NVIDIA_DRIVER_DIR. Returns 1 when no such device exists.
+einj_find_bound_nvidia_device()
+{
+	local dev
+
+	for dev in $NVIDIA_PLATFORM_GLOB; do
+		# An unmatched glob is left as the literal pattern.
+		[ -e "$dev" ] || continue
+		if [ "$(readlink -f "$dev/driver" 2>/dev/null)" = "$NVIDIA_DRIVER_DIR" ]; then
+			echo "$dev"
+			return 0
+		fi
+	done
+
+	return 1
+}
+
+# Print the bound NVIDIA GHES device path, or skip when there is none.
+einj_require_bound_nvidia_device()
+{
+	local dev
+
+	dev=$(einj_find_bound_nvidia_device) || einj_skip "no bound NVIDIA GHES device"
+	echo "$dev"
+}
+
+# Print the raw contents of EINJ debugfs file $1.
+einj_read_value()
+{
+	local file=$1
+
+	cat "$EINJ_DEBUGFS/$file"
+}
+
+# Write value $2, newline terminated, to EINJ debugfs file $1.
+einj_write_value()
+{
+	local file=$1
+	local value=$2
+
+	printf '%s\n' "$value" > "$EINJ_DEBUGFS/$file"
+}
+
+# Write saved value $2 back to EINJ debugfs file $1; empty values are skipped.
+einj_restore_value()
+{
+	local file=$1
+	local value=$2
+
+	# Some EINJ controls read back as an empty string when unset, but the
+	# debugfs write handler has no matching "clear" operation.
+	[ -n "$value" ] || return 0
+	einj_write_value "$file" "$value"
+}
+
+# Record the current EINJ control values in the EINJ_SAVED_* globals.
+einj_save_state()
+{
+	EINJ_SAVED_ERROR_TYPE=$(einj_read_value error_type)
+	EINJ_SAVED_FLAGS=$(einj_read_value flags)
+	EINJ_SAVED_PARAM1=$(einj_read_value param1)
+	EINJ_SAVED_PARAM2=$(einj_read_value param2)
+	EINJ_SAVED_PARAM3=$(einj_read_value param3)
+	EINJ_SAVED_PARAM4=$(einj_read_value param4)
+	EINJ_SAVED_VENDOR_FLAGS=$(einj_read_value vendor_flags)
+	EINJ_SAVED_NOTRIGGER=$(einj_read_value notrigger)
+}
+
+# Write the values recorded by einj_save_state back to the EINJ controls.
+# Does nothing when einj_save_state never ran.
+einj_restore_state()
+{
+	[ -n "${EINJ_SAVED_ERROR_TYPE+x}" ] || return 0
+
+	einj_restore_value error_type "$EINJ_SAVED_ERROR_TYPE"
+	einj_restore_value flags "$EINJ_SAVED_FLAGS"
+	einj_restore_value param1 "$EINJ_SAVED_PARAM1"
+	einj_restore_value param2 "$EINJ_SAVED_PARAM2"
+	einj_restore_value param3 "$EINJ_SAVED_PARAM3"
+	einj_restore_value param4 "$EINJ_SAVED_PARAM4"
+	einj_restore_value vendor_flags "$EINJ_SAVED_VENDOR_FLAGS"
+	einj_restore_value notrigger "$EINJ_SAVED_NOTRIGGER"
+}
+
+# Write a unique marker line to the kernel log through /dev/kmsg and print it
+# on stdout so callers can later find log output emitted after it.
+einj_emit_kmsg_marker()
+{
+	local tag=$1
+	local marker
+
+	marker="ghes-nvidia-einj:${tag}:$$:${RANDOM}"
+	printf '%s\n' "$marker" > /dev/kmsg
+	printf '%s\n' "$marker"
+}
+
+# Print the dmesg lines that follow the first line containing marker $1.
+einj_capture_dmesg_after_marker()
+{
+	local marker=$1
+
+	dmesg | awk -v marker="$marker" '
+		found { print }
+		index($0, marker) { found = 1 }
+	'
+}
+
+# Poll dmesg once per second, up to $3 times (default 10), until the output
+# after marker $1 contains the fixed string $2. Prints that output and
+# returns 0 on a match; returns 1 when no attempt matched.
+einj_wait_for_dmesg_after_marker_contains()
+{
+	local marker=$1
+	local needle=$2
+	local timeout=${3:-10}
+	local i
+	local slice
+
+	for i in $(seq 1 "$timeout"); do
+		slice=$(einj_capture_dmesg_after_marker "$marker")
+		if printf '%s\n' "$slice" | grep -Fq "$needle"; then
+			printf '%s\n' "$slice"
+			return 0
+		fi
+		sleep 1
+	done
+
+	return 1
+}
diff --git a/tools/testing/selftests/firmware/ghes_nvidia_einj.sh b/tools/testing/selftests/firmware/ghes_nvidia_einj.sh
new file mode 100755
--- /dev/null
+++ b/tools/testing/selftests/firmware/ghes_nvidia_einj.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Inject the NVIDIA vendor EINJ sample profiles and check that the matching
+# decoded CPER section shows up in the kernel log.
+
+set -e
+
+TEST_DIR=$(dirname "$0")
+source "$TEST_DIR/einj_lib.sh"
+source "$TEST_DIR/ghes_nvidia_einj_profiles.sh"
+
+# Check that log output $2 contains "<field>:" for every field name passed
+# after it. $1 is the "<profile> <platform>" prefix used in the diagnostic.
+# Returns 1, after dumping the output on stderr, at the first missing field.
+einj_assert_output_has_fields()
+{
+	local what=$1
+	local output=$2
+	local field
+
+	shift 2
+	for field in "$@"; do
+		if ! printf '%s\n' "$output" | grep -Fq "$field:"; then
+			echo "$0: $what output missing $field line" >&2
+			printf '%s\n' "$output" >&2
+			return 1
+		fi
+	done
+
+	return 0
+}
+
+# Validate the kernel log output $2 captured for profile $1: it must not
+# report a malformed section, and it must contain a Grace or Vera CPER
+# section banner together with that section's expected fields.
+einj_assert_nvidia_cper_output()
+{
+	local profile=$1
+	local output=$2
+
+	if printf '%s\n' "$output" | grep -Fq 'Malformed NVIDIA'; then
+		echo "$0: $profile produced malformed NVIDIA CPER output" >&2
+		printf '%s\n' "$output" >&2
+		return 1
+	fi
+
+	if printf '%s\n' "$output" | grep -Fq 'NVIDIA Grace CPER section'; then
+		einj_assert_output_has_fields "$profile Grace" "$output" \
+			signature error_type number_regs instance_base
+		return
+	fi
+
+	if printf '%s\n' "$output" | grep -Fq 'NVIDIA Vera CPER section'; then
+		einj_assert_output_has_fields "$profile Vera" "$output" \
+			signature event_type event_sub_type event_context_count
+		return
+	fi
+
+	echo "$0: $profile did not emit a recognized NVIDIA CPER section" >&2
+	printf '%s\n' "$output" >&2
+	return 1
+}
+
+# Write the selected profile's flags and parameters to the EINJ controls.
+# Returns 1 at the first write that fails.
+einj_write_profile_params()
+{
+	einj_write_value flags 0 || return 1
+	einj_write_value vendor_flags "$EINJ_PROFILE_VENDOR_FLAGS" || return 1
+	einj_write_value param1 "$EINJ_PROFILE_PARAM1" || return 1
+	einj_write_value param2 "$EINJ_PROFILE_PARAM2" || return 1
+	einj_write_value param3 "$EINJ_PROFILE_PARAM3" || return 1
+	einj_write_value param4 "$EINJ_PROFILE_PARAM4" || return 1
+	einj_write_value notrigger 0 || return 1
+}
+
+# Program and trigger profile $1, then wait for its banner in the kernel log.
+# Returns 0 when the decoded output validates, $ksft_skip when the platform
+# does not accept or answer the profile, and 1 on any other failure.
+einj_run_profile()
+{
+	local profile=$1
+	local marker
+	local output
+
+	if ! einj_select_profile "$profile"; then
+		echo "$0: unknown safe NVIDIA EINJ profile: $profile" >&2
+		return 1
+	fi
+
+	einj_require_writable_profile
+
+	printf '%s: running safe sample %s\n' "$0" "$profile"
+	marker=$(einj_emit_kmsg_marker "$profile")
+
+	# main() tests this function's status, which disables "set -e" for the
+	# whole body, so every write is checked by hand. Writing error_inject
+	# after a rejected write would trigger whatever error type and
+	# parameters were configured before this profile.
+	if ! einj_write_value error_type "$EINJ_PROFILE_ERROR_TYPE"; then
+		printf '%s: %s error type rejected on this platform\n' "$0" "$profile"
+		return "$ksft_skip"
+	fi
+
+	if ! einj_write_profile_params; then
+		echo "$0: $profile: failed to program EINJ parameters" >&2
+		return 1
+	fi
+
+	if ! einj_write_value error_inject 1; then
+		printf '%s: %s injection rejected on this platform\n' "$0" "$profile"
+		return "$ksft_skip"
+	fi
+
+	output=$(einj_wait_for_dmesg_after_marker_contains "$marker" "$EINJ_PROFILE_BANNER" 10) || {
+		printf '%s: %s not supported on this platform\n' "$0" "$profile"
+		return "$ksft_skip"
+	}
+
+	einj_assert_nvidia_cper_output "$profile" "$output"
+}
+
+# EXIT trap handler: restore the saved EINJ state and exit with status $1,
+# or with 1 when the test succeeded but the restore failed.
+einj_cleanup()
+{
+	local status=$1
+
+	if ! einj_restore_state; then
+		echo "$0: failed to restore EINJ state" >&2
+		[ "$status" -eq 0 ] && status=1
+	fi
+
+	exit "$status"
+}
+
+# Check prerequisites, run every profile and report pass, fail or skip.
+main()
+{
+	local profile
+	local passed=0
+	local rc
+
+	einj_require_root
+	einj_require_debugfs
+	einj_require_einj
+	einj_require_vendor_einj
+	einj_require_available_error_type
+	einj_save_state
+	trap 'einj_cleanup "$?"' EXIT
+
+	einj_require_bound_nvidia_device
+
+	for profile in $(einj_list_profiles); do
+		rc=0
+		einj_run_profile "$profile" || rc=$?
+
+		# A skipped profile only means "not this platform"; anything
+		# other than pass or skip fails the whole test.
+		case "$rc" in
+		0) passed=$((passed + 1)) ;;
+		"$ksft_skip") ;;
+		*) exit 1 ;;
+		esac
+	done
+
+	[ "$passed" -gt 0 ] || einj_skip "no NVIDIA EINJ profiles produced output"
+}
+
+main "$@"
diff --git a/tools/testing/selftests/firmware/ghes_nvidia_einj_profiles.sh b/tools/testing/selftests/firmware/ghes_nvidia_einj_profiles.sh
new file mode 100755
--- /dev/null
+++ b/tools/testing/selftests/firmware/ghes_nvidia_einj_profiles.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Injection profiles for ghes_nvidia_einj.sh: the EINJ control values and the
+# expected kernel log banner for each sample.
+
+set -e
+
+# Run both architecture profiles on every platform; firmware silently ignores
+# selectors it does not support, so a timeout just means "not this platform".
+EINJ_PROFILE_NAMES="cmet_dump_status_grace cmet_dump_status_vera"
+
+# Print the known profile names, one per line.
+einj_list_profiles()
+{
+	# Unquoted on purpose: split the list into one argument per name.
+	printf '%s\n' $EINJ_PROFILE_NAMES
+}
+
+# Load profile $1 into the EINJ_PROFILE_* globals.
+# Returns 1, leaving the globals untouched, for an unknown profile name.
+einj_select_profile()
+{
+	local profile=$1
+
+	case "$profile" in
+	cmet_dump_status_grace)
+		# Grace CMET dump/status: informational sample, selector 3.
+		EINJ_PROFILE_ERROR_TYPE=0x80000010
+		EINJ_PROFILE_VENDOR_FLAGS=1
+		EINJ_PROFILE_PARAM1=3
+		EINJ_PROFILE_PARAM2=0
+		EINJ_PROFILE_PARAM3=0
+		EINJ_PROFILE_PARAM4=0
+		EINJ_PROFILE_BANNER='NVIDIA Grace CPER section'
+		;;
+	cmet_dump_status_vera)
+		# Vera CMET-NULL dump/status: informational sample, selector 0.
+		EINJ_PROFILE_ERROR_TYPE=0x80000010
+		EINJ_PROFILE_VENDOR_FLAGS=1
+		EINJ_PROFILE_PARAM1=0
+		EINJ_PROFILE_PARAM2=0
+		EINJ_PROFILE_PARAM3=0
+		EINJ_PROFILE_PARAM4=0
+		EINJ_PROFILE_BANNER='NVIDIA Vera CPER section'
+		;;
+	*)
+		return 1
+		;;
+	esac
+
+	return 0
+}
-- 
2.50.1 (Apple Git-155)

