Module Name:    src
Committed By:   mrg
Date:           Fri Oct 27 05:45:00 UTC 2023
Modified Files:
    src/sys/arch/x86/x86: errata.c

Log Message:
x86: handle AMD errata 1474: A CPU core may hang after about 1044 days

from the new comment:

 * This requires disabling CC6 power level, which can be a performance
 * issue since it stops full turbo in some implementations (eg, half the
 * cores must be in CC6 to achieve the highest boost level.)  Set a timer
 * to fire in 1000 days -- except NetBSD timers end up having a signed
 * 32-bit hz-based value, which rolls over in under 25 days with HZ=1000,
 * and doing xcall(9) or kthread(9) from a callout is not allowed anyway,
 * so just have a kthread wait 1 day for 1000 times.

documented in:

 https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/revision-guides/56323-PUB_1_01.pdf

To generate a diff of this commit:
cvs rdiff -u -r1.34 -r1.35 src/sys/arch/x86/x86/errata.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files: Index: src/sys/arch/x86/x86/errata.c diff -u src/sys/arch/x86/x86/errata.c:1.34 src/sys/arch/x86/x86/errata.c:1.35 --- src/sys/arch/x86/x86/errata.c:1.34 Fri Oct 27 03:06:04 2023 +++ src/sys/arch/x86/x86/errata.c Fri Oct 27 05:45:00 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: errata.c,v 1.34 2023/10/27 03:06:04 mrg Exp $ */ +/* $NetBSD: errata.c,v 1.35 2023/10/27 05:45:00 mrg Exp $ */ /*- * Copyright (c) 2007 The NetBSD Foundation, Inc. @@ -47,10 +47,13 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: errata.c,v 1.34 2023/10/27 03:06:04 mrg Exp $"); +__KERNEL_RCSID(0, "$NetBSD: errata.c,v 1.35 2023/10/27 05:45:00 mrg Exp $"); -#include <sys/types.h> +#include <sys/param.h> #include <sys/systm.h> +#include <sys/xcall.h> +#include <sys/kthread.h> +#include <sys/clock.h> #include <machine/cpu.h> #include <machine/cpufunc.h> @@ -255,6 +258,7 @@ static const uint8_t x86_errata_zen2[] = static bool x86_errata_setmsr(struct cpu_info *, errata_t *); static bool x86_errata_testmsr(struct cpu_info *, errata_t *); +static bool x86_errata_amd_1474(struct cpu_info *, errata_t *); static errata_t errata[] = { /* @@ -453,6 +457,13 @@ static errata_t errata[] = { x86_errata_setmsr, LS_CFG_ERRATA_1095, NULL }, /* + * 1474: A CPU core may hang after about 1044 days + */ + { + 1474, FALSE, MSR_CC6_CFG, x86_errata_zen2, + x86_errata_amd_1474, CC6_CFG_DISABLE_BITS, NULL + }, + /* * Zenbleed: * https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7008.html * https://github.com/google/security-research/security/advisories/GHSA-v6wh-rxpg-cmm8 @@ -465,6 +476,96 @@ static errata_t errata[] = { }, }; +/* + * 1474: A CPU core may hang after about 1044 days + * + * This requires disabling CC6 power level, which can be a performance + * issue since it stops full turbo in some implementations (eg, half the + * cores must be in CC6 to achieve the highest boost level.) 
Set a timer + * to fire in 1000 days -- except NetBSD timers end up having a signed + * 32-bit hz-based value, which rolls over in under 25 days with HZ=1000, + * and doing xcall(9) or kthread(9) from a callout is not allowed anyway, + * so just have a kthread wait 1 day for 1000 times. + */ + +#define AMD_ERRATA_1474_WARN_DAYS 950 +#define AMD_ERRATA_1474_BAD_DAYS 1000 + +static void +amd_errata_1474_disable_cc6(void *a1, void *a2) +{ + errata_t *e = a1; + uint64_t val; + + val = rdmsr_locked(e->e_data1); + if ((val & e->e_data2) == 0) + return; + wrmsr_locked(e->e_data1, val & ~e->e_data2); + aprint_debug_dev(curcpu()->ci_dev, "erratum %u patched\n", + e->e_num); +} + +static void +amd_errata_1474_thread(void *arg) +{ + int loops = 0; + int ticks; + + ticks = hz * SECS_PER_DAY; +#ifdef X86_ERRATA_TEST_AMD_1474 + /* + * Make this trigger warning after 50 seconds, and workaround + * at 100 seconds, for easy testing. + */ + ticks = hz; + loops = 900; +#endif + + while (loops++ < AMD_ERRATA_1474_BAD_DAYS) { + if (loops == AMD_ERRATA_1474_WARN_DAYS) { + printf("warning: AMD Errata 1474 workaround scheduled " + "for %u days.\n", AMD_ERRATA_1474_BAD_DAYS - + AMD_ERRATA_1474_WARN_DAYS); + printf("warning: reboot required to avoid.\n"); + } + kpause("amd1474", false, ticks, NULL); + } + + /* Been 1000 days, disable CC6 and warn about it. */ + uint64_t xc = xc_broadcast(0, amd_errata_1474_disable_cc6, arg, NULL); + xc_wait(xc); + + printf("warning: AMD CC6 disabled due to errata 1474.\n"); + printf("warning: reboot required to restore full turbo speeds.\n"); + + kthread_exit(0); +} + +static bool +x86_errata_amd_1474(struct cpu_info *ci, errata_t *e) +{ + int error; + + /* Don't do anything on non-primary CPUs. 
*/ + if (!CPU_IS_PRIMARY(ci)) + return FALSE; + + error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, + amd_errata_1474_thread, e, NULL, "amd1474"); + if (error) { + printf("WARNING: Unable to disable AMD errata 1474!\n"); + printf("WARNING: reboot system after %u days to avoid CPU " + "hangs.\n", AMD_ERRATA_1474_BAD_DAYS); + } else { + aprint_debug_dev(ci->ci_dev, "workaround for erratum %u " + "scheduled for %u days\n", e->e_num, + AMD_ERRATA_1474_BAD_DAYS); + } + + /* Do own warning here, it's not like most others. */ + return FALSE; +} + static void x86_errata_log(device_t dev, errata_t *e, const char *msg) {