From: Andrei Gherzan <andrei.gher...@huawei.com> It was observed that with glibc 2.33, sysconf reports an unsupported option (-1) for _SC_LEVEL1_ICACHE_LINESIZE.
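The regression is also visible directly through the sysconf(3) interface; a minimal sketch (hypothetical standalone test program, not part of this patch) is:
```
/* Hypothetical test program, not part of this patch: on an affected
   glibc 2.33 the L1 instruction cache line size query returns -1
   (unsupported), while glibc 2.32 reports the real line size.  */
#include <stdio.h>
#include <unistd.h>

int
main (void)
{
  long linesize = sysconf (_SC_LEVEL1_ICACHE_LINESIZE);

  if (linesize == -1)
    printf ("_SC_LEVEL1_ICACHE_LINESIZE: unsupported (-1)\n");
  else
    printf ("_SC_LEVEL1_ICACHE_LINESIZE: %ld bytes\n", linesize);
  return 0;
}
```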
This can be reproduced with sysconf tool: ``` └─❯ docker run -ti --rm archlinux:base-20210214.0.15477 getconf -a | grep "GNU_LIBC_VERSION\|LEVEL1_ICACHE_LINESIZE" GNU_LIBC_VERSION glibc 2.33 LEVEL1_ICACHE_LINESIZE └─❯ docker run -ti --rm archlinux:base-20210131.0.14634 getconf -a | grep "GNU_LIBC_VERSION\|LEVEL1_ICACHE_LINESIZE" GNU_LIBC_VERSION glibc 2.32 LEVEL1_ICACHE_LINESIZE 64 ``` The offending patch in glibc is: commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4 Author: H.J. Lu <hjl.to...@gmail.com> Date: Fri Sep 18 07:55:14 2020 -0700 x86: Move x86 processor cache info to cpu_features This patch reverts the above mentioned glibc change. It was tested on qemux86. Extra small cosmetic tweaks brought you by devtool (a superflous newline and whitespace). Signed-off-by: Andrei Gherzan <andrei.gher...@huawei.com> --- ...x86-processor-cache-info-to-cpu_feat.patch | 1074 +++++++++++++++++ meta/recipes-core/glibc/glibc_2.33.bb | 4 +- 2 files changed, 1076 insertions(+), 2 deletions(-) create mode 100644 meta/recipes-core/glibc/glibc/0032-Revert-x86-Move-x86-processor-cache-info-to-cpu_feat.patch diff --git a/meta/recipes-core/glibc/glibc/0032-Revert-x86-Move-x86-processor-cache-info-to-cpu_feat.patch b/meta/recipes-core/glibc/glibc/0032-Revert-x86-Move-x86-processor-cache-info-to-cpu_feat.patch new file mode 100644 index 0000000000..0ff1eba82b --- /dev/null +++ b/meta/recipes-core/glibc/glibc/0032-Revert-x86-Move-x86-processor-cache-info-to-cpu_feat.patch @@ -0,0 +1,1074 @@ +From 961d681e38d30a4de06c980de0a96464fa3b4d74 Mon Sep 17 00:00:00 2001 +From: Andrei Gherzan <and...@gherzan.com> +Date: Fri, 19 Feb 2021 23:06:50 +0000 +Subject: [PATCH] Revert "x86: Move x86 processor cache info to cpu_features" + +This reverts commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4. + +Upstream-Status: Pending +Signed-off-by: Andrei Gherzan <andrei.gher...@huawei.com> +--- + sysdeps/x86/cacheinfo.c | 46 +-- + sysdeps/x86/cacheinfo.h | 400 +++++++++++++++++++++++-- + sysdeps/x86/cpu-features.c | 35 ++- + sysdeps/x86/dl-cacheinfo.h | 460 ----------------------------- + sysdeps/x86/include/cpu-features.h | 22 -- + 5 files changed, 412 insertions(+), 551 deletions(-) + +diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c +index 7b8df45e3b..948dbea3db 100644 +--- a/sysdeps/x86/cacheinfo.c ++++ b/sysdeps/x86/cacheinfo.c +@@ -18,8 +18,11 @@ + + #if IS_IN (libc) + ++#include <assert.h> + #include <unistd.h> ++#include <cpuid.h> + #include <ldsodefs.h> ++#include <dl-cacheinfo.h> + + /* Get the value of the system variable NAME. */ + long int +@@ -27,45 +30,20 @@ attribute_hidden + __cache_sysconf (int name) + { + const struct cpu_features *cpu_features = __get_cpu_features (); +- switch (name) +- { +- case _SC_LEVEL1_ICACHE_SIZE: +- return cpu_features->level1_icache_size; + +- case _SC_LEVEL1_DCACHE_SIZE: +- return cpu_features->level1_dcache_size; ++ if (cpu_features->basic.kind == arch_kind_intel) ++ return handle_intel (name, cpu_features); + +- case _SC_LEVEL1_DCACHE_ASSOC: +- return cpu_features->level1_dcache_assoc; ++ if (cpu_features->basic.kind == arch_kind_amd) ++ return handle_amd (name); + +- case _SC_LEVEL1_DCACHE_LINESIZE: +- return cpu_features->level1_dcache_linesize; ++ if (cpu_features->basic.kind == arch_kind_zhaoxin) ++ return handle_zhaoxin (name); + +- case _SC_LEVEL2_CACHE_SIZE: +- return cpu_features->level2_cache_size; ++ // XXX Fill in more vendors. 
+ +- case _SC_LEVEL2_CACHE_ASSOC: +- return cpu_features->level2_cache_assoc; +- +- case _SC_LEVEL2_CACHE_LINESIZE: +- return cpu_features->level2_cache_linesize; +- +- case _SC_LEVEL3_CACHE_SIZE: +- return cpu_features->level3_cache_size; +- +- case _SC_LEVEL3_CACHE_ASSOC: +- return cpu_features->level3_cache_assoc; +- +- case _SC_LEVEL3_CACHE_LINESIZE: +- return cpu_features->level3_cache_linesize; +- +- case _SC_LEVEL4_CACHE_SIZE: +- return cpu_features->level4_cache_size; +- +- default: +- break; +- } +- return -1; ++ /* CPU not known, we have no information. */ ++ return 0; + } + + # ifdef SHARED +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index 68c253542f..736189f7f2 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -18,16 +18,7 @@ + + #include <assert.h> + #include <unistd.h> +-#include <cpuid.h> +-#include <cpu-features.h> + +-#if HAVE_TUNABLES +-# define TUNABLE_NAMESPACE cpu +-# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */ +-# include <elf/dl-tunables.h> +-#endif +- +-#if IS_IN (libc) + /* Data cache size for use in memory and string routines, typically + L1 size, rounded to multiple of 256 bytes. */ + long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2; +@@ -54,30 +45,385 @@ long int __x86_rep_movsb_threshold attribute_hidden = 2048; + /* Threshold to use Enhanced REP STOSB. */ + long int __x86_rep_stosb_threshold attribute_hidden = 2048; + ++static void ++get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr, ++ long int core) ++{ ++ unsigned int eax; ++ unsigned int ebx; ++ unsigned int ecx; ++ unsigned int edx; ++ ++ /* Number of logical processors sharing L2 cache. */ ++ int threads_l2; ++ ++ /* Number of logical processors sharing L3 cache. */ ++ int threads_l3; ++ ++ const struct cpu_features *cpu_features = __get_cpu_features (); ++ int max_cpuid = cpu_features->basic.max_cpuid; ++ unsigned int family = cpu_features->basic.family; ++ unsigned int model = cpu_features->basic.model; ++ long int shared = *shared_ptr; ++ unsigned int threads = *threads_ptr; ++ bool inclusive_cache = true; ++ bool support_count_mask = true; ++ ++ /* Try L3 first. */ ++ unsigned int level = 3; ++ ++ if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6) ++ support_count_mask = false; ++ ++ if (shared <= 0) ++ { ++ /* Try L2 otherwise. */ ++ level = 2; ++ shared = core; ++ threads_l2 = 0; ++ threads_l3 = -1; ++ } ++ else ++ { ++ threads_l2 = 0; ++ threads_l3 = 0; ++ } ++ ++ /* A value of 0 for the HTT bit indicates there is only a single ++ logical processor. */ ++ if (HAS_CPU_FEATURE (HTT)) ++ { ++ /* Figure out the number of logical threads that share the ++ highest cache level. */ ++ if (max_cpuid >= 4) ++ { ++ int i = 0; ++ ++ /* Query until cache level 2 and 3 are enumerated. */ ++ int check = 0x1 | (threads_l3 == 0) << 1; ++ do ++ { ++ __cpuid_count (4, i++, eax, ebx, ecx, edx); ++ ++ /* There seems to be a bug in at least some Pentium Ds ++ which sometimes fail to iterate all cache parameters. ++ Do not loop indefinitely here, stop in this case and ++ assume there is no such information. */ ++ if (cpu_features->basic.kind == arch_kind_intel ++ && (eax & 0x1f) == 0 ) ++ goto intel_bug_no_cache_info; ++ ++ switch ((eax >> 5) & 0x7) ++ { ++ default: ++ break; ++ case 2: ++ if ((check & 0x1)) ++ { ++ /* Get maximum number of logical processors ++ sharing L2 cache. 
*/ ++ threads_l2 = (eax >> 14) & 0x3ff; ++ check &= ~0x1; ++ } ++ break; ++ case 3: ++ if ((check & (0x1 << 1))) ++ { ++ /* Get maximum number of logical processors ++ sharing L3 cache. */ ++ threads_l3 = (eax >> 14) & 0x3ff; ++ ++ /* Check if L2 and L3 caches are inclusive. */ ++ inclusive_cache = (edx & 0x2) != 0; ++ check &= ~(0x1 << 1); ++ } ++ break; ++ } ++ } ++ while (check); ++ ++ /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum ++ numbers of addressable IDs for logical processors sharing ++ the cache, instead of the maximum number of threads ++ sharing the cache. */ ++ if (max_cpuid >= 11 && support_count_mask) ++ { ++ /* Find the number of logical processors shipped in ++ one core and apply count mask. */ ++ i = 0; ++ ++ /* Count SMT only if there is L3 cache. Always count ++ core if there is no L3 cache. */ ++ int count = ((threads_l2 > 0 && level == 3) ++ | ((threads_l3 > 0 ++ || (threads_l2 > 0 && level == 2)) << 1)); ++ ++ while (count) ++ { ++ __cpuid_count (11, i++, eax, ebx, ecx, edx); ++ ++ int shipped = ebx & 0xff; ++ int type = ecx & 0xff00; ++ if (shipped == 0 || type == 0) ++ break; ++ else if (type == 0x100) ++ { ++ /* Count SMT. */ ++ if ((count & 0x1)) ++ { ++ int count_mask; ++ ++ /* Compute count mask. */ ++ asm ("bsr %1, %0" ++ : "=r" (count_mask) : "g" (threads_l2)); ++ count_mask = ~(-1 << (count_mask + 1)); ++ threads_l2 = (shipped - 1) & count_mask; ++ count &= ~0x1; ++ } ++ } ++ else if (type == 0x200) ++ { ++ /* Count core. */ ++ if ((count & (0x1 << 1))) ++ { ++ int count_mask; ++ int threads_core ++ = (level == 2 ? threads_l2 : threads_l3); ++ ++ /* Compute count mask. */ ++ asm ("bsr %1, %0" ++ : "=r" (count_mask) : "g" (threads_core)); ++ count_mask = ~(-1 << (count_mask + 1)); ++ threads_core = (shipped - 1) & count_mask; ++ if (level == 2) ++ threads_l2 = threads_core; ++ else ++ threads_l3 = threads_core; ++ count &= ~(0x1 << 1); ++ } ++ } ++ } ++ } ++ if (threads_l2 > 0) ++ threads_l2 += 1; ++ if (threads_l3 > 0) ++ threads_l3 += 1; ++ if (level == 2) ++ { ++ if (threads_l2) ++ { ++ threads = threads_l2; ++ if (cpu_features->basic.kind == arch_kind_intel ++ && threads > 2 ++ && family == 6) ++ switch (model) ++ { ++ case 0x37: ++ case 0x4a: ++ case 0x4d: ++ case 0x5a: ++ case 0x5d: ++ /* Silvermont has L2 cache shared by 2 cores. */ ++ threads = 2; ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ else if (threads_l3) ++ threads = threads_l3; ++ } ++ else ++ { ++intel_bug_no_cache_info: ++ /* Assume that all logical threads share the highest cache ++ level. */ ++ threads ++ = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16) ++ & 0xff); ++ } ++ ++ /* Cap usage of highest cache level to the number of supported ++ threads. */ ++ if (shared > 0 && threads > 0) ++ shared /= threads; ++ } ++ ++ /* Account for non-inclusive L2 and L3 caches. */ ++ if (!inclusive_cache) ++ { ++ if (threads_l2 > 0) ++ core /= threads_l2; ++ shared += core; ++ } ++ ++ *shared_ptr = shared; ++ *threads_ptr = threads; ++} ++ + static void + init_cacheinfo (void) + { ++ /* Find out what brand of processor. */ ++ unsigned int ebx; ++ unsigned int ecx; ++ unsigned int edx; ++ int max_cpuid_ex; ++ long int data = -1; ++ long int shared = -1; ++ long int core; ++ unsigned int threads = 0; + const struct cpu_features *cpu_features = __get_cpu_features (); +- long int data = cpu_features->data_cache_size; +- __x86_raw_data_cache_size_half = data / 2; +- __x86_raw_data_cache_size = data; +- /* Round data cache size to multiple of 256 bytes. 
*/ +- data = data & ~255L; +- __x86_data_cache_size_half = data / 2; +- __x86_data_cache_size = data; +- +- long int shared = cpu_features->shared_cache_size; +- __x86_raw_shared_cache_size_half = shared / 2; +- __x86_raw_shared_cache_size = shared; +- /* Round shared cache size to multiple of 256 bytes. */ +- shared = shared & ~255L; +- __x86_shared_cache_size_half = shared / 2; +- __x86_shared_cache_size = shared; + ++ /* NB: In libc.so, cpu_features is defined in ld.so and is initialized ++ by DL_PLATFORM_INIT or IFUNC relocation before init_cacheinfo is ++ called by IFUNC relocation. In libc.a, init_cacheinfo is called ++ from init_cpu_features by ARCH_INIT_CPU_FEATURES. */ ++ assert (cpu_features->basic.kind != arch_kind_unknown); ++ ++ if (cpu_features->basic.kind == arch_kind_intel) ++ { ++ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); ++ core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); ++ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); ++ ++ get_common_cache_info (&shared, &threads, core); ++ } ++ else if (cpu_features->basic.kind == arch_kind_zhaoxin) ++ { ++ data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); ++ core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); ++ shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); ++ ++ get_common_cache_info (&shared, &threads, core); ++ } ++ else if (cpu_features->basic.kind == arch_kind_amd) ++ { ++ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); ++ long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); ++ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); ++ ++ /* Get maximum extended function. */ ++ __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); ++ ++ if (shared <= 0) ++ /* No shared L3 cache. All we have is the L2 cache. */ ++ shared = core; ++ else ++ { ++ /* Figure out the number of logical threads that share L3. */ ++ if (max_cpuid_ex >= 0x80000008) ++ { ++ /* Get width of APIC ID. */ ++ __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx); ++ threads = 1 << ((ecx >> 12) & 0x0f); ++ } ++ ++ if (threads == 0 || cpu_features->basic.family >= 0x17) ++ { ++ /* If APIC ID width is not available, use logical ++ processor count. */ ++ __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx); ++ ++ if ((edx & (1 << 28)) != 0) ++ threads = (ebx >> 16) & 0xff; ++ } ++ ++ /* Cap usage of highest cache level to the number of ++ supported threads. */ ++ if (threads > 0) ++ shared /= threads; ++ ++ /* Get shared cache per ccx for Zen architectures. */ ++ if (cpu_features->basic.family >= 0x17) ++ { ++ unsigned int eax; ++ ++ /* Get number of threads share the L3 cache in CCX. */ ++ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); ++ ++ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; ++ shared *= threads_per_ccx; ++ } ++ else ++ { ++ /* Account for exclusive L2 and L3 caches. */ ++ shared += core; ++ } ++ } ++ } ++ ++ /* Prefer cache size configure via tuning. */ ++ if (cpu_features->data_cache_size != 0) ++ data = cpu_features->data_cache_size; ++ ++ if (data > 0) ++ { ++ __x86_raw_data_cache_size_half = data / 2; ++ __x86_raw_data_cache_size = data; ++ /* Round data cache size to multiple of 256 bytes. */ ++ data = data & ~255L; ++ __x86_data_cache_size_half = data / 2; ++ __x86_data_cache_size = data; ++ } ++ ++ /* Prefer cache size configure via tuning. */ ++ if (cpu_features->shared_cache_size != 0) ++ shared = cpu_features->shared_cache_size; ++ ++ if (shared > 0) ++ { ++ __x86_raw_shared_cache_size_half = shared / 2; ++ __x86_raw_shared_cache_size = shared; ++ /* Round shared cache size to multiple of 256 bytes. 
*/ ++ shared = shared & ~255L; ++ __x86_shared_cache_size_half = shared / 2; ++ __x86_shared_cache_size = shared; ++ } ++ ++ /* The default setting for the non_temporal threshold is 3/4 of one ++ thread's share of the chip's cache. For most Intel and AMD processors ++ with an initial release date between 2017 and 2020, a thread's typical ++ share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4 ++ threshold leaves 125 KBytes to 500 KBytes of the thread's data ++ in cache after a maximum temporal copy, which will maintain ++ in cache a reasonable portion of the thread's stack and other ++ active data. If the threshold is set higher than one thread's ++ share of the cache, it has a substantial risk of negatively ++ impacting the performance of other threads running on the chip. */ + __x86_shared_non_temporal_threshold +- = cpu_features->non_temporal_threshold; ++ = (cpu_features->non_temporal_threshold != 0 ++ ? cpu_features->non_temporal_threshold ++ : __x86_shared_cache_size * 3 / 4); ++ ++ /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ ++ unsigned int minimum_rep_movsb_threshold; ++ /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ unsigned int rep_movsb_threshold; ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) ++ { ++ rep_movsb_threshold = 2048 * (64 / 16); ++ minimum_rep_movsb_threshold = 64 * 8; ++ } ++ else if (CPU_FEATURE_PREFERRED_P (cpu_features, ++ AVX_Fast_Unaligned_Load)) ++ { ++ rep_movsb_threshold = 2048 * (32 / 16); ++ minimum_rep_movsb_threshold = 32 * 8; ++ } ++ else ++ { ++ rep_movsb_threshold = 2048 * (16 / 16); ++ minimum_rep_movsb_threshold = 16 * 8; ++ } ++ if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold) ++ __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; ++ else ++ __x86_rep_movsb_threshold = rep_movsb_threshold; + +- __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; ++# if HAVE_TUNABLES + __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; ++# endif + } +-#endif +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 73b0a4dc9a..c9e51b5e5a 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -16,13 +16,22 @@ + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + ++#include <cpuid.h> + #include <dl-hwcap.h> + #include <libc-pointer-arith.h> + #include <get-isa-level.h> +-#include <cacheinfo.h> +-#include <dl-cacheinfo.h> ++#if IS_IN (libc) && !defined SHARED ++# include <assert.h> ++# include <unistd.h> ++# include <dl-cacheinfo.h> ++# include <cacheinfo.h> ++#endif + + #if HAVE_TUNABLES ++# define TUNABLE_NAMESPACE cpu ++# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */ ++# include <elf/dl-tunables.h> ++ + extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) + attribute_hidden; + +@@ -639,14 +648,24 @@ no_cpuid: + cpu_features->basic.model = model; + cpu_features->basic.stepping = stepping; + +- dl_init_cacheinfo (cpu_features); +- + #if HAVE_TUNABLES + TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps)); +-#elif defined SHARED +- /* Reuse dl_platform, dl_hwcap and dl_hwcap_mask for x86. The +- glibc.cpu.hwcap_mask tunable is initialized already, so no +- need to do this. 
*/ ++ cpu_features->non_temporal_threshold ++ = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); ++ cpu_features->rep_movsb_threshold ++ = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); ++ cpu_features->rep_stosb_threshold ++ = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL); ++ cpu_features->data_cache_size ++ = TUNABLE_GET (x86_data_cache_size, long int, NULL); ++ cpu_features->shared_cache_size ++ = TUNABLE_GET (x86_shared_cache_size, long int, NULL); ++#endif ++ ++ /* Reuse dl_platform, dl_hwcap and dl_hwcap_mask for x86. */ ++#if !HAVE_TUNABLES && defined SHARED ++ /* The glibc.cpu.hwcap_mask tunable is initialized already, so no need to do ++ this. */ + GLRO(dl_hwcap_mask) = HWCAP_IMPORTANT; + #endif + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index a31fa0783a..6adce4147c 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -476,463 +476,3 @@ handle_zhaoxin (int name) + /* Nothing found. */ + return 0; + } +- +-static void +-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr, +- long int core) +-{ +- unsigned int eax; +- unsigned int ebx; +- unsigned int ecx; +- unsigned int edx; +- +- /* Number of logical processors sharing L2 cache. */ +- int threads_l2; +- +- /* Number of logical processors sharing L3 cache. */ +- int threads_l3; +- +- const struct cpu_features *cpu_features = __get_cpu_features (); +- int max_cpuid = cpu_features->basic.max_cpuid; +- unsigned int family = cpu_features->basic.family; +- unsigned int model = cpu_features->basic.model; +- long int shared = *shared_ptr; +- unsigned int threads = *threads_ptr; +- bool inclusive_cache = true; +- bool support_count_mask = true; +- +- /* Try L3 first. */ +- unsigned int level = 3; +- +- if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6) +- support_count_mask = false; +- +- if (shared <= 0) +- { +- /* Try L2 otherwise. */ +- level = 2; +- shared = core; +- threads_l2 = 0; +- threads_l3 = -1; +- } +- else +- { +- threads_l2 = 0; +- threads_l3 = 0; +- } +- +- /* A value of 0 for the HTT bit indicates there is only a single +- logical processor. */ +- if (HAS_CPU_FEATURE (HTT)) +- { +- /* Figure out the number of logical threads that share the +- highest cache level. */ +- if (max_cpuid >= 4) +- { +- int i = 0; +- +- /* Query until cache level 2 and 3 are enumerated. */ +- int check = 0x1 | (threads_l3 == 0) << 1; +- do +- { +- __cpuid_count (4, i++, eax, ebx, ecx, edx); +- +- /* There seems to be a bug in at least some Pentium Ds +- which sometimes fail to iterate all cache parameters. +- Do not loop indefinitely here, stop in this case and +- assume there is no such information. */ +- if (cpu_features->basic.kind == arch_kind_intel +- && (eax & 0x1f) == 0 ) +- goto intel_bug_no_cache_info; +- +- switch ((eax >> 5) & 0x7) +- { +- default: +- break; +- case 2: +- if ((check & 0x1)) +- { +- /* Get maximum number of logical processors +- sharing L2 cache. */ +- threads_l2 = (eax >> 14) & 0x3ff; +- check &= ~0x1; +- } +- break; +- case 3: +- if ((check & (0x1 << 1))) +- { +- /* Get maximum number of logical processors +- sharing L3 cache. */ +- threads_l3 = (eax >> 14) & 0x3ff; +- +- /* Check if L2 and L3 caches are inclusive. 
*/ +- inclusive_cache = (edx & 0x2) != 0; +- check &= ~(0x1 << 1); +- } +- break; +- } +- } +- while (check); +- +- /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum +- numbers of addressable IDs for logical processors sharing +- the cache, instead of the maximum number of threads +- sharing the cache. */ +- if (max_cpuid >= 11 && support_count_mask) +- { +- /* Find the number of logical processors shipped in +- one core and apply count mask. */ +- i = 0; +- +- /* Count SMT only if there is L3 cache. Always count +- core if there is no L3 cache. */ +- int count = ((threads_l2 > 0 && level == 3) +- | ((threads_l3 > 0 +- || (threads_l2 > 0 && level == 2)) << 1)); +- +- while (count) +- { +- __cpuid_count (11, i++, eax, ebx, ecx, edx); +- +- int shipped = ebx & 0xff; +- int type = ecx & 0xff00; +- if (shipped == 0 || type == 0) +- break; +- else if (type == 0x100) +- { +- /* Count SMT. */ +- if ((count & 0x1)) +- { +- int count_mask; +- +- /* Compute count mask. */ +- asm ("bsr %1, %0" +- : "=r" (count_mask) : "g" (threads_l2)); +- count_mask = ~(-1 << (count_mask + 1)); +- threads_l2 = (shipped - 1) & count_mask; +- count &= ~0x1; +- } +- } +- else if (type == 0x200) +- { +- /* Count core. */ +- if ((count & (0x1 << 1))) +- { +- int count_mask; +- int threads_core +- = (level == 2 ? threads_l2 : threads_l3); +- +- /* Compute count mask. */ +- asm ("bsr %1, %0" +- : "=r" (count_mask) : "g" (threads_core)); +- count_mask = ~(-1 << (count_mask + 1)); +- threads_core = (shipped - 1) & count_mask; +- if (level == 2) +- threads_l2 = threads_core; +- else +- threads_l3 = threads_core; +- count &= ~(0x1 << 1); +- } +- } +- } +- } +- if (threads_l2 > 0) +- threads_l2 += 1; +- if (threads_l3 > 0) +- threads_l3 += 1; +- if (level == 2) +- { +- if (threads_l2) +- { +- threads = threads_l2; +- if (cpu_features->basic.kind == arch_kind_intel +- && threads > 2 +- && family == 6) +- switch (model) +- { +- case 0x37: +- case 0x4a: +- case 0x4d: +- case 0x5a: +- case 0x5d: +- /* Silvermont has L2 cache shared by 2 cores. */ +- threads = 2; +- break; +- default: +- break; +- } +- } +- } +- else if (threads_l3) +- threads = threads_l3; +- } +- else +- { +-intel_bug_no_cache_info: +- /* Assume that all logical threads share the highest cache +- level. */ +- threads +- = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16) +- & 0xff); +- } +- +- /* Cap usage of highest cache level to the number of supported +- threads. */ +- if (shared > 0 && threads > 0) +- shared /= threads; +- } +- +- /* Account for non-inclusive L2 and L3 caches. */ +- if (!inclusive_cache) +- { +- if (threads_l2 > 0) +- core /= threads_l2; +- shared += core; +- } +- +- *shared_ptr = shared; +- *threads_ptr = threads; +-} +- +-static void +-dl_init_cacheinfo (struct cpu_features *cpu_features) +-{ +- /* Find out what brand of processor. 
*/ +- unsigned int ebx; +- unsigned int ecx; +- unsigned int edx; +- int max_cpuid_ex; +- long int data = -1; +- long int shared = -1; +- long int core; +- unsigned int threads = 0; +- unsigned long int level1_icache_size = -1; +- unsigned long int level1_dcache_size = -1; +- unsigned long int level1_dcache_assoc = -1; +- unsigned long int level1_dcache_linesize = -1; +- unsigned long int level2_cache_size = -1; +- unsigned long int level2_cache_assoc = -1; +- unsigned long int level2_cache_linesize = -1; +- unsigned long int level3_cache_size = -1; +- unsigned long int level3_cache_assoc = -1; +- unsigned long int level3_cache_linesize = -1; +- unsigned long int level4_cache_size = -1; +- +- if (cpu_features->basic.kind == arch_kind_intel) +- { +- data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); +- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); +- shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); +- +- level1_icache_size +- = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features); +- level1_dcache_size = data; +- level1_dcache_assoc +- = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); +- level1_dcache_linesize +- = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); +- level2_cache_size = core; +- level2_cache_assoc +- = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); +- level2_cache_linesize +- = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features); +- level3_cache_size = shared; +- level3_cache_assoc +- = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features); +- level3_cache_linesize +- = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features); +- level4_cache_size +- = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); +- +- get_common_cache_info (&shared, &threads, core); +- } +- else if (cpu_features->basic.kind == arch_kind_zhaoxin) +- { +- data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); +- shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); +- +- level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE); +- level1_dcache_size = data; +- level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); +- level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; +- level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); +- level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); +- level3_cache_size = shared; +- level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); +- level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); +- +- get_common_cache_info (&shared, &threads, core); +- } +- else if (cpu_features->basic.kind == arch_kind_amd) +- { +- data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_amd (_SC_LEVEL2_CACHE_SIZE); +- shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); +- +- level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); +- level1_dcache_size = data; +- level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); +- level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; +- level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); +- level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); +- level3_cache_size = shared; +- level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC); +- level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE); +- +- /* Get maximum extended function. */ +- __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); +- +- if (shared <= 0) +- /* No shared L3 cache. All we have is the L2 cache. 
*/ +- shared = core; +- else +- { +- /* Figure out the number of logical threads that share L3. */ +- if (max_cpuid_ex >= 0x80000008) +- { +- /* Get width of APIC ID. */ +- __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx); +- threads = 1 << ((ecx >> 12) & 0x0f); +- } +- +- if (threads == 0 || cpu_features->basic.family >= 0x17) +- { +- /* If APIC ID width is not available, use logical +- processor count. */ +- __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx); +- +- if ((edx & (1 << 28)) != 0) +- threads = (ebx >> 16) & 0xff; +- } +- +- /* Cap usage of highest cache level to the number of +- supported threads. */ +- if (threads > 0) +- shared /= threads; +- +- /* Get shared cache per ccx for Zen architectures. */ +- if (cpu_features->basic.family >= 0x17) +- { +- unsigned int eax; +- +- /* Get number of threads share the L3 cache in CCX. */ +- __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); +- +- unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; +- shared *= threads_per_ccx; +- } +- else +- { +- /* Account for exclusive L2 and L3 caches. */ +- shared += core; +- } +- } +- } +- +- cpu_features->level1_icache_size = level1_icache_size; +- cpu_features->level1_dcache_size = level1_dcache_size; +- cpu_features->level1_dcache_assoc = level1_dcache_assoc; +- cpu_features->level1_dcache_linesize = level1_dcache_linesize; +- cpu_features->level2_cache_size = level2_cache_size; +- cpu_features->level2_cache_assoc = level2_cache_assoc; +- cpu_features->level2_cache_linesize = level2_cache_linesize; +- cpu_features->level3_cache_size = level3_cache_size; +- cpu_features->level3_cache_assoc = level3_cache_assoc; +- cpu_features->level3_cache_linesize = level3_cache_linesize; +- cpu_features->level4_cache_size = level4_cache_size; +- +- /* The default setting for the non_temporal threshold is 3/4 of one +- thread's share of the chip's cache. For most Intel and AMD processors +- with an initial release date between 2017 and 2020, a thread's typical +- share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4 +- threshold leaves 125 KBytes to 500 KBytes of the thread's data +- in cache after a maximum temporal copy, which will maintain +- in cache a reasonable portion of the thread's stack and other +- active data. If the threshold is set higher than one thread's +- share of the cache, it has a substantial risk of negatively +- impacting the performance of other threads running on the chip. */ +- unsigned long int non_temporal_threshold = shared * 3 / 4; +- +-#if HAVE_TUNABLES +- /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ +- unsigned int minimum_rep_movsb_threshold; +-#endif +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ +- unsigned int rep_movsb_threshold; +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) +- { +- rep_movsb_threshold = 2048 * (64 / 16); +-#if HAVE_TUNABLES +- minimum_rep_movsb_threshold = 64 * 8; +-#endif +- } +- else if (CPU_FEATURE_PREFERRED_P (cpu_features, +- AVX_Fast_Unaligned_Load)) +- { +- rep_movsb_threshold = 2048 * (32 / 16); +-#if HAVE_TUNABLES +- minimum_rep_movsb_threshold = 32 * 8; +-#endif +- } +- else +- { +- rep_movsb_threshold = 2048 * (16 / 16); +-#if HAVE_TUNABLES +- minimum_rep_movsb_threshold = 16 * 8; +-#endif +- } +- +- /* The default threshold to use Enhanced REP STOSB. 
*/ +- unsigned long int rep_stosb_threshold = 2048; +- +-#if HAVE_TUNABLES +- long int tunable_size; +- +- tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL); +- /* NB: Ignore the default value 0. */ +- if (tunable_size != 0) +- data = tunable_size; +- +- tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL); +- /* NB: Ignore the default value 0. */ +- if (tunable_size != 0) +- shared = tunable_size; +- +- tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); +- /* NB: Ignore the default value 0. */ +- if (tunable_size != 0) +- non_temporal_threshold = tunable_size; +- +- tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); +- if (tunable_size > minimum_rep_movsb_threshold) +- rep_movsb_threshold = tunable_size; +- +- /* NB: The default value of the x86_rep_stosb_threshold tunable is the +- same as the default value of __x86_rep_stosb_threshold and the +- minimum value is fixed. */ +- rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, +- long int, NULL); +- +- TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, long int, data, +- 0, (long int) -1); +- TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, long int, shared, +- 0, (long int) -1); +- TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, long int, +- non_temporal_threshold, 0, (long int) -1); +- TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, long int, +- rep_movsb_threshold, +- minimum_rep_movsb_threshold, (long int) -1); +- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, long int, +- rep_stosb_threshold, 1, (long int) -1); +-#endif +- +- cpu_features->data_cache_size = data; +- cpu_features->shared_cache_size = shared; +- cpu_features->non_temporal_threshold = non_temporal_threshold; +- cpu_features->rep_movsb_threshold = rep_movsb_threshold; +- cpu_features->rep_stosb_threshold = rep_stosb_threshold; +-} +diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +index 624736b40e..fb02f0607b 100644 +--- a/sysdeps/x86/include/cpu-features.h ++++ b/sysdeps/x86/include/cpu-features.h +@@ -872,28 +872,6 @@ struct cpu_features + unsigned long int rep_movsb_threshold; + /* Threshold to use "rep stosb". */ + unsigned long int rep_stosb_threshold; +- /* _SC_LEVEL1_ICACHE_SIZE. */ +- unsigned long int level1_icache_size; +- /* _SC_LEVEL1_DCACHE_SIZE. */ +- unsigned long int level1_dcache_size; +- /* _SC_LEVEL1_DCACHE_ASSOC. */ +- unsigned long int level1_dcache_assoc; +- /* _SC_LEVEL1_DCACHE_LINESIZE. */ +- unsigned long int level1_dcache_linesize; +- /* _SC_LEVEL2_CACHE_ASSOC. */ +- unsigned long int level2_cache_size; +- /* _SC_LEVEL2_DCACHE_ASSOC. */ +- unsigned long int level2_cache_assoc; +- /* _SC_LEVEL2_CACHE_LINESIZE. */ +- unsigned long int level2_cache_linesize; +- /* /_SC_LEVEL3_CACHE_SIZE. */ +- unsigned long int level3_cache_size; +- /* _SC_LEVEL3_CACHE_ASSOC. */ +- unsigned long int level3_cache_assoc; +- /* _SC_LEVEL3_CACHE_LINESIZE. */ +- unsigned long int level3_cache_linesize; +- /* /_SC_LEVEL4_CACHE_SIZE. */ +- unsigned long int level4_cache_size; + }; + + /* Get a pointer to the CPU features structure. 
*/ diff --git a/meta/recipes-core/glibc/glibc_2.33.bb b/meta/recipes-core/glibc/glibc_2.33.bb index e0002e6046..dd4087f80b 100644 --- a/meta/recipes-core/glibc/glibc_2.33.bb +++ b/meta/recipes-core/glibc/glibc_2.33.bb @@ -15,11 +15,10 @@ NATIVESDKFIXES_class-nativesdk = "\ file://faccessat2-perm.patch \ " -SRC_URI = "${GLIBC_GIT_URI};branch=${SRCBRANCH};name=glibc \ +SRC_URI = "${GLIBC_GIT_URI};branch=${SRCBRANCH};name=glibc \ file://etc/ld.so.conf \ file://generate-supported.mk \ file://makedbs.sh \ - \ ${NATIVESDKFIXES} \ file://0008-fsl-e500-e5500-e6500-603e-fsqrt-implementation.patch \ file://0009-ppc-sqrt-Fix-undefined-reference-to-__sqrt_finite.patch \ @@ -44,6 +43,7 @@ SRC_URI = "${GLIBC_GIT_URI};branch=${SRCBRANCH};name=glibc \ file://0029-wordsize.h-Unify-the-header-between-arm-and-aarch64.patch \ file://0030-powerpc-Do-not-ask-compiler-for-finding-arch.patch \ file://0031-x86-Require-full-ISA-support-for-x86-64-level-marker.patch \ + file://0032-Revert-x86-Move-x86-processor-cache-info-to-cpu_feat.patch \ " S = "${WORKDIR}/git" B = "${WORKDIR}/build-${TARGET_SYS}" -- 2.30.1
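A possible post-build sanity check (assumptions: a qemux86 image built with the patched glibc; the expected value is hardware dependent, typically 64 bytes on x86):
```
# Run on the target after rebuilding glibc with this revert applied.
# LEVEL1_ICACHE_LINESIZE should report the real line size again instead
# of being empty/unsupported.
getconf GNU_LIBC_VERSION
getconf LEVEL1_ICACHE_LINESIZE
```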