On 13/09/2016 22:57, Richard Henderson wrote: > -#if defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && > defined(__SSE2__)) > -#include <cpuid.h> > - > +#if (defined(CONFIG_AVX2_OPT) && defined(CONFIG_CPUID_H)) || > defined(__SSE2__)
Your __SSE2__ version is better than mine which required cpuid.h just to simplify the logic a bit. On the other hand, CONFIG_CPUID_H is not needed in CONFIG_AVX2_OPT, because the test already requires cpuid.h. > +#ifdef CONFIG_CPUID_H > +# define INIT_CACHE > +# define INIT_ACCEL > +#else > +# ifndef __SSE2__ > +# error "ISA selection confusion" > +# endif > +# define INIT_CACHE = CACHE_SSE2 > +# define INIT_ACCEL = buffer_zero_sse2 > #endif This is ugly, any reason not to initialize INIT_CACHE/INIT_ACCEL to respectively 0 and NULL, or 0 and buffer_zero_int in the #ifdef CONFIG_CPUID_H case? > -#define CACHE_AVX2 2 > -#define CACHE_AVX1 4 > -#define CACHE_SSE4 8 > -#define CACHE_SSE2 16 > +static unsigned cpuid_cache INIT_CACHE; > +static bool (*buffer_accel)(const void *, size_t) INIT_ACCEL; > > -static unsigned cpuid_cache; > +#undef INIT_CACHE > +#undef INIT_ACCEL The #undef is not really necessary since this file hardly has anything after the toplevel #endif. Just tell me which changes you agree with, I can make them locally. Paolo > +static void init_accel(unsigned cache) > +{ > + bool (*fn)(const void *, size_t) = buffer_zero_int; > + if (cache & CACHE_SSE2) { > + fn = buffer_zero_sse2; > + } > +#ifdef CONFIG_AVX2_OPT > + if (cache & CACHE_SSE4) { > + fn = buffer_zero_sse4; > + } > + if (cache & CACHE_AVX2) { > + fn = buffer_zero_avx2; > + } > +#endif > + buffer_accel = fn; > +} > + > +#ifdef CONFIG_CPUID_H > +#include <cpuid.h> > static void __attribute__((constructor)) init_cpuid_cache(void) > { > int max = __get_cpuid_max(0, NULL); > @@ -145,24 +245,23 @@ static void __attribute__((constructor)) > init_cpuid_cache(void) > cache |= CACHE_SSE4; > } > > +#ifdef bit_AVX2 > /* We must check that AVX is not just available, but usable. */ > - if ((c & bit_OSXSAVE) && (c & bit_AVX)) { > - __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0)); > - if ((a & 6) == 6) { > - cache |= CACHE_AVX1; > - if (max >= 7) { > - __cpuid_count(7, 0, a, b, c, d); > - if (b & bit_AVX2) { > - cache |= CACHE_AVX2; > - } > - } > + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { > + int bv; > + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); > + __cpuid_count(7, 0, a, b, c, d); > + if ((bv & 6) == 6 && (b & bit_AVX2)) { > + cache |= CACHE_AVX2; > } > } > +#endif > } > cpuid_cache = cache; > + init_accel(cache); > } > +#endif /* CONFIG_CPUID_H */ > > -#define HAVE_NEXT_ACCEL > bool test_buffer_is_zero_next_accel(void) > { > /* If no bits set, we just tested buffer_zero_int, and there > @@ -172,31 +271,20 @@ bool test_buffer_is_zero_next_accel(void) > } > /* Disable the accelerator we used before and select a new one. */ > cpuid_cache &= cpuid_cache - 1; > + init_accel(cpuid_cache); > return true; > } > > static bool select_accel_fn(const void *buf, size_t len) > { > - uintptr_t ibuf = (uintptr_t)buf; > -#ifdef CONFIG_AVX2_OPT > - if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) { > - return buffer_zero_avx2(buf, len); > - } > - if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) { > - return buffer_zero_sse4(buf, len); > - } > -#endif > - if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) { > - return buffer_zero_sse2(buf, len); > + if (likely(len >= 64)) { > + return buffer_accel(buf, len); > } > return buffer_zero_int(buf, len); > } > > #else > #define select_accel_fn buffer_zero_int > -#endif > - > -#ifndef HAVE_NEXT_ACCEL > bool test_buffer_is_zero_next_accel(void) > { > return false; >