Series looks good to me.

Jose
----- Original Message ----- > From: Roland Scheidegger <srol...@vmware.com> > > Going to need this soon (not going to bother with avx2 intrinsics at this > time > but don't want to do workarounds for true vector shifts if llvm itself can > use > them just fine and won't need the gazillion instruction emulation). > Not really tested other than my cpu returns 0 for these features... > (I have no idea if llvm actually would emit avx2/xop instructions neither...) > --- > src/gallium/auxiliary/gallivm/lp_bld_init.c | 11 ++++-- > src/gallium/auxiliary/util/u_cpu_detect.c | 48 > +++++++++++++++++++++++++++ > src/gallium/auxiliary/util/u_cpu_detect.h | 2 ++ > 3 files changed, 59 insertions(+), 2 deletions(-) > > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c > b/src/gallium/auxiliary/gallivm/lp_bld_init.c > index 61eadb8..61b561f 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c > @@ -461,12 +461,15 @@ lp_build_init(void) > lp_native_vector_width); > > if (lp_native_vector_width <= 128) { > - /* Hide AVX support, as often LLVM AVX instrinsics are only guarded by > + /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by > * "util_cpu_caps.has_avx" predicate, and lack the > * "lp_native_vector_width > 128" predicate. And also to ensure a more > * consistent behavior, allowing one to test SSE2 on AVX machines. > + * XXX: should not play games with util_cpu_caps directly as it might > + * get used for other things outside llvm too. > */ > util_cpu_caps.has_avx = 0; > + util_cpu_caps.has_avx2 = 0; > } > > if (!HAVE_AVX) { > @@ -476,13 +479,17 @@ lp_build_init(void) > * omit it unnecessarily on amd cpus, see above). > */ > util_cpu_caps.has_f16c = 0; > + util_cpu_caps.has_xop = 0; > } > > #ifdef PIPE_ARCH_PPC_64 > /* Set the NJ bit in VSCR to 0 so denormalized values are handled as > - * specified by IEEE standard (PowerISA 2.06 - Section 6.3). 
This > garantees > + * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This > guarantees > * that some rounding and half-float to float handling does not round > * incorrectly to 0. > + * XXX: should eventually follow same logic on all platforms. > + * Right now denorms get explicitly disabled (but elsewhere) for x86, > + * whereas ppc64 explicitly enables them... > */ > if (util_cpu_caps.has_altivec) { > unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, > diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c > b/src/gallium/auxiliary/util/u_cpu_detect.c > index 87ad780..2ff40bb 100644 > --- a/src/gallium/auxiliary/util/u_cpu_detect.c > +++ b/src/gallium/auxiliary/util/u_cpu_detect.c > @@ -212,6 +212,44 @@ cpuid(uint32_t ax, uint32_t *p) > #endif > } > > +/** > + * @sa cpuid.h included in gcc-4.4 onwards. > + * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx > + */ > +static INLINE void > +cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) > +{ > +#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && > defined(PIPE_ARCH_X86) > + __asm __volatile ( > + "xchgl %%ebx, %1\n\t" > + "cpuid\n\t" > + "xchgl %%ebx, %1" > + : "=a" (p[0]), > + "=S" (p[1]), > + "=c" (p[2]), > + "=d" (p[3]) > + : "0" (ax), "2" (cx) > + ); > +#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && > defined(PIPE_ARCH_X86_64) > + __asm __volatile ( > + "cpuid\n\t" > + : "=a" (p[0]), > + "=b" (p[1]), > + "=c" (p[2]), > + "=d" (p[3]) > + : "0" (ax), "2" (cx) > + ); > +#elif defined(PIPE_CC_MSVC) > + __cpuidex(p, ax, cx); > +#else > + p[0] = 0; > + p[1] = 0; > + p[2] = 0; > + p[3] = 0; > +#endif > +} > + > + > static INLINE uint64_t xgetbv(void) > { > #if defined(PIPE_CC_GCC) > @@ -341,6 +379,11 @@ util_cpu_detect(void) > if (cacheline > 0) > util_cpu_caps.cacheline = cacheline; > } > + if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { > + uint32_t regs7[4]; > + cpuid_count(0x00000007, 0x00000000, regs7); > + util_cpu_caps.has_avx2 = (regs7[1] >> 
5) & 1; > + } > > if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == > 0x49656e69) { > /* GenuineIntel */ > @@ -357,6 +400,9 @@ util_cpu_detect(void) > util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; > util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; > util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; > + > + util_cpu_caps.has_xop = util_cpu_caps.has_avx && > + ((regs2[2] >> 11) & 1); > } > > if (regs[0] >= 0x80000006) { > @@ -394,10 +440,12 @@ util_cpu_detect(void) > debug_printf("util_cpu_caps.has_sse4_1 = %u\n", > util_cpu_caps.has_sse4_1); > debug_printf("util_cpu_caps.has_sse4_2 = %u\n", > util_cpu_caps.has_sse4_2); > debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); > + debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); > debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); > debug_printf("util_cpu_caps.has_popcnt = %u\n", > util_cpu_caps.has_popcnt); > debug_printf("util_cpu_caps.has_3dnow = %u\n", > util_cpu_caps.has_3dnow); > debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", > util_cpu_caps.has_3dnow_ext); > + debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); > debug_printf("util_cpu_caps.has_altivec = %u\n", > util_cpu_caps.has_altivec); > debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); > } > diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h > b/src/gallium/auxiliary/util/u_cpu_detect.h > index cc3e0ce..5ccfc93 100644 > --- a/src/gallium/auxiliary/util/u_cpu_detect.h > +++ b/src/gallium/auxiliary/util/u_cpu_detect.h > @@ -64,9 +64,11 @@ struct util_cpu_caps { > unsigned has_sse4_2:1; > unsigned has_popcnt:1; > unsigned has_avx:1; > + unsigned has_avx2:1; > unsigned has_f16c:1; > unsigned has_3dnow:1; > unsigned has_3dnow_ext:1; > + unsigned has_xop:1; > unsigned has_altivec:1; > unsigned has_daz:1; > }; > -- > 1.7.9.5 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
http://lists.freedesktop.org/mailman/listinfo/mesa-dev