Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com> > On Oct 26, 2016, at 7:08 PM, George Kyriazis <george.kyria...@intel.com> > wrote: > > Used in abandoned all-or-nothing approach to converting to AVX512 > --- > .../drivers/swr/rasterizer/common/simdintrin.h | 633 --------------------- > .../drivers/swr/rasterizer/core/format_types.h | 189 ------ > src/gallium/drivers/swr/rasterizer/core/knobs.h | 5 - > src/gallium/drivers/swr/rasterizer/core/utils.h | 164 +----- > 4 files changed, 1 insertion(+), 990 deletions(-) > > diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h > b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h > index 7671031..10c0955 100644 > --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h > +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h > @@ -36,30 +36,6 @@ > typedef __m256 simdscalar; > typedef __m256i simdscalari; > typedef uint8_t simdmask; > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > -struct simdscalar > -{ > - __m256 lo; > - __m256 hi; > -}; > -struct simdscalard > -{ > - __m256d lo; > - __m256d hi; > -}; > -struct simdscalari > -{ > - __m256i lo; > - __m256i hi; > -}; > -typedef uint16_t simdmask; > -#else > -typedef __m512 simdscalar; > -typedef __m512d simdscalard; > -typedef __m512i simdscalari; > -typedef __mask16 simdmask; > -#endif > #else > #error Unsupported vector width > #endif > @@ -655,615 +631,6 @@ void _simdvec_transpose(simdvector &v) > SWR_ASSERT(false, "Need to implement 8 wide version"); > } > > -#elif KNOB_SIMD_WIDTH == 16 > - > -#if ENABLE_AVX512_EMULATION > - > -#define SIMD_EMU_AVX512_0(type, func, intrin) \ > -INLINE type func()\ > -{\ > - type result;\ > -\ > - result.lo = intrin();\ > - result.hi = intrin();\ > -\ > - return result;\ > -} > - > -#define SIMD_EMU_AVX512_1(type, func, intrin) \ > -INLINE type func(type a)\ > -{\ > - type result;\ > -\ > - result.lo = intrin(a.lo);\ > - result.hi = intrin(a.hi);\ > -\ > - return result;\ > -} > - > -#define SIMD_EMU_AVX512_2(type, func, intrin) \ > -INLINE type func(type a, type b)\ > -{\ > - type result;\ > -\ > - result.lo = intrin(a.lo, b.lo);\ > - result.hi = intrin(a.hi, b.hi);\ > -\ > - return result;\ > -} > - > -#define SIMD_EMU_AVX512_3(type, func, intrin) \ > -INLINE type func(type a, type b, type c)\ > -{\ > - type result;\ > -\ > - result.lo = intrin(a.lo, b.lo, c.lo);\ > - result.hi = intrin(a.hi, b.hi, c.hi);\ > -\ > - return result;\ > -} > - > -SIMD_EMU_AVX512_0(simdscalar, _simd_setzero_ps, _mm256_setzero_ps) > -SIMD_EMU_AVX512_0(simdscalari, _simd_setzero_si, _mm256_setzero_si256) > - > -INLINE simdscalar _simd_set1_ps(float a) > -{ > - simdscalar result; > - > - result.lo = _mm256_set1_ps(a); > - result.hi = _mm256_set1_ps(a); > - > - return result; > -} > - > -INLINE simdscalari _simd_set1_epi8(char a) > -{ > - simdscalari result; > - > - result.lo = _mm256_set1_epi8(a); > - result.hi = _mm256_set1_epi8(a); > - > - return result; > -} > - > -INLINE simdscalari _simd_set1_epi32(int a) > -{ > - simdscalari result; > - > - result.lo = _mm256_set1_epi32(a); > - result.hi = _mm256_set1_epi32(a); > - > - return result; > -} > - > -INLINE simdscalari _simd_set_epi32(int e7, int e6, int e5, int e4, int e3, > int e2, int e1, int e0) > -{ > - simdscalari result; > - > - result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); > - result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); > - > - return result; > -} > - > -INLINE simdscalari _simd_set_epi32(int e15, int e14, int e13, int e12, int > e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, > int e1, int e0) > -{ > - simdscalari result; > - > - result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); > - result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8); > - > - return result; > -} > - > -INLINE simdscalar _simd_load_ps(float const *m) > -{ > - float const *n = reinterpret_cast<float const > *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo)); > - > - simdscalar result; > - > - result.lo = _mm256_load_ps(m); > - result.hi = _mm256_load_ps(n); > - > - return result; > -} > - > -INLINE simdscalar _simd_loadu_ps(float const *m) > -{ > - float const *n = reinterpret_cast<float const > *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo)); > - > - simdscalar result; > - > - result.lo = _mm256_loadu_ps(m); > - result.hi = _mm256_loadu_ps(n); > - > - return result; > -} > - > -INLINE simdscalar _simd_load1_ps(float const *m) > -{ > - simdscalar result; > - > - result.lo = _mm256_broadcast_ss(m); > - result.hi = _mm256_broadcast_ss(m); > - > - return result; > -} > - > -INLINE simdscalari _simd_load_si(simdscalari const *m) > -{ > - simdscalari result; > - > - result.lo = _mm256_load_si256(&m[0].lo); > - result.hi = _mm256_load_si256(&m[0].hi); > - > - return result; > -} > - > -INLINE simdscalari _simd_loadu_si(simdscalari const *m) > -{ > - simdscalari result; > - > - result.lo = _mm256_loadu_si256(&m[0].lo); > - result.hi = _mm256_loadu_si256(&m[0].hi); > - > - return result; > -} > - > -INLINE simdscalar _simd_broadcast_ss(float const *m) > -{ > - simdscalar result; > - > - result.lo = _mm256_broadcast_ss(m); > - result.hi = _mm256_broadcast_ss(m); > - > - return result; > -} > - > -INLINE simdscalar _simd_broadcast_ps(__m128 const *m) > -{ > - simdscalar result; > - > - result.lo = _mm256_broadcast_ps(m); > - result.hi = _mm256_broadcast_ps(m); > - > - return result; > -} > - > -INLINE void _simd_store_ps(float *m, simdscalar a) > -{ > - float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + > sizeof(simdscalar::lo)); > - > - _mm256_store_ps(m, a.lo); > - _mm256_store_ps(n, a.hi); > -} > - > -INLINE void _simd_maskstore_ps(float *m, simdscalari mask, simdscalar a) > -{ > - float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + > sizeof(simdscalar::lo)); > - > - _mm256_maskstore_ps(m, mask.lo, a.lo); > - _mm256_maskstore_ps(n, mask.hi, a.hi); > -} > - > -INLINE void _simd_store_si(simdscalari *m, simdscalari a) > -{ > - _mm256_store_si256(&m[0].lo, a.lo); > - _mm256_store_si256(&m[0].hi, a.hi); > -} > - > -INLINE simdscalar _simd_blend_ps(simdscalar a, simdscalar b, const simdmask > mask) > -{ > - simdscalar result; > - > - result.lo = _mm256_blend_ps(a.lo, b.lo, reinterpret_cast<const uint8_t > *>(&mask)[0]); > - result.hi = _mm256_blend_ps(a.hi, b.hi, reinterpret_cast<const uint8_t > *>(&mask)[1]); > - > - return result; > -} > - > -SIMD_EMU_AVX512_3(simdscalar, _simd_blendv_ps, _mm256_blendv_ps) > - > -INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const > simdscalar mask) > -{ > - simdscalari result; > - > - result.lo = > _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), > _mm256_castsi256_ps(b.lo), mask.lo)); > - result.hi = > _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), > _mm256_castsi256_ps(b.hi), mask.hi)); > - > - return result; > -} > - > -INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const > simdscalari mask) > -{ > - simdscalari result; > - > - result.lo = > _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), > _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo))); > - result.hi = > _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), > _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi))); > - > - return result; > -} > - > -SIMD_EMU_AVX512_2(simdscalar, _simd_mul_ps, _mm256_mul_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_add_ps, _mm256_add_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_sub_ps, _mm256_sub_ps) > -SIMD_EMU_AVX512_1(simdscalar, _simd_rsqrt_ps, _mm256_rsqrt_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_min_ps, _mm256_min_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_max_ps, _mm256_max_ps) > - > -INLINE simdmask _simd_movemask_ps(simdscalar a) > -{ > - simdmask mask; > - > - reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo); > - reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi); > - > - return mask; > -} > - > -INLINE simdmask _simd_movemask_pd(simdscalard a) > -{ > - simdmask mask; > - > - reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo); > - reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi); > - > - return mask; > -} > - > -INLINE simdmask _simd_movemask_epi8(simdscalari a) > -{ > - simdmask mask; > - > - reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo); > - reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi); > - > - return mask; > -} > - > -INLINE simdscalari _simd_cvtps_epi32(simdscalar a) > -{ > - simdscalari result; > - > - result.lo = _mm256_cvtps_epi32(a.lo); > - result.hi = _mm256_cvtps_epi32(a.hi); > - > - return result; > -} > - > -INLINE simdscalari _simd_cvttps_epi32(simdscalar a) > -{ > - simdscalari result; > - > - result.lo = _mm256_cvttps_epi32(a.lo); > - result.hi = _mm256_cvttps_epi32(a.hi); > - > - return result; > -} > - > -INLINE simdscalar _simd_cvtepi32_ps(simdscalari a) > -{ > - simdscalar result; > - > - result.lo = _mm256_cvtepi32_ps(a.lo); > - result.hi = _mm256_cvtepi32_ps(a.hi); > - > - return result; > -} > - > -INLINE simdscalar _simd_cmp_ps(simdscalar a, simdscalar b, const int comp) > -{ > - simdscalar result; > - > - result.lo = _mm256_cmp_ps(a.lo, b.lo, comp); > - result.hi = _mm256_cmp_ps(a.hi, b.hi, comp); > - > - return result; > -} > - > -#define _simd_cmplt_ps(a, b) _simd_cmp_ps(a, b, _CMP_LT_OQ) > -#define _simd_cmpgt_ps(a, b) _simd_cmp_ps(a, b, _CMP_GT_OQ) > -#define _simd_cmpneq_ps(a, b) _simd_cmp_ps(a, b, _CMP_NEQ_OQ) > -#define _simd_cmpeq_ps(a, b) _simd_cmp_ps(a, b, _CMP_EQ_OQ) > -#define _simd_cmpge_ps(a, b) _simd_cmp_ps(a, b, _CMP_GE_OQ) > -#define _simd_cmple_ps(a, b) _simd_cmp_ps(a, b, _CMP_LE_OQ) > - > -SIMD_EMU_AVX512_2(simdscalar, _simd_and_ps, _mm256_and_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_or_ps, _mm256_or_ps) > -SIMD_EMU_AVX512_1(simdscalar, _simd_rcp_ps, _mm256_rcp_ps) > -SIMD_EMU_AVX512_2(simdscalar, _simd_div_ps, _mm256_div_ps) > - > -INLINE simdscalar _simd_castsi_ps(simdscalari a) > -{ > - return *reinterpret_cast<simdscalar *>(&a); > -} > - > -INLINE simdscalari _simd_castps_si(simdscalar a) > -{ > - return *reinterpret_cast<simdscalari *>(&a); > -} > - > -INLINE simdscalard _simd_castsi_pd(simdscalari a) > -{ > - return *reinterpret_cast<simdscalard *>(&a); > -} > - > -INLINE simdscalari _simd_castpd_si(simdscalard a) > -{ > - return *reinterpret_cast<simdscalari *>(&a); > -} > - > -INLINE simdscalar _simd_castpd_ps(simdscalard a) > -{ > - return *reinterpret_cast<simdscalar *>(&a); > -} > - > -INLINE simdscalard _simd_castps_pd(simdscalar a) > -{ > - return *reinterpret_cast<simdscalard *>(&a); > -} > - > -SIMD_EMU_AVX512_2(simdscalar, _simd_andnot_ps, _mm256_andnot_ps) > - > -INLINE simdscalar _simd_round_ps(simdscalar a, const int mode) > -{ > - simdscalar result; > - > - result.lo = _mm256_round_ps(a.lo, mode); > - result.hi = _mm256_round_ps(a.hi, mode); > - > - return result; > -} > - > -SIMD_EMU_AVX512_2(simdscalari, _simd_mul_epi32, _mm256_mul_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_mullo_epi32, _mm256_mullo_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi32, _mm256_sub_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi64, _mm256_sub_epi64) > -SIMD_EMU_AVX512_2(simdscalari, _simd_min_epi32, _mm256_min_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_max_epi32, _mm256_max_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_min_epu32, _mm256_min_epu32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_max_epu32, _mm256_max_epu32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi32, _mm256_add_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_and_si, _mm256_and_si256) > -SIMD_EMU_AVX512_2(simdscalari, _simd_andnot_si, _mm256_andnot_si256) > -SIMD_EMU_AVX512_2(simdscalari, _simd_or_si, _mm256_or_si256) > -SIMD_EMU_AVX512_2(simdscalari, _simd_xor_si, _mm256_xor_si256) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi32, _mm256_cmpeq_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi32, _mm256_cmpgt_epi32) > - > -INLINE int _simd_testz_ps(simdscalar a, simdscalar b) > -{ > - int lo = _mm256_testz_ps(a.lo, b.lo); > - int hi = _mm256_testz_ps(a.hi, b.hi); > - > - return lo & hi; > -} > - > -#define _simd_cmplt_epi32(a, b) _simd_cmpgt_epi32(b, a) > - > -SIMD_EMU_AVX512_2(simdscalari, _simd_unpacklo_epi32, _mm256_unpacklo_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_unpackhi_epi32, _mm256_unpackhi_epi32) > - > -INLINE simdscalari _simd_slli_epi32(simdscalari a, const int imm8) > -{ > - simdscalari result; > - > - result.lo = _mm256_slli_epi32(a.lo, imm8); > - result.hi = _mm256_slli_epi32(a.hi, imm8); > - > - return result; > -} > - > -INLINE simdscalari _simd_srai_epi32(simdscalari a, const int imm8) > -{ > - simdscalari result; > - > - result.lo = _mm256_srai_epi32(a.lo, imm8); > - result.hi = _mm256_srai_epi32(a.hi, imm8); > - > - return result; > -} > - > -INLINE simdscalari _simd_srli_epi32(simdscalari a, const int imm8) > -{ > - simdscalari result; > - > - result.lo = _mm256_srli_epi32(a.lo, imm8); > - result.hi = _mm256_srli_epi32(a.hi, imm8); > - > - return result; > -} > - > -#define _simd128_fmadd_ps _mm_fmadd_ps > - > -SIMD_EMU_AVX512_3(simdscalar, _simd_fmadd_ps, _mm256_fmadd_ps) > -SIMD_EMU_AVX512_3(simdscalar, _simd_fmsub_ps, _mm256_fmsub_ps) > - > -SIMD_EMU_AVX512_2(simdscalari, _simd_shuffle_epi8, _mm256_shuffle_epi8) > -SIMD_EMU_AVX512_2(simdscalari, _simd_adds_epu8, _mm256_adds_epu8) > -SIMD_EMU_AVX512_2(simdscalari, _simd_subs_epu8, _mm256_subs_epu8) > -SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi8, _mm256_add_epi8) > - > -INLINE simdscalar _simd_i32gather_ps(float const *m, simdscalari a, const > int imm8) > -{ > - simdscalar result; > - > - result.lo = _mm256_i32gather_ps(m, a.lo, imm8); > - result.hi = _mm256_i32gather_ps(m, a.hi, imm8); > - > - return result; > -} > - > -SIMD_EMU_AVX512_1(simdscalari, _simd_abs_epi32, _mm256_abs_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi64, _mm256_cmpeq_epi64) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi64, _mm256_cmpgt_epi64) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi16, _mm256_cmpeq_epi16) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi16, _mm256_cmpgt_epi16) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi8, _mm256_cmpeq_epi8) > -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi8, _mm256_cmpgt_epi8) > - > -INLINE simdscalar _simd_permute_ps(simdscalar a, simdscalari b) > -{ > - simdscalar result; > - > - result.lo = _mm256_permutevar8x32_ps(a.lo, b.lo); > - result.hi = _mm256_permutevar8x32_ps(a.hi, b.hi); > - > - return result; > -} > - > -SIMD_EMU_AVX512_2(simdscalari, _simd_permute_epi32, > _mm256_permutevar8x32_epi32) > - > -SIMD_EMU_AVX512_2(simdscalari, _simd_srlv_epi32, _mm256_srlv_epi32) > -SIMD_EMU_AVX512_2(simdscalari, _simd_sllv_epi32, _mm256_sllv_epi32) > - > -INLINE simdscalar _simd_shuffle_ps(simdscalar a, simdscalar b, const int > imm8) > -{ > - simdscalar result; > - > - result.lo = _mm256_shuffle_ps(a.lo, b.lo, imm8); > - result.hi = _mm256_shuffle_ps(a.hi, b.hi, imm8); > - > - return result; > -} > - > -// convert bitmask to vector mask > -INLINE simdscalar vMask(int32_t mask) > -{ > - simdscalari temp = _simd_set1_epi32(mask); > - > - simdscalari bits = _simd_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, > 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, > 0x0004, 0x0002, 0x0001); > - > - simdscalari result = _simd_cmplt_epi32(_simd_setzero_si(), > _simd_and_si(temp, bits)); > - > - return _simd_castsi_ps(result); > -} > - > -#else > - > -INLINE __m512 _m512_broadcast_ss(void const *m) > -{ > - return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0); > -} > - > -INLINE __m512 _m512_broadcast_ps(void const *m) > -{ > - return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0); > -} > - > -INLINE __m512 _m512_blend_ps(__m512 a, __m512 b, const int mask) > -{ > - const __mask16 mask16 = _mm512_int2mask(mask); > - > - return _mm512_mask_blend_ps(mask16, a, b); > -} > - > -INLINE __m512 _m512_blendv_ps(__m512 a, __m512 b, __m512 mask) > -{ > - const __mask16 mask16 = _mm512_cmpeq_ps_mask(mask, _mm512_setzero_ps()); > - > - return _mm512_mask_blend_ps(mask16, a, b); > -} > - > -INLINE int _m512_movemask_ps(__m512 a) > -{ > - __m512 mask = _mm512_set1_epi32(0x80000000); > - > - __m512 temp = _mm512_and_epi32(a, mask); > - > - const __mask16 mask16 = _mm512_cmpeq_epu32_mask(temp, mask); > - > - return _mm512mask2int(mask16); > -} > - > -INLINE int _m512_movemask_pd(__m512 a) > -{ > - __m512 mask = _mm512_set1_epi64(0x8000000000000000); > - > - __m512 temp = _mm512_and_epi64(a, mask); > - > - const __mask16 mask16 = _mm512_cmpeq_epu64_mask(temp, mask); > - > - return _mm512mask2int(mask16); > -} > - > -INLINE __m512 _m512_cmp_ps(__m512 a, __m512 b, __m512 comp) > -{ > - const __mask16 mask16 = _mm512_cmpeq_ps_mask(a, b, comp); > - > - return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), > _mm512_set1_epi32(0xFFFFFFFF)); > -} > - > -INLINE __m512 _mm512_cmplt_epi32(__m512 a, __m512 b) > -{ > - const __mask16 mask16 = _mm512_cmplt_epi32_mask(a, b); > - > - return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), > _mm512_set1_epi32(0xFFFFFFFF)); > -} > - > -INLINE __m512 _mm512_cmpgt_epi32(__m512 a, __m512 b) > -{ > - const __mask16 mask16 = _mm512_cmpgt_epi32_mask(a, b); > - > - return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), > _mm512_set1_epi32(0xFFFFFFFF)); > -} > - > -#define _simd_load_ps _mm512_load_ps > -#define _simd_load1_ps _mm256_broadcast_ss > -#define _simd_loadu_ps _mm512_loadu_ps > -#define _simd_setzero_ps _mm512_setzero_ps > -#define _simd_set1_ps _mm512_set1_ps > -#define _simd_blend_ps _mm512_blend_ps > -#define _simd_blendv_ps _mm512_blendv_ps > -#define _simd_store_ps _mm512_store_ps > -#define _simd_mul_ps _mm512_mul_ps > -#define _simd_add_ps _mm512_add_ps > -#define _simd_sub_ps _mm512_sub_ps > -#define _simd_rsqrt_ps _mm512_rsqrt28_ps > -#define _simd_min_ps _mm512_min_ps > -#define _simd_max_ps _mm512_max_ps > -#define _simd_movemask_ps _mm512_movemask_ps > -#define _simd_cvtps_epi32 _mm512_cvtps_epi32 > -#define _simd_cvttps_epi32 _mm512_cvttps_epi32 > -#define _simd_cvtepi32_ps _mm512_cvtepi32_ps > -#define _simd_cmplt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LT_OQ) > -#define _simd_cmpgt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GT_OQ) > -#define _simd_cmpneq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_NEQ_OQ) > -#define _simd_cmpeq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_EQ_OQ) > -#define _simd_cmpge_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GE_OQ) > -#define _simd_cmple_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LE_OQ) > -#define _simd_cmp_ps(a, b, comp) _mm512_cmp_ps(a, b, comp) > -#define _simd_and_ps _mm512_and_ps > -#define _simd_or_ps _mm512_or_ps > -#define _simd_rcp_ps _mm512_rcp28_ps > -#define _simd_div_ps _mm512_div_ps > -#define _simd_castsi_ps _mm512_castsi512_ps > -#define _simd_andnot_ps _mm512_andnot_ps > -#define _simd_round_ps _mm512_round_ps > -#define _simd_castpd_ps _mm512_castpd_ps > -#define _simd_broadcast_ps _m512_broadcast_ps > -#define _simd_movemask_pd _mm512_movemask_pd > -#define _simd_castsi_pd _mm512_castsi512_pd > - > -#define _simd_mul_epi32 _mm512_mul_epi32 > -#define _simd_mullo_epi32 _mm512_mullo_epi32 > -#define _simd_sub_epi32 _mm512_sub_epi32 > -#define _simd_sub_epi64 _mm512_sub_epi64 > -#define _simd_min_epi32 _mm512_min_epi32 > -#define _simd_max_epi32 _mm512_max_epi32 > -#define _simd_min_epu32 _mm512_min_epu32 > -#define _simd_max_epu32 _mm512_max_epu32 > -#define _simd_add_epi32 _mm512_add_epi32 > -#define _simd_and_si _mm512_and_si512 > -#define _simd_andnot_si _mm512_andnot_si512 > -#define _simd_cmpeq_epi32 _mm512_cmpeq_epi32 > -#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) > -#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) > -#define _simd_or_si _mm512_or_si512 > -#define _simd_castps_si _mm512_castps_si512 > - > -#endif > - > #else > #error Unsupported vector width > #endif > diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h > b/src/gallium/drivers/swr/rasterizer/core/format_types.h > index fcb137d..a242924 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h > +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h > @@ -82,16 +82,6 @@ struct PackTraits<8, false> > __m256 result = _mm256_setzero_ps(); > __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); > return _mm256_insertf128_ps(result, vLo, 0); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar result = _simd_setzero_ps(); > - > - __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); > - > - result.lo = _mm256_insertf128_ps(result.lo, src, 0); > - > - return result; > -#endif > #else > #error Unsupported vector width > #endif > @@ -102,10 +92,6 @@ struct PackTraits<8, false> > // store simd bytes > #if KNOB_SIMD_WIDTH == 8 > _mm_storel_pd((double*)pDst, > _mm_castps_pd(_mm256_castps256_ps128(src))); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - _mm_store_ps(reinterpret_cast<float*>(pDst), > _mm256_castps256_ps128(src.lo)); > -#endif > #else > #error Unsupported vector width > #endif > @@ -126,18 +112,6 @@ struct PackTraits<8, false> > #elif KNOB_ARCH>=KNOB_ARCH_AVX2 > return > _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo)); > - > - result.lo = _mm256_cvtepu8_epi32(src); > - > - result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8)); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -150,20 +124,6 @@ struct PackTraits<8, false> > __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), > _mm256_extractf128_si256(src, 1)); > __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); > return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result = _simd_setzero_si(); > - > - __m128i templo = > _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), > _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1)); > - > - __m128i temphi = > _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), > _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1)); > - > - __m128i temp = _mm_packus_epi16(templo, temphi); > - > - result.lo = _mm256_insertf128_si256(result.lo, temp, 0); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -233,16 +193,6 @@ struct PackTraits<8, true> > __m256 result = _mm256_setzero_ps(); > __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); > return _mm256_insertf128_ps(result, vLo, 0); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar result = _simd_setzero_ps(); > - > - __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); > - > - result.lo = _mm256_insertf128_ps(result.lo, src, 0); > - > - return result; > -#endif > #else > #error Unsupported vector width > #endif > @@ -253,10 +203,6 @@ struct PackTraits<8, true> > // store simd bytes > #if KNOB_SIMD_WIDTH == 8 > _mm_storel_pd((double*)pDst, > _mm_castps_pd(_mm256_castps256_ps128(src))); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - _mm_store_ps(reinterpret_cast<float*>(pDst), > _mm256_castps256_ps128(src.lo)); > -#endif > #else > #error Unsupported vector width > #endif > @@ -278,18 +224,6 @@ struct PackTraits<8, true> > #elif KNOB_ARCH>=KNOB_ARCH_AVX2 > return > _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo)); > - > - result.lo = _mm256_cvtepu8_epi32(src); > - > - result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8)); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -302,20 +236,6 @@ struct PackTraits<8, true> > __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), > _mm256_extractf128_si256(src, 1)); > __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); > return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result = _simd_setzero_si(); > - > - __m128i templo = > _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), > _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1)); > - > - __m128i temphi = > _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), > _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1)); > - > - __m128i temp = _mm_packs_epi16(templo, temphi); > - > - result.lo = _mm256_insertf128_si256(result.lo, temp, 0); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -385,16 +305,6 @@ struct PackTraits<16, false> > __m256 result = _mm256_setzero_ps(); > __m128 vLo = _mm_load_ps((const float*)pSrc); > return _mm256_insertf128_ps(result, vLo, 0); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar result; > - > - result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc)); > - > - result.hi = _mm256_undefined_ps(); > - > - return result; > -#endif > #else > #error Unsupported vector width > #endif > @@ -405,10 +315,6 @@ struct PackTraits<16, false> > #if KNOB_SIMD_WIDTH == 8 > // store 16B (2B * 8) > _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo); > -#endif > #else > #error Unsupported vector width > #endif > @@ -429,16 +335,6 @@ struct PackTraits<16, false> > #elif KNOB_ARCH>=KNOB_ARCH_AVX2 > return > _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - result.lo = > _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), > 0)); > - > - result.hi = > _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), > 1)); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -450,21 +346,6 @@ struct PackTraits<16, false> > simdscalari src = _simd_castps_si(in); > __m256i res = > _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), > _mm256_extractf128_si256(src, 1))); > return _mm256_castsi256_ps(res); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - __m256i inlo = _mm256_castps_si256(in.lo); > - __m256i inhi = _mm256_castps_si256(in.hi); > - > - __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20); > - __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31); > - > - result.lo = _mm256_packus_epi32(templo, temphi); > - result.hi = _mm256_undefined_si256(); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -528,16 +409,6 @@ struct PackTraits<16, true> > __m256 result = _mm256_setzero_ps(); > __m128 vLo = _mm_load_ps((const float*)pSrc); > return _mm256_insertf128_ps(result, vLo, 0); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar result; > - > - result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc)); > - > - result.hi = _mm256_undefined_ps(); > - > - return result; > -#endif > #else > #error Unsupported vector width > #endif > @@ -548,10 +419,6 @@ struct PackTraits<16, true> > #if KNOB_SIMD_WIDTH == 8 > // store 16B (2B * 8) > _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo); > -#endif > #else > #error Unsupported vector width > #endif > @@ -573,16 +440,6 @@ struct PackTraits<16, true> > #elif KNOB_ARCH>=KNOB_ARCH_AVX2 > return > _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - result.lo = > _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), > 0)); > - > - result.hi = > _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), > 1)); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -594,21 +451,6 @@ struct PackTraits<16, true> > simdscalari src = _simd_castps_si(in); > __m256i res = > _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), > _mm256_extractf128_si256(src, 1))); > return _mm256_castsi256_ps(res); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari result; > - > - __m256i inlo = _mm256_castps_si256(in.lo); > - __m256i inhi = _mm256_castps_si256(in.hi); > - > - __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20); > - __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31); > - > - result.lo = _mm256_packs_epi32(templo, temphi); > - result.hi = _mm256_undefined_si256(); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -1193,20 +1035,6 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : > PackTraits<16> > #else > return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, > _MM_FROUND_TRUNC))); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > -simdscalari result; > - > - __m128i templo = _mm256_cvtps_ph(in.lo, _MM_FROUND_TRUNC); > - __m128i temphi = _mm256_cvtps_ph(in.hi, _MM_FROUND_TRUNC); > - > - result.lo = _mm256_castsi128_si256(templo); > - result.lo = _mm256_insertf128_si256(result.lo, temphi, 1); > - > - result.hi = _mm256_undefined_si256(); > - > - return _simd_castsi_ps(result); > -#endif > #else > #error Unsupported vector width > #endif > @@ -1275,23 +1103,6 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : > PackTraits<32> > in = _mm256_insertf128_ps(in, srcLo, 0); > in = _mm256_insertf128_ps(in, srcHi, 1); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - __m128 inlo0 = _mm256_extractf128_ps(in.lo, 0); > - __m128 inlo1 = _mm256_extractf128_ps(in.lo, 1); > - __m128 inhi0 = _mm256_extractf128_ps(in.hi, 0); > - __m128 inhi1 = _mm256_extractf128_ps(in.hi, 1); > - > - inlo0 = ConvertFloatToSRGB2(inlo0); > - inlo1 = ConvertFloatToSRGB2(inlo1); > - inhi0 = ConvertFloatToSRGB2(inhi0); > - inhi1 = ConvertFloatToSRGB2(inhi1); > - > - in.lo = _mm256_insertf128_ps(in.lo, inlo0, 0); > - in.lo = _mm256_insertf128_ps(in.lo, inlo1, 1); > - in.hi = _mm256_insertf128_ps(in.hi, inhi0, 0); > - in.hi = _mm256_insertf128_ps(in.hi, inhi1, 1); > -#endif > #else > #error Unsupported vector width > #endif > diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h > b/src/gallium/drivers/swr/rasterizer/core/knobs.h > index b108526..bbe15c1 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h > +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h > @@ -141,8 +141,6 @@ > > #if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 > #error "incompatible width/tile dimensions" > -#elif KNOB_SIMD_WIDTH==16 && KNOB_TILE_X_DIM < 4 > -#error "incompatible width/tile dimensions" > #endif > > #if ENABLE_AVX512_SIMD16 > @@ -154,9 +152,6 @@ > #if KNOB_SIMD_WIDTH == 8 > #define SIMD_TILE_X_DIM 4 > #define SIMD_TILE_Y_DIM 2 > -#elif KNOB_SIMD_WIDTH == 16 > -#define SIMD_TILE_X_DIM 4 > -#define SIMD_TILE_Y_DIM 4 > #else > #error "Invalid simd width" > #endif > diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h > b/src/gallium/drivers/swr/rasterizer/core/utils.h > index dd4fa3e..91a994e 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/utils.h > +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h > @@ -145,7 +145,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i > &row2, __m128i &row3) > #endif > #endif > > -#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16 > +#if KNOB_SIMD_WIDTH == 8 > INLINE > void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 > &vSrc2) > { > @@ -288,20 +288,6 @@ struct Transpose8_8_8_8 > simdscalari dst = _mm256_or_si256(dst01, dst23); > _simd_store_si((simdscalari*)pDst, dst); > #endif > -#elif KNOB_SIMD_WIDTH == 16 > - simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, > 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800); > - > - simdscalari dst01 = _simd_shuffle_epi8(src, mask0); > - > - simdscalari perm1 = _simd_permute_128(src, src, 1); > - > - simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, > 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080); > - > - simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1); > - > - simdscalari dst = _simd_or_si(dst01, dst23); > - > - _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); > #else > #error Unsupported vector width > #endif > @@ -363,16 +349,6 @@ struct Transpose8_8 > __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg > gggggggg > rg = _mm_unpacklo_epi8(rg, g); > _mm_store_si128((__m128i*)pDst, rg); > -#elif KNOB_SIMD_WIDTH == 16 > - __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i > *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg > - > - __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = > 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx > - > - __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = > 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx > - > - __m256i dst = _mm256_unpacklo_epi8(r, g); > // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg > - > - _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst); > #else > #error Unsupported vector width > #endif > @@ -421,37 +397,6 @@ struct Transpose32_32_32_32 > _mm_store_ps((float*)pDst+20, vDst[5]); > _mm_store_ps((float*)pDst+24, vDst[6]); > _mm_store_ps((float*)pDst+28, vDst[7]); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar src0 = _simd_load_ps(reinterpret_cast<const > float*>(pSrc)); > - simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) > + 16); > - simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) > + 32); > - simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) > + 48); > - > - __m128 vDst[8]; > - > - vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo); > - > - _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); > - > - vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi); > - > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); > -#endif > #else > #error Unsupported vector width > #endif > @@ -528,36 +473,6 @@ struct Transpose32_32_32 > _mm_store_ps((float*)pDst + 20, vDst[5]); > _mm_store_ps((float*)pDst + 24, vDst[6]); > _mm_store_ps((float*)pDst + 28, vDst[7]); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalar src0 = _simd_load_ps(reinterpret_cast<const > float*>(pSrc)); > - simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) > + 16); > - simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) > + 32); > - > - __m128 vDst[8]; > - > - vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo); > - > - _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); > - > - vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi); > - > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); > - _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); > -#endif > #else > #error Unsupported vector width > #endif > @@ -635,23 +550,6 @@ struct Transpose32_32 > _mm_store_ps(pfDst + 4, dst1); > _mm_store_ps(pfDst + 8, dst2); > _mm_store_ps(pfDst + 12, dst3); > -#elif KNOB_SIMD_WIDTH == 16 > - const float* pfSrc = (const float*)pSrc; > - __m256 src_r0 = _mm256_load_ps(pfSrc + 0); > - __m256 src_r1 = _mm256_load_ps(pfSrc + 8); > - __m256 src_g0 = _mm256_load_ps(pfSrc + 16); > - __m256 src_g1 = _mm256_load_ps(pfSrc + 24); > - > - __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0); > - __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0); > - __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1); > - __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1); > - > - float* pfDst = (float*)pDst; > - _mm256_store_ps(pfDst + 0, dst0); > - _mm256_store_ps(pfDst + 8, dst1); > - _mm256_store_ps(pfDst + 16, dst2); > - _mm256_store_ps(pfDst + 24, dst3); > #else > #error Unsupported vector width > #endif > @@ -716,31 +614,6 @@ struct Transpose16_16_16_16 > _mm_store_si128(((__m128i*)pDst) + 1, dst1); > _mm_store_si128(((__m128i*)pDst) + 2, dst2); > _mm_store_si128(((__m128i*)pDst) + 3, dst3); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari src_rg = _simd_load_si(reinterpret_cast<const > simdscalari*>(pSrc)); > - simdscalari src_ba = _simd_load_si(reinterpret_cast<const > simdscalari*>(pSrc + sizeof(simdscalari))); > - > - __m256i src_r = src_rg.lo; > - __m256i src_g = src_rg.hi; > - __m256i src_b = src_ba.lo; > - __m256i src_a = src_ba.hi; > - > - __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); > - __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); > - __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); > - __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); > - > - __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); > - __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); > - __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); > - __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); > - > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); > -#endif > #else > #error Unsupported vector width > #endif > @@ -808,30 +681,6 @@ struct Transpose16_16_16 > _mm_store_si128(((__m128i*)pDst) + 1, dst1); > _mm_store_si128(((__m128i*)pDst) + 2, dst2); > _mm_store_si128(((__m128i*)pDst) + 3, dst3); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari src_rg = _simd_load_si(reinterpret_cast<const > simdscalari*>(pSrc)); > - > - __m256i src_r = src_rg.lo; > - __m256i src_g = src_rg.hi; > - __m256i src_b = _mm256_load_si256(reinterpret_cast<const > __m256i*>(pSrc + sizeof(simdscalari))); > - __m256i src_a = _mm256_undefined_si256(); > - > - __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); > - __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); > - __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); > - __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); > - > - __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); > - __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); > - __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); > - __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); > - > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); > - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); > -#endif > #else > #error Unsupported vector width > #endif > @@ -890,17 +739,6 @@ struct Transpose16_16 > > _mm_store_si128((__m128i*)pDst, resLo); > _mm_store_si128((__m128i*)pDst + 1, resHi); > -#elif KNOB_SIMD_WIDTH == 16 > -#if ENABLE_AVX512_EMULATION > - simdscalari src = > _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc))); > - > - simdscalari result; > - > - result.lo = _mm256_unpacklo_epi16(src.lo, src.hi); > - result.hi = _mm256_unpackhi_epi16(src.lo, src.hi); > - > - _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result); > -#endif > #else > #error Unsupported vector width > #endif > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev