Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com> 

> On Oct 26, 2016, at 7:08 PM, George Kyriazis <george.kyria...@intel.com> 
> wrote:
> 
> Remove code that was used in the abandoned all-or-nothing approach to converting to AVX512.
> ---
> .../drivers/swr/rasterizer/common/simdintrin.h     | 633 ---------------------
> .../drivers/swr/rasterizer/core/format_types.h     | 189 ------
> src/gallium/drivers/swr/rasterizer/core/knobs.h    |   5 -
> src/gallium/drivers/swr/rasterizer/core/utils.h    | 164 +-----
> 4 files changed, 1 insertion(+), 990 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h 
> b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> index 7671031..10c0955 100644
> --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> @@ -36,30 +36,6 @@
> typedef __m256 simdscalar;
> typedef __m256i simdscalari;
> typedef uint8_t simdmask;
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -struct simdscalar
> -{
> -    __m256  lo;
> -    __m256  hi;
> -};
> -struct simdscalard
> -{
> -    __m256d lo;
> -    __m256d hi;
> -};
> -struct simdscalari
> -{
> -    __m256i lo;
> -    __m256i hi;
> -};
> -typedef uint16_t simdmask;
> -#else
> -typedef __m512 simdscalar;
> -typedef __m512d simdscalard;
> -typedef __m512i simdscalari;
> -typedef __mask16 simdmask;
> -#endif
> #else
> #error Unsupported vector width
> #endif
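
For context, the block removed just above defined the 16-wide emulation types as pairs of 256-bit halves ("double pumping"). A minimal sketch of that pattern, using illustrative names rather than the driver's own, would look like:

    #include <immintrin.h>

    // Hypothetical stand-in for the removed simdscalar when KNOB_SIMD_WIDTH == 16:
    // sixteen float lanes modeled as two 8-wide AVX halves.
    struct emu16_ps
    {
        __m256 lo;   // lanes 0..7
        __m256 hi;   // lanes 8..15
    };

    // Every emulated operation simply forwards to both halves.
    static inline emu16_ps emu16_add_ps(emu16_ps a, emu16_ps b)
    {
        emu16_ps r;
        r.lo = _mm256_add_ps(a.lo, b.lo);
        r.hi = _mm256_add_ps(a.hi, b.hi);
        return r;
    }
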
> @@ -655,615 +631,6 @@ void _simdvec_transpose(simdvector &v)
>     SWR_ASSERT(false, "Need to implement 8 wide version");
> }
> 
> -#elif KNOB_SIMD_WIDTH == 16
> -
> -#if ENABLE_AVX512_EMULATION
> -
> -#define SIMD_EMU_AVX512_0(type, func, intrin) \
> -INLINE type func()\
> -{\
> -    type result;\
> -\
> -    result.lo = intrin();\
> -    result.hi = intrin();\
> -\
> -    return result;\
> -}
> -
> -#define SIMD_EMU_AVX512_1(type, func, intrin) \
> -INLINE type func(type a)\
> -{\
> -    type result;\
> -\
> -    result.lo = intrin(a.lo);\
> -    result.hi = intrin(a.hi);\
> -\
> -    return result;\
> -}
> -
> -#define SIMD_EMU_AVX512_2(type, func, intrin) \
> -INLINE type func(type a, type b)\
> -{\
> -    type result;\
> -\
> -    result.lo = intrin(a.lo, b.lo);\
> -    result.hi = intrin(a.hi, b.hi);\
> -\
> -    return result;\
> -}
> -
> -#define SIMD_EMU_AVX512_3(type, func, intrin) \
> -INLINE type func(type a, type b, type c)\
> -{\
> -    type result;\
> -\
> -    result.lo = intrin(a.lo, b.lo, c.lo);\
> -    result.hi = intrin(a.hi, b.hi, c.hi);\
> -\
> -    return result;\
> -}
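
These helper macros stamp out one wrapper per intrinsic, applying it to both halves. For example, the invocation SIMD_EMU_AVX512_2(simdscalar, _simd_add_ps, _mm256_add_ps) further down expands to:

    INLINE simdscalar _simd_add_ps(simdscalar a, simdscalar b)
    {
        simdscalar result;

        result.lo = _mm256_add_ps(a.lo, b.lo);
        result.hi = _mm256_add_ps(a.hi, b.hi);

        return result;
    }
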
> -
> -SIMD_EMU_AVX512_0(simdscalar, _simd_setzero_ps, _mm256_setzero_ps)
> -SIMD_EMU_AVX512_0(simdscalari, _simd_setzero_si, _mm256_setzero_si256)
> -
> -INLINE simdscalar _simd_set1_ps(float a)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_set1_ps(a);
> -    result.hi = _mm256_set1_ps(a);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_set1_epi8(char a)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_set1_epi8(a);
> -    result.hi = _mm256_set1_epi8(a);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_set1_epi32(int a)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_set1_epi32(a);
> -    result.hi = _mm256_set1_epi32(a);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_set_epi32(int e7, int e6, int e5, int e4, int e3, 
> int e2, int e1, int e0)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
> -    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_set_epi32(int e15, int e14, int e13, int e12, int 
> e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, 
> int e1, int e0)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
> -    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_load_ps(float const *m)
> -{
> -    float const *n = reinterpret_cast<float const 
> *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo));
> -
> -    simdscalar result;
> -
> -    result.lo = _mm256_load_ps(m);
> -    result.hi = _mm256_load_ps(n);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_loadu_ps(float const *m)
> -{
> -    float const *n = reinterpret_cast<float const 
> *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo));
> -
> -    simdscalar result;
> -
> -    result.lo = _mm256_loadu_ps(m);
> -    result.hi = _mm256_loadu_ps(n);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_load1_ps(float const *m)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_broadcast_ss(m);
> -    result.hi = _mm256_broadcast_ss(m);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_load_si(simdscalari const *m)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_load_si256(&m[0].lo);
> -    result.hi = _mm256_load_si256(&m[0].hi);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_loadu_si(simdscalari const *m)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_loadu_si256(&m[0].lo);
> -    result.hi = _mm256_loadu_si256(&m[0].hi);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_broadcast_ss(float const *m)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_broadcast_ss(m);
> -    result.hi = _mm256_broadcast_ss(m);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_broadcast_ps(__m128 const *m)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_broadcast_ps(m);
> -    result.hi = _mm256_broadcast_ps(m);
> -
> -    return result;
> -}
> -
> -INLINE void _simd_store_ps(float *m, simdscalar a)
> -{
> -    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + 
> sizeof(simdscalar::lo));
> -
> -    _mm256_store_ps(m, a.lo);
> -    _mm256_store_ps(n, a.hi);
> -}
> -
> -INLINE void _simd_maskstore_ps(float *m, simdscalari mask, simdscalar a)
> -{
> -    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + 
> sizeof(simdscalar::lo));
> -
> -    _mm256_maskstore_ps(m, mask.lo, a.lo);
> -    _mm256_maskstore_ps(n, mask.hi, a.hi);
> -}
> -
> -INLINE void _simd_store_si(simdscalari *m, simdscalari a)
> -{
> -    _mm256_store_si256(&m[0].lo, a.lo);
> -    _mm256_store_si256(&m[0].hi, a.hi);
> -}
> -
> -INLINE simdscalar _simd_blend_ps(simdscalar a, simdscalar b, const simdmask 
> mask)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_blend_ps(a.lo, b.lo, reinterpret_cast<const uint8_t 
> *>(&mask)[0]);
> -    result.hi = _mm256_blend_ps(a.hi, b.hi, reinterpret_cast<const uint8_t 
> *>(&mask)[1]);
> -
> -    return result;
> -}
> -
> -SIMD_EMU_AVX512_3(simdscalar, _simd_blendv_ps, _mm256_blendv_ps)
> -
> -INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const 
> simdscalar mask)
> -{
> -    simdscalari result;
> -
> -    result.lo = 
> _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), 
> _mm256_castsi256_ps(b.lo), mask.lo));
> -    result.hi = 
> _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), 
> _mm256_castsi256_ps(b.hi), mask.hi));
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const 
> simdscalari mask)
> -{
> -    simdscalari result;
> -
> -    result.lo = 
> _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), 
> _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
> -    result.hi = 
> _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), 
> _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
> -
> -    return result;
> -}
> -
> -SIMD_EMU_AVX512_2(simdscalar, _simd_mul_ps, _mm256_mul_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_add_ps, _mm256_add_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_sub_ps, _mm256_sub_ps)
> -SIMD_EMU_AVX512_1(simdscalar, _simd_rsqrt_ps, _mm256_rsqrt_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_min_ps, _mm256_min_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_max_ps, _mm256_max_ps)
> -
> -INLINE simdmask _simd_movemask_ps(simdscalar a)
> -{
> -    simdmask mask;
> -
> -    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
> -    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);
> -
> -    return mask;
> -}
> -
> -INLINE simdmask _simd_movemask_pd(simdscalard a)
> -{
> -    simdmask mask;
> -
> -    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
> -    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);
> -
> -    return mask;
> -}
> -
> -INLINE simdmask _simd_movemask_epi8(simdscalari a)
> -{
> -    simdmask mask;
> -
> -    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
> -    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);
> -
> -    return mask;
> -}
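
The three movemask emulations above build a 16-bit mask by storing each half's 8-bit result into one byte of the simdmask. An equivalent shift-and-OR formulation (a sketch only, not driver code) makes the byte layout explicit:

    #include <immintrin.h>
    #include <cstdint>

    static inline uint16_t emu16_movemask_ps(__m256 lo, __m256 hi)
    {
        uint16_t mlo = static_cast<uint16_t>(_mm256_movemask_ps(lo)); // bits 0..7
        uint16_t mhi = static_cast<uint16_t>(_mm256_movemask_ps(hi)); // bits 8..15
        return static_cast<uint16_t>(mlo | (mhi << 8));
    }
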
> -
> -INLINE simdscalari _simd_cvtps_epi32(simdscalar a)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_cvtps_epi32(a.lo);
> -    result.hi = _mm256_cvtps_epi32(a.hi);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_cvttps_epi32(simdscalar a)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_cvttps_epi32(a.lo);
> -    result.hi = _mm256_cvttps_epi32(a.hi);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_cvtepi32_ps(simdscalari a)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_cvtepi32_ps(a.lo);
> -    result.hi = _mm256_cvtepi32_ps(a.hi);
> -
> -    return result;
> -}
> -
> -INLINE simdscalar _simd_cmp_ps(simdscalar a, simdscalar b, const int comp)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
> -    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
> -
> -    return result;
> -}
> -
> -#define _simd_cmplt_ps(a, b) _simd_cmp_ps(a, b, _CMP_LT_OQ)
> -#define _simd_cmpgt_ps(a, b) _simd_cmp_ps(a, b, _CMP_GT_OQ)
> -#define _simd_cmpneq_ps(a, b) _simd_cmp_ps(a, b, _CMP_NEQ_OQ)
> -#define _simd_cmpeq_ps(a, b) _simd_cmp_ps(a, b, _CMP_EQ_OQ)
> -#define _simd_cmpge_ps(a, b) _simd_cmp_ps(a, b, _CMP_GE_OQ)
> -#define _simd_cmple_ps(a, b) _simd_cmp_ps(a, b, _CMP_LE_OQ)
> -
> -SIMD_EMU_AVX512_2(simdscalar, _simd_and_ps, _mm256_and_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_or_ps, _mm256_or_ps)
> -SIMD_EMU_AVX512_1(simdscalar, _simd_rcp_ps, _mm256_rcp_ps)
> -SIMD_EMU_AVX512_2(simdscalar, _simd_div_ps, _mm256_div_ps)
> -
> -INLINE simdscalar _simd_castsi_ps(simdscalari a)
> -{
> -    return *reinterpret_cast<simdscalar *>(&a);
> -}
> -
> -INLINE simdscalari _simd_castps_si(simdscalar a)
> -{
> -    return *reinterpret_cast<simdscalari *>(&a);
> -}
> -
> -INLINE simdscalard _simd_castsi_pd(simdscalari a)
> -{
> -    return *reinterpret_cast<simdscalard *>(&a);
> -}
> -
> -INLINE simdscalari _simd_castpd_si(simdscalard a)
> -{
> -    return *reinterpret_cast<simdscalari *>(&a);
> -}
> -
> -INLINE simdscalar _simd_castpd_ps(simdscalard a)
> -{
> -    return *reinterpret_cast<simdscalar *>(&a);
> -}
> -
> -INLINE simdscalard _simd_castps_pd(simdscalar a)
> -{
> -    return *reinterpret_cast<simdscalard *>(&a);
> -}
> -
> -SIMD_EMU_AVX512_2(simdscalar, _simd_andnot_ps, _mm256_andnot_ps)
> -
> -INLINE simdscalar _simd_round_ps(simdscalar a, const int mode)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_round_ps(a.lo, mode);
> -    result.hi = _mm256_round_ps(a.hi, mode);
> -
> -    return result;
> -}
> -
> -SIMD_EMU_AVX512_2(simdscalari, _simd_mul_epi32, _mm256_mul_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_mullo_epi32, _mm256_mullo_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi32, _mm256_sub_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi64, _mm256_sub_epi64)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_min_epi32, _mm256_min_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_max_epi32, _mm256_max_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_min_epu32, _mm256_min_epu32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_max_epu32, _mm256_max_epu32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi32, _mm256_add_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_and_si, _mm256_and_si256)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_andnot_si, _mm256_andnot_si256)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_or_si, _mm256_or_si256)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_xor_si, _mm256_xor_si256)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi32, _mm256_cmpeq_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi32, _mm256_cmpgt_epi32)
> -
> -INLINE int _simd_testz_ps(simdscalar a, simdscalar b)
> -{
> -    int lo = _mm256_testz_ps(a.lo, b.lo);
> -    int hi = _mm256_testz_ps(a.hi, b.hi);
> -
> -    return lo & hi;
> -}
> -
> -#define _simd_cmplt_epi32(a, b) _simd_cmpgt_epi32(b, a)
> -
> -SIMD_EMU_AVX512_2(simdscalari, _simd_unpacklo_epi32, _mm256_unpacklo_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_unpackhi_epi32, _mm256_unpackhi_epi32)
> -
> -INLINE simdscalari _simd_slli_epi32(simdscalari a, const int imm8)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_slli_epi32(a.lo, imm8);
> -    result.hi = _mm256_slli_epi32(a.hi, imm8);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_srai_epi32(simdscalari a, const int imm8)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_srai_epi32(a.lo, imm8);
> -    result.hi = _mm256_srai_epi32(a.hi, imm8);
> -
> -    return result;
> -}
> -
> -INLINE simdscalari _simd_srli_epi32(simdscalari a, const int imm8)
> -{
> -    simdscalari result;
> -
> -    result.lo = _mm256_srli_epi32(a.lo, imm8);
> -    result.hi = _mm256_srli_epi32(a.hi, imm8);
> -
> -    return result;
> -}
> -
> -#define _simd128_fmadd_ps _mm_fmadd_ps
> -
> -SIMD_EMU_AVX512_3(simdscalar, _simd_fmadd_ps, _mm256_fmadd_ps)
> -SIMD_EMU_AVX512_3(simdscalar, _simd_fmsub_ps, _mm256_fmsub_ps)
> -
> -SIMD_EMU_AVX512_2(simdscalari, _simd_shuffle_epi8, _mm256_shuffle_epi8)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_adds_epu8, _mm256_adds_epu8)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_subs_epu8, _mm256_subs_epu8)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi8, _mm256_add_epi8)
> -
> -INLINE simdscalar _simd_i32gather_ps(float const *m, simdscalari a, const 
> int imm8)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_i32gather_ps(m, a.lo, imm8);
> -    result.hi = _mm256_i32gather_ps(m, a.hi, imm8);
> -
> -    return result;
> -}
> -
> -SIMD_EMU_AVX512_1(simdscalari, _simd_abs_epi32, _mm256_abs_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi64, _mm256_cmpeq_epi64)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi64, _mm256_cmpgt_epi64)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi16, _mm256_cmpeq_epi16)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi16, _mm256_cmpgt_epi16)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi8, _mm256_cmpeq_epi8)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi8, _mm256_cmpgt_epi8)
> -
> -INLINE simdscalar _simd_permute_ps(simdscalar a, simdscalari b)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_permutevar8x32_ps(a.lo, b.lo);
> -    result.hi = _mm256_permutevar8x32_ps(a.hi, b.hi);
> -
> -    return result;
> -}
> -
> -SIMD_EMU_AVX512_2(simdscalari, _simd_permute_epi32, 
> _mm256_permutevar8x32_epi32)
> -
> -SIMD_EMU_AVX512_2(simdscalari, _simd_srlv_epi32, _mm256_srlv_epi32)
> -SIMD_EMU_AVX512_2(simdscalari, _simd_sllv_epi32, _mm256_sllv_epi32)
> -
> -INLINE simdscalar _simd_shuffle_ps(simdscalar a, simdscalar b, const int 
> imm8)
> -{
> -    simdscalar result;
> -
> -    result.lo = _mm256_shuffle_ps(a.lo, b.lo, imm8);
> -    result.hi = _mm256_shuffle_ps(a.hi, b.hi, imm8);
> -
> -    return result;
> -}
> -
> -// convert bitmask to vector mask
> -INLINE simdscalar vMask(int32_t mask)
> -{
> -    simdscalari temp = _simd_set1_epi32(mask);
> -
> -    simdscalari bits = _simd_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 
> 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 
> 0x0004, 0x0002, 0x0001);
> -
> -    simdscalari result = _simd_cmplt_epi32(_simd_setzero_si(), 
> _simd_and_si(temp, bits));
> -
> -    return _simd_castsi_ps(result);
> -}
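
vMask() turns a per-lane coverage bitmask into a vector mask (all-ones or all-zeros per lane) so it can drive blendv-style operations. A hypothetical 8-wide AVX2 version of the same idea, shown only for illustration:

    #include <immintrin.h>
    #include <cstdint>

    static inline __m256 vmask8(int32_t mask)
    {
        const __m256i bits = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10,
                                              0x08, 0x04, 0x02, 0x01);
        __m256i m   = _mm256_set1_epi32(mask);
        __m256i sel = _mm256_cmpgt_epi32(_mm256_and_si256(m, bits),
                                         _mm256_setzero_si256());
        return _mm256_castsi256_ps(sel);  // lane i is all-ones iff bit i of mask is set
    }

    // Typical use: write only the covered lanes.
    //   dst = _mm256_blendv_ps(dst, src, vmask8(coverageMask));
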
> -
> -#else
> -
> -INLINE __m512 _m512_broadcast_ss(void const *m)
> -{
> -    return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0);
> -}
> -
> -INLINE __m512 _m512_broadcast_ps(void const *m)
> -{
> -    return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0);
> -}
> -
> -INLINE __m512 _m512_blend_ps(__m512 a, __m512 b, const int mask)
> -{
> -    const __mask16 mask16 = _mm512_int2mask(mask);
> -
> -    return _mm512_mask_blend_ps(mask16, a, b);
> -}
> -
> -INLINE __m512 _m512_blendv_ps(__m512 a, __m512 b, __m512 mask)
> -{
> -    const __mask16 mask16 = _mm512_cmpeq_ps_mask(mask, _mm512_setzero_ps());
> -
> -    return _mm512_mask_blend_ps(mask16, a, b);
> -}
> -
> -INLINE int _m512_movemask_ps(__m512 a)
> -{
> -    __m512 mask = _mm512_set1_epi32(0x80000000);
> -
> -    __m512 temp = _mm512_and_epi32(a, mask);
> -
> -    const __mask16 mask16 = _mm512_cmpeq_epu32_mask(temp, mask);
> -
> -    return _mm512mask2int(mask16);
> -}
> -
> -INLINE int _m512_movemask_pd(__m512 a)
> -{
> -    __m512 mask = _mm512_set1_epi64(0x8000000000000000);
> -
> -    __m512 temp = _mm512_and_epi64(a, mask);
> -
> -    const __mask16 mask16 = _mm512_cmpeq_epu64_mask(temp, mask);
> -
> -    return _mm512mask2int(mask16);
> -}
> -
> -INLINE __m512 _m512_cmp_ps(__m512 a, __m512 b, __m512 comp)
> -{
> -    const __mask16 mask16 = _mm512_cmpeq_ps_mask(a, b, comp);
> -
> -    return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), 
> _mm512_set1_epi32(0xFFFFFFFF));
> -}
> -
> -INLINE __m512 _mm512_cmplt_epi32(__m512 a, __m512 b)
> -{
> -    const __mask16 mask16 = _mm512_cmplt_epi32_mask(a, b);
> -
> -    return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), 
> _mm512_set1_epi32(0xFFFFFFFF));
> -}
> -
> -INLINE __m512 _mm512_cmpgt_epi32(__m512 a, __m512 b)
> -{
> -    const __mask16 mask16 = _mm512_cmpgt_epi32_mask(a, b);
> -
> -    return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), 
> _mm512_set1_epi32(0xFFFFFFFF));
> -}
> -
> -#define _simd_load_ps _mm512_load_ps
> -#define _simd_load1_ps _mm256_broadcast_ss
> -#define _simd_loadu_ps _mm512_loadu_ps
> -#define _simd_setzero_ps _mm512_setzero_ps
> -#define _simd_set1_ps _mm512_set1_ps
> -#define _simd_blend_ps  _mm512_blend_ps
> -#define _simd_blendv_ps _mm512_blendv_ps
> -#define _simd_store_ps _mm512_store_ps
> -#define _simd_mul_ps _mm512_mul_ps
> -#define _simd_add_ps _mm512_add_ps
> -#define _simd_sub_ps _mm512_sub_ps
> -#define _simd_rsqrt_ps _mm512_rsqrt28_ps
> -#define _simd_min_ps _mm512_min_ps
> -#define _simd_max_ps _mm512_max_ps
> -#define _simd_movemask_ps _mm512_movemask_ps
> -#define _simd_cvtps_epi32 _mm512_cvtps_epi32
> -#define _simd_cvttps_epi32 _mm512_cvttps_epi32
> -#define _simd_cvtepi32_ps _mm512_cvtepi32_ps
> -#define _simd_cmplt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LT_OQ)
> -#define _simd_cmpgt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GT_OQ)
> -#define _simd_cmpneq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_NEQ_OQ)
> -#define _simd_cmpeq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_EQ_OQ)
> -#define _simd_cmpge_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GE_OQ)
> -#define _simd_cmple_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LE_OQ)
> -#define _simd_cmp_ps(a, b, comp) _mm512_cmp_ps(a, b, comp)
> -#define _simd_and_ps _mm512_and_ps
> -#define _simd_or_ps _mm512_or_ps
> -#define _simd_rcp_ps _mm512_rcp28_ps
> -#define _simd_div_ps _mm512_div_ps
> -#define _simd_castsi_ps _mm512_castsi512_ps
> -#define _simd_andnot_ps _mm512_andnot_ps
> -#define _simd_round_ps _mm512_round_ps
> -#define _simd_castpd_ps _mm512_castpd_ps
> -#define _simd_broadcast_ps _m512_broadcast_ps
> -#define _simd_movemask_pd _mm512_movemask_pd
> -#define _simd_castsi_pd _mm512_castsi512_pd
> -
> -#define _simd_mul_epi32 _mm512_mul_epi32
> -#define _simd_mullo_epi32 _mm512_mullo_epi32
> -#define _simd_sub_epi32 _mm512_sub_epi32
> -#define _simd_sub_epi64 _mm512_sub_epi64
> -#define _simd_min_epi32 _mm512_min_epi32
> -#define _simd_max_epi32 _mm512_max_epi32
> -#define _simd_min_epu32 _mm512_min_epu32
> -#define _simd_max_epu32 _mm512_max_epu32
> -#define _simd_add_epi32 _mm512_add_epi32
> -#define _simd_and_si _mm512_and_si512
> -#define _simd_andnot_si _mm512_andnot_si512
> -#define _simd_cmpeq_epi32 _mm512_cmpeq_epi32
> -#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
> -#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
> -#define _simd_or_si _mm512_or_si512
> -#define _simd_castps_si _mm512_castps_si512
> -
> -#endif
> -
> #else
> #error Unsupported vector width
> #endif
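
The second branch removed above was the never-enabled native path, which maps _simd_* straight onto _mm512_* and keeps per-lane predicates in mask registers instead of full vectors. A minimal sketch of that style (assumes an AVX-512F target; the names are illustrative):

    #include <immintrin.h>

    static inline __m512 select_where_lt(__m512 a, __m512 b, __m512 x, __m512 y)
    {
        // One bit per lane, set where a < b (ordered, quiet).
        __mmask16 k = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
        // Take y in lanes where the bit is set, x elsewhere.
        return _mm512_mask_blend_ps(k, x, y);
    }
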
> diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h 
> b/src/gallium/drivers/swr/rasterizer/core/format_types.h
> index fcb137d..a242924 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
> @@ -82,16 +82,6 @@ struct PackTraits<8, false>
>         __m256 result = _mm256_setzero_ps();
>         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
>         return _mm256_insertf128_ps(result, vLo, 0);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar result = _simd_setzero_ps();
> -
> -        __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
> -
> -        result.lo = _mm256_insertf128_ps(result.lo, src, 0);
> -
> -        return result;
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -102,10 +92,6 @@ struct PackTraits<8, false>
>         // store simd bytes
> #if KNOB_SIMD_WIDTH == 8
>         _mm_storel_pd((double*)pDst, 
> _mm_castps_pd(_mm256_castps256_ps128(src)));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        _mm_store_ps(reinterpret_cast<float*>(pDst), 
> _mm256_castps256_ps128(src.lo));
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -126,18 +112,6 @@ struct PackTraits<8, false>
> #elif KNOB_ARCH>=KNOB_ARCH_AVX2
>         return 
> _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
> -
> -        result.lo = _mm256_cvtepu8_epi32(src);
> -
> -        result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
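
The removed unpack path widened sixteen unsigned bytes into sixteen 32-bit lanes split across the two halves. The same pattern in isolation (illustrative names; requires AVX2):

    #include <immintrin.h>

    struct emu16_epi32 { __m256i lo; __m256i hi; };

    static inline emu16_epi32 widen_u8_to_u32(__m128i bytes16)
    {
        emu16_epi32 r;
        r.lo = _mm256_cvtepu8_epi32(bytes16);                     // bytes 0..7
        r.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(bytes16, 8));  // bytes 8..15
        return r;
    }
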
> @@ -150,20 +124,6 @@ struct PackTraits<8, false>
>         __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), 
> _mm256_extractf128_si256(src, 1));
>         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
>         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result = _simd_setzero_si();
> -
> -        __m128i templo = 
> _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), 
> _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
> -
> -        __m128i temphi = 
> _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), 
> _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
> -
> -        __m128i temp = _mm_packus_epi16(templo, temphi);
> -
> -        result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -233,16 +193,6 @@ struct PackTraits<8, true>
>         __m256 result = _mm256_setzero_ps();
>         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
>         return _mm256_insertf128_ps(result, vLo, 0);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar result = _simd_setzero_ps();
> -
> -        __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
> -
> -        result.lo = _mm256_insertf128_ps(result.lo, src, 0);
> -
> -        return result;
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -253,10 +203,6 @@ struct PackTraits<8, true>
>         // store simd bytes
> #if KNOB_SIMD_WIDTH == 8
>         _mm_storel_pd((double*)pDst, 
> _mm_castps_pd(_mm256_castps256_ps128(src)));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        _mm_store_ps(reinterpret_cast<float*>(pDst), 
> _mm256_castps256_ps128(src.lo));
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -278,18 +224,6 @@ struct PackTraits<8, true>
> #elif KNOB_ARCH>=KNOB_ARCH_AVX2
>         return 
> _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
> -
> -        result.lo = _mm256_cvtepu8_epi32(src);
> -
> -        result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -302,20 +236,6 @@ struct PackTraits<8, true>
>         __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), 
> _mm256_extractf128_si256(src, 1));
>         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
>         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result = _simd_setzero_si();
> -
> -        __m128i templo = 
> _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), 
> _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
> -
> -        __m128i temphi = 
> _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), 
> _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
> -
> -        __m128i temp = _mm_packs_epi16(templo, temphi);
> -
> -        result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -385,16 +305,6 @@ struct PackTraits<16, false>
>         __m256 result = _mm256_setzero_ps();
>         __m128 vLo = _mm_load_ps((const float*)pSrc);
>         return _mm256_insertf128_ps(result, vLo, 0);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar result;
> -
> -        result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
> -
> -        result.hi = _mm256_undefined_ps();
> -
> -        return result;
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -405,10 +315,6 @@ struct PackTraits<16, false>
> #if KNOB_SIMD_WIDTH == 8
>         // store 16B (2B * 8)
>         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -429,16 +335,6 @@ struct PackTraits<16, false>
> #elif KNOB_ARCH>=KNOB_ARCH_AVX2
>         return 
> _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        result.lo = 
> _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 
> 0));
> -
> -        result.hi = 
> _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 
> 1));
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -450,21 +346,6 @@ struct PackTraits<16, false>
>         simdscalari src = _simd_castps_si(in);
>         __m256i res = 
> _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), 
> _mm256_extractf128_si256(src, 1)));
>         return _mm256_castsi256_ps(res);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        __m256i inlo = _mm256_castps_si256(in.lo);
> -        __m256i inhi = _mm256_castps_si256(in.hi);
> -
> -        __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
> -        __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
> -
> -        result.lo = _mm256_packus_epi32(templo, temphi);
> -        result.hi = _mm256_undefined_si256();
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -528,16 +409,6 @@ struct PackTraits<16, true>
>         __m256 result = _mm256_setzero_ps();
>         __m128 vLo = _mm_load_ps((const float*)pSrc);
>         return _mm256_insertf128_ps(result, vLo, 0);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar result;
> -
> -        result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
> -
> -        result.hi = _mm256_undefined_ps();
> -
> -        return result;
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -548,10 +419,6 @@ struct PackTraits<16, true>
> #if KNOB_SIMD_WIDTH == 8
>         // store 16B (2B * 8)
>         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -573,16 +440,6 @@ struct PackTraits<16, true>
> #elif KNOB_ARCH>=KNOB_ARCH_AVX2
>         return 
> _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        result.lo = 
> _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 
> 0));
> -
> -        result.hi = 
> _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 
> 1));
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -594,21 +451,6 @@ struct PackTraits<16, true>
>         simdscalari src = _simd_castps_si(in);
>         __m256i res = 
> _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), 
> _mm256_extractf128_si256(src, 1)));
>         return _mm256_castsi256_ps(res);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari result;
> -
> -        __m256i inlo = _mm256_castps_si256(in.lo);
> -        __m256i inhi = _mm256_castps_si256(in.hi);
> -
> -        __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
> -        __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
> -
> -        result.lo = _mm256_packs_epi32(templo, temphi);
> -        result.hi = _mm256_undefined_si256();
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -1193,20 +1035,6 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : 
> PackTraits<16>
> #else
>         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, 
> _MM_FROUND_TRUNC)));
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -simdscalari result;
> -
> -        __m128i templo = _mm256_cvtps_ph(in.lo, _MM_FROUND_TRUNC);
> -        __m128i temphi = _mm256_cvtps_ph(in.hi, _MM_FROUND_TRUNC);
> -
> -        result.lo = _mm256_castsi128_si256(templo);
> -        result.lo = _mm256_insertf128_si256(result.lo, temphi, 1);
> -
> -        result.hi = _mm256_undefined_si256();
> -
> -        return _simd_castsi_ps(result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
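
The removed half-float pack converts each 8-wide half with F16C and places the two 128-bit results side by side in the low half of the result. The core of that operation in isolation (a sketch; requires F16C/AVX):

    #include <immintrin.h>

    static inline __m256i pack16_f32_to_f16(__m256 lo, __m256 hi)
    {
        __m128i hlo = _mm256_cvtps_ph(lo, _MM_FROUND_TRUNC);  // halves of lanes 0..7
        __m128i hhi = _mm256_cvtps_ph(hi, _MM_FROUND_TRUNC);  // halves of lanes 8..15
        return _mm256_insertf128_si256(_mm256_castsi128_si256(hlo), hhi, 1);
    }
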
> @@ -1275,23 +1103,6 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : 
> PackTraits<32>
>         in = _mm256_insertf128_ps(in, srcLo, 0);
>         in = _mm256_insertf128_ps(in, srcHi, 1);
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        __m128 inlo0 = _mm256_extractf128_ps(in.lo, 0);
> -        __m128 inlo1 = _mm256_extractf128_ps(in.lo, 1);
> -        __m128 inhi0 = _mm256_extractf128_ps(in.hi, 0);
> -        __m128 inhi1 = _mm256_extractf128_ps(in.hi, 1);
> -
> -        inlo0 = ConvertFloatToSRGB2(inlo0);
> -        inlo1 = ConvertFloatToSRGB2(inlo1);
> -        inhi0 = ConvertFloatToSRGB2(inhi0);
> -        inhi1 = ConvertFloatToSRGB2(inhi1);
> -
> -        in.lo = _mm256_insertf128_ps(in.lo, inlo0, 0);
> -        in.lo = _mm256_insertf128_ps(in.lo, inlo1, 1);
> -        in.hi = _mm256_insertf128_ps(in.hi, inhi0, 0);
> -        in.hi = _mm256_insertf128_ps(in.hi, inhi1, 1);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h 
> b/src/gallium/drivers/swr/rasterizer/core/knobs.h
> index b108526..bbe15c1 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
> @@ -141,8 +141,6 @@
> 
> #if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4
> #error "incompatible width/tile dimensions"
> -#elif KNOB_SIMD_WIDTH==16 && KNOB_TILE_X_DIM < 4
> -#error "incompatible width/tile dimensions"
> #endif
> 
> #if ENABLE_AVX512_SIMD16
> @@ -154,9 +152,6 @@
> #if KNOB_SIMD_WIDTH == 8
> #define SIMD_TILE_X_DIM 4
> #define SIMD_TILE_Y_DIM 2
> -#elif KNOB_SIMD_WIDTH == 16
> -#define SIMD_TILE_X_DIM 4
> -#define SIMD_TILE_Y_DIM 4
> #else
> #error "Invalid simd width"
> #endif
> diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h 
> b/src/gallium/drivers/swr/rasterizer/core/utils.h
> index dd4fa3e..91a994e 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/utils.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
> @@ -145,7 +145,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i 
> &row2, __m128i &row3)
> #endif
> #endif
> 
> -#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
> +#if KNOB_SIMD_WIDTH == 8
> INLINE
> void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 
> &vSrc2)
> {
> @@ -288,20 +288,6 @@ struct Transpose8_8_8_8
>         simdscalari dst = _mm256_or_si256(dst01, dst23);
>         _simd_store_si((simdscalari*)pDst, dst);
> #endif
> -#elif KNOB_SIMD_WIDTH == 16
> -        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 
> 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
> -
> -        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);
> -
> -        simdscalari perm1 = _simd_permute_128(src, src, 1);
> -
> -        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 
> 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
> -
> -        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);
> -
> -        simdscalari dst = _simd_or_si(dst01, dst23);
> -
> -        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
> #else
> #error Unsupported vector width
> #endif
> @@ -363,16 +349,6 @@ struct Transpose8_8
>         __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg 
> gggggggg
>         rg = _mm_unpacklo_epi8(rg, g);
>         _mm_store_si128((__m128i*)pDst, rg);
> -#elif KNOB_SIMD_WIDTH == 16
> -        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i 
> *>(pSrc));   // rrrrrrrrrrrrrrrrgggggggggggggggg
> -
> -        __m256i r = _mm256_permute4x64_epi64(src, 0x50);    // 0x50 = 
> 01010000b     // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
> -
> -        __m256i g = _mm256_permute4x64_epi64(src, 0xFA);    // 0xFA = 
> 11111010b     // ggggggggxxxxxxxxggggggggxxxxxxxx
> -
> -        __m256i dst = _mm256_unpacklo_epi8(r, g);                            
>        // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
> -
> -        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
> #else
> #error Unsupported vector width
> #endif
> @@ -421,37 +397,6 @@ struct Transpose32_32_32_32
>         _mm_store_ps((float*)pDst+20, vDst[5]);
>         _mm_store_ps((float*)pDst+24, vDst[6]);
>         _mm_store_ps((float*)pDst+28, vDst[7]);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar src0 = _simd_load_ps(reinterpret_cast<const 
> float*>(pSrc));
> -        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) 
> + 16);
> -        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) 
> + 32);
> -        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) 
> + 48);
> -
> -        __m128 vDst[8];
> -
> -        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);
> -
> -        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
> -
> -        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);
> -
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -528,36 +473,6 @@ struct Transpose32_32_32
>         _mm_store_ps((float*)pDst + 20, vDst[5]);
>         _mm_store_ps((float*)pDst + 24, vDst[6]);
>         _mm_store_ps((float*)pDst + 28, vDst[7]);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalar src0 = _simd_load_ps(reinterpret_cast<const 
> float*>(pSrc));
> -        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) 
> + 16);
> -        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) 
> + 32);
> -
> -        __m128 vDst[8];
> -
> -        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);
> -
> -        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
> -
> -        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);
> -
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
> -        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -635,23 +550,6 @@ struct Transpose32_32
>         _mm_store_ps(pfDst + 4, dst1);
>         _mm_store_ps(pfDst + 8, dst2);
>         _mm_store_ps(pfDst + 12, dst3);
> -#elif KNOB_SIMD_WIDTH == 16
> -        const float* pfSrc = (const float*)pSrc;
> -        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
> -        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
> -        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
> -        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
> -
> -        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
> -        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
> -        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
> -        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
> -
> -        float* pfDst = (float*)pDst;
> -        _mm256_store_ps(pfDst + 0, dst0);
> -        _mm256_store_ps(pfDst + 8, dst1);
> -        _mm256_store_ps(pfDst + 16, dst2);
> -        _mm256_store_ps(pfDst + 24, dst3);
> #else
> #error Unsupported vector width
> #endif
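
The removed 16-wide Transpose32_32 interleaves a plane of R floats with a plane of G floats into RG pairs using unpacklo/unpackhi. The same step on a single 8-wide pair (a sketch; note that AVX unpacks operate within each 128-bit lane, and the loads assume 32-byte-aligned pointers):

    #include <immintrin.h>

    static inline void interleave_rg8(const float* r, const float* g, float* dst)
    {
        __m256 vr = _mm256_load_ps(r);
        __m256 vg = _mm256_load_ps(g);
        _mm256_store_ps(dst + 0, _mm256_unpacklo_ps(vr, vg)); // r0 g0 r1 g1 | r4 g4 r5 g5
        _mm256_store_ps(dst + 8, _mm256_unpackhi_ps(vr, vg)); // r2 g2 r3 g3 | r6 g6 r7 g7
    }
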
> @@ -716,31 +614,6 @@ struct Transpose16_16_16_16
>         _mm_store_si128(((__m128i*)pDst) + 1, dst1);
>         _mm_store_si128(((__m128i*)pDst) + 2, dst2);
>         _mm_store_si128(((__m128i*)pDst) + 3, dst3);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari src_rg = _simd_load_si(reinterpret_cast<const 
> simdscalari*>(pSrc));
> -        simdscalari src_ba = _simd_load_si(reinterpret_cast<const 
> simdscalari*>(pSrc + sizeof(simdscalari)));
> -
> -        __m256i src_r = src_rg.lo;
> -        __m256i src_g = src_rg.hi;
> -        __m256i src_b = src_ba.lo;
> -        __m256i src_a = src_ba.hi;
> -
> -        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
> -        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
> -        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
> -        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
> -
> -        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
> -        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
> -        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
> -        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
> -
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -808,30 +681,6 @@ struct Transpose16_16_16
>         _mm_store_si128(((__m128i*)pDst) + 1, dst1);
>         _mm_store_si128(((__m128i*)pDst) + 2, dst2);
>         _mm_store_si128(((__m128i*)pDst) + 3, dst3);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari src_rg = _simd_load_si(reinterpret_cast<const 
> simdscalari*>(pSrc));
> -
> -        __m256i src_r = src_rg.lo;
> -        __m256i src_g = src_rg.hi;
> -        __m256i src_b = _mm256_load_si256(reinterpret_cast<const 
> __m256i*>(pSrc + sizeof(simdscalari)));
> -        __m256i src_a = _mm256_undefined_si256();
> -
> -        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
> -        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
> -        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
> -        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
> -
> -        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
> -        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
> -        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
> -        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
> -
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
> -        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> @@ -890,17 +739,6 @@ struct Transpose16_16
> 
>         _mm_store_si128((__m128i*)pDst, resLo);
>         _mm_store_si128((__m128i*)pDst + 1, resHi);
> -#elif KNOB_SIMD_WIDTH == 16
> -#if ENABLE_AVX512_EMULATION
> -        simdscalari src = 
> _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));
> -
> -        simdscalari result;
> -
> -        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
> -        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);
> -
> -        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
> -#endif
> #else
> #error Unsupported vector width
> #endif
> -- 
> 2.7.4
> 
