On Wed, Dec 30, 2015 at 3:18 AM, Roland Scheidegger <srol...@vmware.com> wrote:
> On 29.12.2015 at 17:12, Oded Gabbay wrote:
>> This file provides a portability layer that will make it easier to convert
>> SSE-based functions to VMX/VSX-based functions.
>>
>> All the functions implemented in this file are prefixed with "vec_".
>> Therefore, when converting an SSE-based function, one simply needs to
>> replace the "_mm_" prefix of the SSE function being called with "vec_".
>>
>> Having said that, not all functions could be converted as such, due to the
>> differences between the architectures. So, where such a conversion hurt
>> performance, I preferred to implement a more ad-hoc solution. For example,
>> converting _mm_shuffle_epi32 needed to be done using ad-hoc masks instead
>> of a generic function.
>>
>> All the functions in this file support both little-endian and big-endian.
>>
>> All of the functions are implemented using the Altivec/VMX intrinsics,
>> except one, where I needed to use inline assembly (due to a missing
>> intrinsic).
>>
>> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
>> ---
>>  src/gallium/auxiliary/util/u_pwr8.h | 298 ++++++++++++++++++++++++++++++++++++
>>  1 file changed, 298 insertions(+)
>>  create mode 100644 src/gallium/auxiliary/util/u_pwr8.h
>>
>> diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
>> new file mode 100644
>> index 0000000..c6e5fac
>> --- /dev/null
>> +++ b/src/gallium/auxiliary/util/u_pwr8.h
>> @@ -0,0 +1,298 @@
>> +/*
>> + * Copyright 2015 Red Hat Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * on the rights to use, copy, modify, merge, publish, distribute, sub
>> + * license, and/or sell copies of the Software, and to permit persons to whom
>> + * the Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
>> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
>> + *
>> + * Author: Oded Gabbay <oded.gab...@redhat.com>
>> + */
>> +
>> +/**
>> + * @file
>> + * POWER8 intrinsics portability header.
>> + *
>> + */
>> +
>> +#ifndef U_PWR8_H_
>> +#define U_PWR8_H_
>> +
>> +#ifdef _ARCH_PWR8
>> +
>> +#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
>> +
>> +typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
>> +
>> +typedef VECTOR_ALIGN_16 union m128i {
>> +   __m128i m128i;
>> +   vector signed int m128si;
>> +   vector unsigned int m128ui;
>> +   ubyte ub[16];
>> +   ushort us[8];
>> +   int i[4];
>> +   uint ui[4];
>> +} __m128i_union;
>> +
>> +static inline __m128i
>> +vec_set_epi32 (int i3, int i2, int i1, int i0)
>> +{
>> +   __m128i_union vdst;
>> +
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +   vdst.i[0] = i0;
>> +   vdst.i[1] = i1;
>> +   vdst.i[2] = i2;
>> +   vdst.i[3] = i3;
>> +#else
>> +   vdst.i[3] = i0;
>> +   vdst.i[2] = i1;
>> +   vdst.i[1] = i2;
>> +   vdst.i[0] = i3;
>> +#endif
>> +
>> +   return (__m128i) vdst.m128si;
>> +}
>> +
>> +static inline __m128i
>> +vec_setr_epi32 (int i0, int i1, int i2, int i3)
>> +{
>> +   return vec_set_epi32 (i3, i2, i1, i0);
>> +}
>> +
>> +static inline __m128i
>> +vec_unpacklo_epi32 (__m128i even, __m128i odd)
>> +{
>> +   static const __m128i perm_mask =
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +      { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
>> +#else
>> +      {24, 25, 26, 27, 8, 9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
>> +#endif
>> +
>> +   return vec_perm (even, odd, perm_mask);
>> +}
>> +
>> +static inline __m128i
>> +vec_unpackhi_epi32 (__m128i even, __m128i odd)
>> +{
>> +   static const __m128i perm_mask =
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +      { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
>> +#else
>> +      {16, 17, 18, 19, 0, 1, 2, 3, 20, 21, 22, 23, 4, 5, 6, 7};
>> +#endif
>> +
>> +   return vec_perm (even, odd, perm_mask);
>> +}
>> +
>> +static inline __m128i
>> +vec_unpacklo_epi64 (__m128i even, __m128i odd)
>> +{
>> +   static const __m128i perm_mask =
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +      { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
>> +#else
>> +      {24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15};
>> +#endif
>> +
>> +   return vec_perm (even, odd, perm_mask);
>> +}
>> +
>> +static inline __m128i
>> +vec_unpackhi_epi64 (__m128i even, __m128i odd)
>> +{
>> +   static const __m128i perm_mask =
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +      { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
>> +#else
>> +      {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7};
>> +#endif
>> +
>> +   return vec_perm (even, odd, perm_mask);
>> +}
>> +
>> +static inline __m128i
>> +vec_add_epi32 (__m128i a, __m128i b)
>> +{
>> +   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
>> +}
>> +
>> +static inline __m128i
>> +vec_sub_epi32 (__m128i a, __m128i b)
>> +{
>> +   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
>> +}
>> +
>> +static inline __m128i
>> +vec_mullo_epi32 (__m128i a, __m128i b)
>> +{
>> +   __m128i v;
>> +
>> +   __asm__(
>> +           "vmuluwm %0, %1, %2   \n"
>> +           : "=v" (v)
>> +           : "v" (a), "v" (b)
>> +           );
>> +
>> +   return v;
>> +}
>> +
>> +static inline void
>> +transpose4_epi32(const __m128i * restrict a,
>> +                 const __m128i * restrict b,
>> +                 const __m128i * restrict c,
>> +                 const __m128i * restrict d,
>> +                 __m128i * restrict o,
>> +                 __m128i * restrict p,
>> +                 __m128i * restrict q,
>> +                 __m128i * restrict r)
>> +{
>> +   __m128i t0 = vec_unpacklo_epi32(*a, *b);
>> +   __m128i t1 = vec_unpacklo_epi32(*c, *d);
>> +   __m128i t2 = vec_unpackhi_epi32(*a, *b);
>> +   __m128i t3 = vec_unpackhi_epi32(*c, *d);
>> +
>> +   *o = vec_unpacklo_epi64(t0, t1);
>> +   *p = vec_unpackhi_epi64(t0, t1);
>> +   *q = vec_unpacklo_epi64(t2, t3);
>> +   *r = vec_unpackhi_epi64(t2, t3);
>> +}
>> +
>> +static inline __m128i
>> +vec_slli_epi32 (__m128i vsrc, unsigned int count)
>> +{
>> +   __m128i_union vec_count;
>> +
>> +   if (count >= 32)
>> +      return (__m128i) vec_splats (0);
>> +   else if (count == 0)
>> +      return vsrc;
>> +
>> +   /* In VMX, all shift count fields must contain the same value */
>> +   vec_count.m128si = (vector signed int) vec_splats (count);
>> +   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
>> +}
>> +
>> +static inline __m128i
>> +vec_srli_epi32 (__m128i vsrc, unsigned int count)
>> +{
>> +   __m128i_union vec_count;
>> +
>> +   if (count >= 32)
>> +      return (__m128i) vec_splats (0);
>> +   else if (count == 0)
>> +      return vsrc;
>> +
>> +   /* In VMX, all shift count fields must contain the same value */
>> +   vec_count.m128si = (vector signed int) vec_splats (count);
>> +   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
>> +}
>> +
>> +static inline __m128i
>> +vec_srai_epi32 (__m128i vsrc, unsigned int count)
>> +{
>> +   __m128i_union vec_count;
>> +
>> +   if (count >= 32)
>> +      return (__m128i) vec_splats (0);
>> +   else if (count == 0)
>> +      return vsrc;
>> +
>> +   /* In VMX, all shift count fields must contain the same value */
>> +   vec_count.m128si = (vector signed int) vec_splats (count);
>> +   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
>> +}
>> +
>> +static inline __m128i
>> +vec_cmpeq_epi32 (__m128i a, __m128i b)
>> +{
>> +   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
>> +}
>> +
>> +static inline __m128i
>> +vec_loadu_si128 (const uint32_t* src)
>> +{
>> +   __m128i_union vsrc;
>> +
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +
>> +   vsrc.m128ui = *((vector unsigned int *) src);
>> +
>> +#else
>> +
>> +   __m128i vmask, tmp1, tmp2;
>> +
>> +   vmask = vec_lvsl(0, src);
>> +
>> +   tmp1 = (typeof(tmp1))vec_ld (0, src);
>> +   tmp2 = (typeof(tmp2))vec_ld (15, src);
>> +   vsrc.m128ui = (typeof(vsrc.m128ui)) vec_perm (tmp1, tmp2, vmask);
>> +
>> +#endif
>> +
>> +   return vsrc.m128i;
>> +}
>> +
>> +static inline void
>> +vec_store_si128 (uint32_t* dest, __m128i vdata)
>> +{
>> +   vec_st ((vector unsigned int) vdata, 0, dest);
>> +}
>> +
>> +static inline int
>> +vec_movemask_epi8 (__m128i vsrc)
>> +{
>> +   __m128i_union vtemp;
>> +   int result;
>> +
>> +   vtemp.m128i = __builtin_vec_vgbbd(vsrc);
>> +
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +   result = vtemp.ub[15] << 8 | vtemp.ub[7];
>> +#else
>> +   result = vtemp.ub[0] << 8 | vtemp.ub[8];
>> +#endif
>> +
>> +   return result;
>> +}
>> +
>> +static inline __m128i
>> +vec_packs_epi16 (__m128i a, __m128i b)
>> +{
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +   return (__m128i) vec_packs ((vector signed short) a,
>> +                               (vector signed short) b);
>> +#else
>> +   return (__m128i) vec_packs ((vector signed short) b,
>> +                               (vector signed short) a);
>> +#endif
>> +}
>> +
>> +static inline __m128i
>> +vec_packs_epi32 (__m128i a, __m128i b)
>> +{
>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>> +   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
>> +#else
>> +   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
>> +#endif
>> +}
>> +
>> +#endif /* _ARCH_PWR8 */
>> +
>> +#endif /* U_PWR8_H_ */
>>
>
> For which functions do you actually need power8?
> I believe __builtin_vec_vgbbd used for vec_movemask_epi8 is one?

Yes, but not only that. I also need power8 for vmuluwm, which is the
parallel of _mm_mullo_epi32.
Without it, the only option is to do four regular multiplications, which
again ruins the performance (making it worse than the original code).
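
To make this concrete, a pre-POWER8 fallback would have to look roughly
like the sketch below (illustration only, not part of the patch; the
function name is made up). It goes element by element through the union,
i.e. four scalar 32-bit multiplies instead of a single vmuluwm:

static inline __m128i
vec_mullo_epi32_fallback (__m128i a, __m128i b)
{
   __m128i_union va, vb, vdst;
   int i;

   va.m128i = a;
   vb.m128i = b;

   /* keep the low 32 bits of each product, i.e. _mm_mullo_epi32 semantics */
   for (i = 0; i < 4; i++)
      vdst.ui[i] = va.ui[i] * vb.ui[i];

   return vdst.m128i;
}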

When I started to write this patch series, I had no intention of limiting
it to POWER8. But without vec_vgbbd and vmuluwm, the performance of the
vector code is much worse (around 10-15%) than that of the non-vector code.

> I have actually been thinking about how we deal with these masks some
> time ago. It looks suboptimal since for the most part we're packing the
> masks and then just have to unpack them again in the jit code, so I
> always believed there's just overhead due to that (could at least pass
> as one-byte-per-mask bit). The problem with that though is that those
> masks also interact with the c code, albeit typically just checking for
> a specific value, so as long as you can do that efficiently with vector
> instructions it should be possible to work around it (sse41 in fact has
> the very efficient ptest instruction, but unfortunately it can't really
> be integrated well into surrounding c code due to the fact it just sets
> the flags register and has no return value, plus it would need runtime
> feature detection).

I understand what you are saying, but I need to learn more about it before
I can change the code. If I see you change it for SSE, though, I will be
able to copy the approach to the VMX/VSX code.

> Plus there's some problem since the "recursive" evaluation of blocks
> (for size 64/16/4) mostly use the same functions, and the outer ones
> really only need some scalar value which can be efficiently dealt with
> with c code.
>
> FWIW looks like the unaligned load is somewhat complex too.

Fortunately, on POWER8 there is almost zero penalty for doing unaligned
loads into vector registers. That is a major improvement over previous
generations.

> I am not
> entirely sure why we use unaligned values in the first place there, as
> far as I can tell it should be possible to fix this with a couple
> PIPE_ALIGN_VAR(16) for the affected struct (surely we'd prefer aligned
> variables if we can). Unless there's some reason that doesn't work.
>
> Roland

Yes, that may give a modest boost. I, for one, have put VECTOR_ALIGN_16
(which is __attribute__ ((__aligned__ (16)))) before every vector variable
in my code (see the start of u_pwr8.h). But adding it to the affected
structures might give an even bigger boost.

Oded
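
P.S. To illustrate the struct-alignment idea (this is only a sketch; the
struct and field names are made up, while PIPE_ALIGN_VAR is the existing
gallium macro and vec_ld is the standard Altivec intrinsic):

/* With the member forced to 16-byte alignment, a plain aligned load is
 * always valid, so the lvsl + vec_perm workaround in vec_loadu_si128
 * would not be needed for this data. */
struct mask_block {
   PIPE_ALIGN_VAR(16) uint32_t mask[4];   /* 16-byte aligned storage */
};

static inline __m128i
load_mask (const struct mask_block *blk)
{
   /* alignment is guaranteed by the declaration above */
   return (__m128i) vec_ld (0, blk->mask);
}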