On Fri, 11 Oct 2024 at 19:52, Christophe Lyon <christophe.l...@linaro.org> wrote: > > On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely <jwak...@redhat.com> wrote: > > > > On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus <r...@nvidia.com> wrote: > > > > > > This patch modifies the implementation of the vectorized Mersenne > > > Twister random number generator to use __builtin_shufflevector instead > > > of __builtin_shuffle. This makes it (almost) compatible with Clang. > > > > > > To make the implementation fully compatible with Clang, Clang will need > > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which > > > currently it does not. This looks like an oversight in Clang and so will > > > be addressed separately. > > > > > > I see no codegen change with this patch. > > > > I'm not qualified to review this myself, but I'd at least like to see > > the CI checks passing: > > https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7...@nvidia.com/ > > Apparently the patch couldn't be applied. > > > > Please configure your email client (thunderbird?) to not munge the > > patch, or attach it rather than sending inline. Or just use > > git-send-email :-) > > > Hi! > > The problem is not with how the patch was sent: patchwork managed to > see it as a patch, and the CI tried to apply it. > The problem is that for some reason, git was not able to apply the > patch to our current baseline. > Unfortunately, we do not go as far as calling 'git am > --show-current-patch=diff' or something else to provide more info in > our CI logs, so we can only guess that something went wrong. Maybe > your patch is based against a too old revision of GCC?
No, that file hasn't changed anywhere near the patch. The problem is that the patch was munged by thunderbird, adding line breaks where they corrupt the patch: Applying: Use shufflevector instead of shuffle in opt_random.h error: git diff header lacks filename information when removing 1 leading pathname component (line 6) Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h I fixed that manually, but it still fails: Applying: Use shufflevector instead of shuffle in opt_random.h error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35 error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch does not apply Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h That's because of an incorrect number of space characters on the unchanged context lines around the +/- diffs. I fixed that manually, and failed at the next chunk: Applying: Use shufflevector instead of shuffle in opt_random.h error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52 error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch does not apply Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h So the problem is how the patch was sent. > > Thanks, > > Christophe > > > > > > > > > > Bootstrapped and tested on aarch64-none-linux-gnu. > > > > > > Signed-off-by: Ricardo Jesus <r...@nvidia.com> > > > > > > 2024-09-05 Ricardo Jesus <r...@nvidia.com> > > > > > > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses > > > of __builtin_shuffle with __builtin_shufflevector. > > > (__aarch64_lsl_128): Move shift amount to a template parameter. > > > (__aarch64_lsr_128): Move shift amount to a template parameter. > > > (__aarch64_recursion): Update call sites of __aarch64_lsl_128 > > > and __aarch64_lsr_128. > > > --- > > > .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++-------- > > > 1 file changed, 16 insertions(+), 12 deletions(-) > > > > > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > > index 7f756d1572f..7eb816abcd0 100644 > > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h > > > @@ -35,13 +35,13 @@ > > > #ifdef __ARM_NEON > > > > > > #ifdef __ARM_BIG_ENDIAN > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \ > > > - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > > > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C}) > > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \ > > > + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ > > > + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C) > > > #else > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \ > > > - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > > > - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15}) > > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \ > > > + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ > > > + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15) > > > #endif > > > > > > #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > > namespace { > > > // Logical Shift right 128-bits by c * 8 bits > > > > > > - __extension__ extern __inline __Uint32x4_t > > > + __extension__ > > > + template<int __c> > > > + extern __inline __Uint32x4_t > > > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > > > - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c) > > > + __aarch64_lsr_128 (__Uint8x16_t __a) > > > { > > > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > > > 0, 0, 0, 0, 0, 0, 0, 0}; > > > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > > > > > // Logical Shift left 128-bits by c * 8 bits > > > > > > - __extension__ extern __inline __Uint32x4_t > > > + __extension__ > > > + template<int __c> > > > + extern __inline __Uint32x4_t > > > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > > > - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c) > > > + __aarch64_lsl_128 (__Uint8x16_t __a) > > > { > > > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, > > > 0, 0, 0, 0, 0, 0, 0, 0}; > > > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > > __Uint32x4_t __e) > > > { > > > __Uint32x4_t __y = (__b >> __sr1); > > > - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2); > > > + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c); > > > > > > __Uint32x4_t __v = __d << __sl1; > > > > > > __z = __z ^ __a; > > > __z = __z ^ __v; > > > > > > - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2); > > > + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a); > > > > > > __y = __y & __e; > > > __z = __z ^ __x; > > > -- > > > 2.44.0 > > > > > >