On 29 May 2012 18:30, Richard Henderson <r...@redhat.com> wrote: > On 05/26/2012 01:27 AM, Ramana Radhakrishnan wrote: >> >> - for (i = 0; i< nelt; i += diff) >> + for (i = 0; i< nelt ; i += (diff + 1)) >> for (j = 0; j<= diff; j += 1) >> - if (d->perm[i + j] != i + diff - j) >> - return false; >> + { >> + /* This is guaranteed to be true as the value of diff >> + is 7, 3, 1 and we should have enough elements in the >> + queue to generate this. Getting a vector mask with a >> + value of diff other than these values implies that >> + something is wrong by the time we get here. */ >> + gcc_assert ((i + j)< nelt); > > > Yep, that all looks correct. Unnecessary () in both lines though.
Bah - Thanks - don't know why I put those in :(. Committed to trunk with those changes. If the release managers don't object, I would like to backport this to the 4.7 branch after a couple of weeks, to allow the auto-testers to pick this up, as it really turns on this functionality in this particular case. This is a significant performance issue in 4.7 for cases where we reverse vectors and would be nice to fix there. (2 loads + 2 generic permutes vs a single reverse instruction) regards, Ramana 2012-05-30 Ramana Radhakrishnan <ramana.radhakrish...@linaro.org> * config/arm/arm.c (arm_evpc_neon_vrev): Adjust off-by-one error. * gcc.target/arm/neon-vrev.c: New. > > > r~
Index: gcc/testsuite/gcc.target/arm/neon-vrev.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vrev.c (revision 0) +++ gcc/testsuite/gcc.target/arm/neon-vrev.c (revision 187999) @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_neon } */ + +#include <arm_neon.h> + +uint16x4_t +tst_vrev642_u16 (uint16x4_t __a) +{ + uint16x4_t __rv; + uint16x4_t __mask1 = { 3, 2, 1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x8_t +tst_vrev64q2_u16 (uint16x8_t __a) +{ + uint16x8_t __rv; + uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4 }; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint8x8_t +tst_vrev642_u8 (uint8x8_t __a) +{ + uint8x8_t __rv; + uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint8x16_t +tst_vrev64q2_u8 (uint8x16_t __a) +{ + uint8x16_t __rv; + uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + return __builtin_shuffle ( __a, __mask1) ; + +} + +uint32x2_t +tst_vrev642_u32 (uint32x2_t __a) +{ + uint32x2_t __rv; + uint32x2_t __mask1 = {1, 0}; + return __builtin_shuffle ( __a, __mask1) ; + +} + +uint32x4_t +tst_vrev64q2_u32 (uint32x4_t __a) +{ + uint32x4_t __rv; + uint32x4_t __mask1 = {1, 0, 3, 2}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x4_t +tst_vrev322_u16 (uint16x4_t __a) +{ + uint16x4_t __mask1 = { 1, 0, 3, 2 }; + return __builtin_shuffle (__a, __mask1); +} + +uint16x8_t +tst_vrev32q2_u16 (uint16x8_t __a) +{ + uint16x8_t __mask1 = { 1, 0, 3, 2, 5, 4, 7, 6 }; + return __builtin_shuffle (__a, __mask1); +} + +uint8x8_t +tst_vrev322_u8 (uint8x8_t __a) +{ + uint8x8_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4}; + return __builtin_shuffle (__a, __mask1); +} + +uint8x16_t +tst_vrev32q2_u8 (uint8x16_t __a) +{ + uint8x16_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; + return __builtin_shuffle 
(__a, __mask1); +} + +uint8x8_t +tst_vrev162_u8 (uint8x8_t __a) +{ + uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6}; + return __builtin_shuffle (__a, __mask); +} + +uint8x16_t +tst_vrev16q2_u8 (uint8x16_t __a) +{ + uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return __builtin_shuffle (__a, __mask); +} + +/* { dg-final {scan-assembler-times "vrev32\.16\\t" 2} } */ +/* { dg-final {scan-assembler-times "vrev32\.8\\t" 2} } */ +/* { dg-final {scan-assembler-times "vrev16\.8\\t" 2} } */ +/* { dg-final {scan-assembler-times "vrev64\.8\\t" 2} } */ +/* { dg-final {scan-assembler-times "vrev64\.32\\t" 2} } */ +/* { dg-final {scan-assembler-times "vrev64\.16\\t" 2} } */ Index: gcc/config/arm/arm.c =================================================================== --- gcc/config/arm/arm.c (revision 187998) +++ gcc/config/arm/arm.c (revision 187999) @@ -25637,10 +25637,18 @@ return false; } - for (i = 0; i < nelt; i += diff) + for (i = 0; i < nelt ; i += diff + 1) for (j = 0; j <= diff; j += 1) - if (d->perm[i + j] != i + diff - j) - return false; + { + /* This is guaranteed to be true as the value of diff + is 7, 3, 1 and we should have enough elements in the + queue to generate this. Getting a vector mask with a + value of diff other than these values implies that + something is wrong by the time we get here. */ + gcc_assert (i + j < nelt); + if (d->perm[i + j] != i + diff - j) + return false; + } /* Success! */ if (d->testing_p)