Hi, As discussed several times previously, support for NEON in ARM big-endian mode is quite broken because of differing assumptions about lane ordering made by the ARM EABI and the set of NEON intrinsics on the one hand, and the vectorizer on the other.
Fixing this "properly" would involve quite a large overhaul of the NEON backend implementation, and such an overhaul does not appear to be forthcoming. Unfortunately this leaves big-endian mode with a problem: even if the user is not explicitly using NEON intrinsics, compiling with NEON and the vectorizer enabled (i.e. -O3) can quite easily lead to incorrect code being generated.

This is the patch we've been using internally for a while to work around the problem. When applied:

* We do not allow NEON vectors to be used for autovectorization. Vectorization is not disabled completely: ARM core registers (e.g. four chars packed into a core register) can still be used to vectorize loops in limited circumstances. I think this is mildly preferable to forcing -ftree-vectorize to be off entirely for big-endian NEON.

* Intrinsics are not touched. Those which attempt to mix generic vector operations with the ABI-defined vector types (i.e. those which are implemented with __builtin_shuffle) are, I think, technically incorrect -- but the errors cancel out, in the sense of two wrongs making a right, so the end result appears to work.

* Generic vectors (i.e. direct use of __attribute__((vector_size(foo))) types) will continue to behave strangely in big-endian mode.

This of course continues to be suboptimal, but at least in *the common case* we stop generating bad code.

Testing in big-endian mode on user-space QEMU (ARMv7-A, NEON, softfp) shows (apart from some noise) test diffs as attached. Notice the large number of removed execution failures, in particular.

OK to apply?

Thanks,

Julian

ChangeLog

    gcc/
	* config/arm/arm.c (arm_array_mode_supported_p): No array modes for
	big-endian NEON.
	(arm_preferred_simd_mode): Don't use NEON vectors for
	autovectorization in big-endian mode.
	(arm_autovectorize_vector_sizes): Don't iterate over other vector
	sizes for big-endian NEON.

    gcc/testsuite/
	* lib/target-supports.exp (check_vect_support_and_set_flags): Don't
	run vect tests for big-endian ARM NEON.
	* gcc.target/arm/neon/vect-vcvt.c: XFAIL for !arm_little_endian.
	* gcc.target/arm/neon/vect-vcvtq.c: Likewise.
	* gcc.target/arm/neon-vshl-imm-1.c: Likewise.
	* gcc.target/arm/neon-vshr-imm-1.c: Likewise.
	* gcc.target/arm/neon-vmls-1.c: Likewise.
	* gcc.target/arm/neon-vmla-1.c: Likewise.
	* gcc.target/arm/neon-vfma-1.c: Likewise.
	* gcc.target/arm/neon-vfms-1.c: Likewise.
	* gcc.target/arm/neon-vorn-vbic.c: Likewise.
	* gcc.target/arm/neon-vlshr-imm-1.c: Likewise.
	* gcc.target/arm/neon-vcond-ltgt.c: Likewise.
	* gcc.target/arm/neon-vcond-gt.c: Likewise.
	* gcc.target/arm/neon-vcond-unordered.c: Likewise.
Index: gcc/config/arm/arm.c =================================================================== --- gcc/config/arm/arm.c (revision 210209) +++ gcc/config/arm/arm.c (working copy) @@ -28813,7 +28813,7 @@ static bool arm_array_mode_supported_p (enum machine_mode mode, unsigned HOST_WIDE_INT nelems) { - if (TARGET_NEON + if (TARGET_NEON && !BYTES_BIG_ENDIAN && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)) && (nelems >= 2 && nelems <= 4)) return true; @@ -28828,7 +28828,7 @@ arm_array_mode_supported_p (enum machine static enum machine_mode arm_preferred_simd_mode (enum machine_mode mode) { - if (TARGET_NEON) + if (TARGET_NEON && !BYTES_BIG_ENDIAN) switch (mode) { case SFmode: @@ -29845,7 +29845,8 @@ arm_vector_alignment (const_tree type) static unsigned int arm_autovectorize_vector_sizes (void) { - return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8); + return (TARGET_NEON_VECTORIZE_DOUBLE || (TARGET_NEON && BYTES_BIG_ENDIAN)) + ? 0 : (16 | 8); } static bool Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c (working copy) @@ -24,5 +24,5 @@ int convert() return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! arm_little_endian } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c (working copy) @@ -24,5 +24,5 @@ int convert() return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! 
arm_little_endian } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ Index: gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c (working copy) @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */ -/* { dg-final { scan-assembler "vshl\.i32.*#3" } } */ +/* { dg-final { scan-assembler "vshl\.i32.*#3" { xfail { ! arm_little_endian } } } } */ /* Verify that VSHR immediate is used. */ void f1(int n, int x[], int y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c (working copy) @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */ -/* { dg-final { scan-assembler "vshr\.s32.*#3" } } */ +/* { dg-final { scan-assembler "vshr\.s32.*#3" { xfail { ! arm_little_endian } } } } */ /* Verify that VSHR immediate is used. */ void f1(int n, int x[], int y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vfma-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vfma-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vfma-1.c (working copy) @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_neonv2_ok } */ /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ /* { dg-add-options arm_neonv2 } */ -/* { dg-final { scan-assembler "vfma\\.f32\[ \]+\[dDqQ]" } } */ +/* { dg-final { scan-assembler "vfma\\.f32\[ \]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */ /* Verify that VFMA is used. 
*/ void f1(int n, float a, float x[], float y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c (working copy) @@ -16,5 +16,5 @@ void bic (int *__restrict__ c, int *__re c[i] = b[i] & (~a[i]); } -/* { dg-final { scan-assembler "vorn\\t" } } */ -/* { dg-final { scan-assembler "vbic\\t" } } */ +/* { dg-final { scan-assembler "vorn\\t" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vbic\\t" { xfail { ! arm_little_endian } } } } */ Index: gcc/testsuite/gcc.target/arm/neon-vmla-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vmla-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vmla-1.c (working copy) @@ -1,7 +1,7 @@ /* { dg-require-effective-target arm_neon_hw } */ /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ /* { dg-add-options arm_neon } */ -/* { dg-final { scan-assembler "vmla\\.i32" } } */ +/* { dg-final { scan-assembler "vmla\\.i32" { xfail { ! arm_little_endian } } } } */ /* Verify that VMLA is used. 
*/ void f1(int n, int a, int x[], int y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c (working copy) @@ -13,6 +13,6 @@ void foo (int ilast,float* w, float* w2) } } -/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 } } */ -/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ -/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ +/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ Index: gcc/testsuite/gcc.target/arm/neon-vfms-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vfms-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vfms-1.c (working copy) @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_neonv2_ok } */ /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ /* { dg-add-options arm_neonv2 } */ -/* { dg-final { scan-assembler "vfms\\.f32\[ \]+\[dDqQ]" } } */ +/* { dg-final { scan-assembler "vfms\\.f32\[ \]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */ /* Verify that VFMS is used. 
*/ void f1(int n, float a, float x[], float y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vcond-gt.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vcond-gt.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vcond-gt.c (working copy) @@ -13,5 +13,5 @@ void foo (int ilast,float* w, float* w2) } } -/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ -/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ Index: gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c (working copy) @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */ -/* { dg-final { scan-assembler "vshr\.u32.*#3" } } */ +/* { dg-final { scan-assembler "vshr\.u32.*#3" { xfail { ! arm_little_endian } } } } */ /* Verify that VSHR immediate is used. 
*/ void f1(int n, unsigned int x[], unsigned int y[]) { Index: gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c (working copy) @@ -13,7 +13,7 @@ void foo (int ilast,float* w, float* w2) } } -/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ -/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ -/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ -/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ +/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */ Index: gcc/testsuite/gcc.target/arm/neon-vmls-1.c =================================================================== --- gcc/testsuite/gcc.target/arm/neon-vmls-1.c (revision 210209) +++ gcc/testsuite/gcc.target/arm/neon-vmls-1.c (working copy) @@ -1,7 +1,7 @@ /* { dg-require-effective-target arm_neon_hw } */ /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ /* { dg-add-options arm_neon } */ -/* { dg-final { scan-assembler "vmls\\.i32" } } */ +/* { dg-final { scan-assembler "vmls\\.i32" { xfail { ! arm_little_endian } } } } */ /* Verify that VMLS is used. 
*/ void f1(int n, int a, int x[], int y[]) { Index: gcc/testsuite/lib/target-supports.exp =================================================================== --- gcc/testsuite/lib/target-supports.exp (revision 210209) +++ gcc/testsuite/lib/target-supports.exp (working copy) @@ -5636,6 +5636,14 @@ proc check_vect_support_and_set_flags { } elseif [istarget ia64-*-*] { set dg-do-what-default run } elseif [is-effective-target arm_neon_ok] { + # NEON is not used for vectorization in big-endian mode at present. + # Some vect tests still pass without NEON support (i.e. using + # core registers), but there are too many failures (missed + # vectorization opportunities) to make test results meaningful. + if ![check_effective_target_arm_little_endian] { + return 0 + } + eval lappend DEFAULT_VECTCFLAGS [add_options_for_arm_neon ""] # NEON does not support denormals, so is not used for vectorization by # default to avoid loss of precision. We must pass -ffast-math to test
diffs
Description: Binary data