Hi,

As discussed several times previously, support for NEON in ARM
big-endian mode is quite broken because of differing assumptions about
lane ordering made by the ARM EABI and the set of NEON intrinsics on
the one hand, and the vectorizer on the other.

Fixing this "properly" would involve quite a large overhaul of the NEON
backend implementation, and such an overhaul does not appear to be
forthcoming. Unfortunately this leaves big-endian mode with a problem:
even if the user is not explicitly using NEON intrinsics, compiling
with NEON and the vectorizer enabled (i.e. -O3) can quite easily lead
to incorrect code being generated.

This is the patch we've been using internally for a while to work
around the problem. When applied:

* We do not allow Neon vectors to be used for autovectorization.
  Vectorization is not disabled completely: ARM core registers (e.g.
  four chars packed into a core register) can still be used to vectorize
  loops in limited circumstances. I think this is mildly preferable to
  forcing -ftree-vectorize to be off entirely for big-endian NEON.

* Intrinsics are not touched. Those which attempt to mix generic vector
  operations with the ABI-defined vector types (i.e. those which are
  implemented with __builtin_shuffle) are, I think, technically
  incorrect -- but in the sense of two wrongs making a right, so the
  end result appears to work.

* Generic vectors (i.e. direct use of __attribute__((vector_size(foo)))
  types) will continue to behave strangely in big-endian mode.

This of course continues to be suboptimal, but at least in *the
common case* we stop generating bad code.

Testing in big-endian mode on user-space QEMU (ARMv7-A, NEON, softfp)
shows (apart from some noise) test diffs as attached. Notice the large
number of removed execution failures, in particular.

OK to apply?

Thanks,

Julian

ChangeLog

    gcc/
    * config/arm/arm.c (arm_array_mode_supported_p): No array modes for
    big-endian NEON.
    (arm_preferred_simd_mode): Don't use NEON vectors for
    autovectorization in big-endian mode.
    (arm_autovectorize_vector_sizes): Don't iterate over other vector
    sizes for big-endian NEON.

    gcc/testsuite/
    * lib/target-supports.exp (check_vect_support_and_set_flags): Don't
    run vect tests for big-endian ARM NEON.
    * gcc.target/arm/neon/vect-vcvt.c: XFAIL for !arm_little_endian.
    * gcc.target/arm/neon/vect-vcvtq.c: Likewise.
    * gcc.target/arm/neon-vshl-imm-1.c: Likewise.
    * gcc.target/arm/neon-vshr-imm-1.c: Likewise.
    * gcc.target/arm/neon-vmls-1.c: Likewise.
    * gcc.target/arm/neon-vmla-1.c: Likewise.
    * gcc.target/arm/neon-vfma-1.c: Likewise.
    * gcc.target/arm/neon-vfms-1.c: Likewise.
    * gcc.target/arm/neon-vorn-vbic.c: Likewise.
    * gcc.target/arm/neon-vlshr-imm-1.c: Likewise.
    * gcc.target/arm/neon-vcond-ltgt.c: Likewise.
    * gcc.target/arm/neon-vcond-gt.c: Likewise.
    * gcc.target/arm/neon-vcond-unordered.c: Likewise.
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 210209)
+++ gcc/config/arm/arm.c	(working copy)
@@ -28813,7 +28813,7 @@ static bool
 arm_array_mode_supported_p (enum machine_mode mode,
 			    unsigned HOST_WIDE_INT nelems)
 {
-  if (TARGET_NEON
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN
       && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
       && (nelems >= 2 && nelems <= 4))
     return true;
@@ -28828,7 +28828,7 @@ arm_array_mode_supported_p (enum machine
 static enum machine_mode
 arm_preferred_simd_mode (enum machine_mode mode)
 {
-  if (TARGET_NEON)
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
     switch (mode)
       {
       case SFmode:
@@ -29845,7 +29845,8 @@ arm_vector_alignment (const_tree type)
 static unsigned int
 arm_autovectorize_vector_sizes (void)
 {
-  return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
+  return (TARGET_NEON_VECTORIZE_DOUBLE || (TARGET_NEON && BYTES_BIG_ENDIAN))
+	 ? 0 : (16 | 8);
 }
 
 static bool
Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c	(working copy)
@@ -24,5 +24,5 @@ int convert()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! arm_little_endian } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c	(working copy)
@@ -24,5 +24,5 @@ int convert()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! arm_little_endian } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
Index: gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c	(working copy)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshl\.i32.*#3" } } */
+/* { dg-final { scan-assembler "vshl\.i32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c	(working copy)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshr\.s32.*#3" } } */
+/* { dg-final { scan-assembler "vshr\.s32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vfma-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vfma-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vfma-1.c	(working copy)
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_neonv2_ok } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neonv2 } */
-/* { dg-final { scan-assembler "vfma\\.f32\[	\]+\[dDqQ]" } } */
+/* { dg-final { scan-assembler "vfma\\.f32\[	\]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VFMA is used.  */
 void f1(int n, float a, float x[], float y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c	(working copy)
@@ -16,5 +16,5 @@ void bic (int *__restrict__ c, int *__re
     c[i] = b[i] & (~a[i]);
 }
 
-/* { dg-final { scan-assembler "vorn\\t" } } */
-/* { dg-final { scan-assembler "vbic\\t" } } */
+/* { dg-final { scan-assembler "vorn\\t" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbic\\t" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vmla-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(working copy)
@@ -1,7 +1,7 @@
 /* { dg-require-effective-target arm_neon_hw } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
-/* { dg-final { scan-assembler "vmla\\.i32" } } */
+/* { dg-final { scan-assembler "vmla\\.i32" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VMLA is used.  */
 void f1(int n, int a, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c	(working copy)
@@ -13,6 +13,6 @@ void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 } } */
-/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vfms-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vfms-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vfms-1.c	(working copy)
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target arm_neonv2_ok } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neonv2 } */
-/* { dg-final { scan-assembler "vfms\\.f32\[	\]+\[dDqQ]" } } */
+/* { dg-final { scan-assembler "vfms\\.f32\[	\]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VFMS is used.  */
 void f1(int n, float a, float x[], float y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-gt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-gt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-gt.c	(working copy)
@@ -13,5 +13,5 @@ void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c	(working copy)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshr\.u32.*#3" } } */
+/* { dg-final { scan-assembler "vshr\.u32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, unsigned int x[], unsigned int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c	(working copy)
@@ -13,7 +13,7 @@ void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vmls-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(working copy)
@@ -1,7 +1,7 @@
 /* { dg-require-effective-target arm_neon_hw } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
-/* { dg-final { scan-assembler "vmls\\.i32" } } */
+/* { dg-final { scan-assembler "vmls\\.i32" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VMLS is used.  */
 void f1(int n, int a, int x[], int y[]) {
Index: gcc/testsuite/lib/target-supports.exp
===================================================================
--- gcc/testsuite/lib/target-supports.exp	(revision 210209)
+++ gcc/testsuite/lib/target-supports.exp	(working copy)
@@ -5636,6 +5636,14 @@ proc check_vect_support_and_set_flags { 
     } elseif [istarget ia64-*-*] {
         set dg-do-what-default run
     } elseif [is-effective-target arm_neon_ok] {
+	# NEON is not used for vectorization in big-endian mode at present.
+	# Some vect tests still pass without NEON support (i.e. using
+	# core registers), but there are too many failures (missed
+	# vectorization opportunities) to make test results meaningful.
+	if ![check_effective_target_arm_little_endian] {
+	    return 0
+	}
+
         eval lappend DEFAULT_VECTCFLAGS [add_options_for_arm_neon ""]
         # NEON does not support denormals, so is not used for vectorization by
         # default to avoid loss of precision.  We must pass -ffast-math to test

Attachment: diffs
Description: Binary data

Reply via email to