From d55e6c047155de63415467911c345d3f6091e696 Mon Sep 17 00:00:00 2001
From: Andrew Pinski <quic_apinski@quicinc.com>
Date: Thu, 18 Jan 2024 20:28:48 -0800
Subject: [PATCH] RFC: aarch64: Start to support v4qi modes for SLP

This is the start of adding V4QI (and V2HI) mode support to the aarch64 backend so that SLP vectorization can use emulated 32-bit vectors.
Currently addition, subtraction, extend and truncate are supported for these types.
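
For example, a small kernel that operates on exactly four adjacent bytes,
such as the following (a hypothetical example, not taken from the adjusted
testcases), can now be SLP-vectorized using a single 32-bit V4QI vector
instead of staying scalar:

    void
    add4 (unsigned char *restrict a, unsigned char *restrict b)
    {
      /* The four adjacent byte lanes map onto one V4QI vector.  */
      a[0] += b[0];
      a[1] += b[1];
      a[2] += b[2];
      a[3] += b[3];
    }

The new behaviour is controlled by the -mautovector32bits option added by
this patch and is enabled by default.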

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64-modes.def          |   1 +
 gcc/config/aarch64/aarch64-simd.md            | 569 +++++++++++-------
 gcc/config/aarch64/aarch64.cc                 | 217 ++++++-
 gcc/config/aarch64/aarch64.opt                |   4 +
 gcc/config/aarch64/iterators.md               | 129 ++--
 .../gcc.target/aarch64/vect_mixed_sizes_3.c   |   6 +-
 .../gcc.target/aarch64/vect_mixed_sizes_6.c   |   3 +-
 .../gcc.target/aarch64/vect_mixed_sizes_7.c   |   3 +-
 gcc/testsuite/lib/target-supports.exp         |   2 +-
 9 files changed, 643 insertions(+), 291 deletions(-)
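
Note for reviewers: as a rough sketch of the expected code generation for
the add4 example above (assuming <Vtype> maps V4QI to ".8b" and the new
single_type/single_wx modifiers pick "s"/"w" for the 32-bit modes, as the
mov patterns suggest), something like:

        ldr     s0, [x0]
        ldr     s1, [x1]
        add     v0.8b, v0.8b, v1.8b    // only the low four lanes matter
        str     s0, [x0]
        ret

The emulated 32-bit vectors can be disabled with -mno-autovector32bits.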

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 25a22c1195e..940c57db573 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -70,6 +70,7 @@ ADJUST_ALIGNMENT (VNx2BI, 2);
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
 
+VECTOR_MODES (INT, 4);        /*       V4QI V2HI V1SI.  */
 VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI.  */
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI.  */
 VECTOR_MODES (FLOAT, 8);      /*                 V2SF.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bbeee221f37..8fba57147ee 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -47,8 +47,8 @@ (define_subst_attr "vczle" "add_vec_concat_subst_le" "" "_vec_concatz_le")
 (define_subst_attr "vczbe" "add_vec_concat_subst_be" "" "_vec_concatz_be")
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALLS_F16 0 "nonimmediate_operand")
+	(match_operand:VALLS_F16 1 "general_operand"))]
   "TARGET_FLOAT"
   "
   /* Force the operand into a register if it is not an
@@ -78,8 +78,8 @@ (define_expand "mov<mode>"
 )
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-        (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALLS_F16 0 "nonimmediate_operand")
+        (match_operand:VALLS_F16 1 "general_operand"))]
   "TARGET_FLOAT && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -91,8 +91,8 @@ (define_expand "movmisalign<mode>"
 })
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQ_I 0 "register_operand")
-	(vec_duplicate:VDQ_I
+  [(set (match_operand:VDQS_I 0 "register_operand")
+	(vec_duplicate:VDQS_I
 	  (match_operand:<VEL> 1 "register_operand")))]
   "TARGET_SIMD"
   {@ [ cons: =0 , 1  ; attrs: type      ]
@@ -142,26 +142,26 @@ (define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn_and_split "*aarch64_simd_mov<VDMOV:mode>"
-  [(set (match_operand:VDMOV 0 "nonimmediate_operand")
-	(match_operand:VDMOV 1 "general_operand"))]
+(define_insn_and_split "*aarch64_simd_mov<VDHMOV:mode>"
+  [(set (match_operand:VDHMOV 0 "nonimmediate_operand")
+	(match_operand:VDHMOV 1 "general_operand"))]
   "TARGET_FLOAT
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
   {@ [cons: =0, 1; attrs: type, arch, length]
-     [w , m ; neon_load1_1reg<q> , *        , *] ldr\t%d0, %1
-     [r , m ; load_8             , *        , *] ldr\t%x0, %1
-     [m , Dz; store_8            , *        , *] str\txzr, %0
-     [m , w ; neon_store1_1reg<q>, *        , *] str\t%d1, %0
-     [m , r ; store_8            , *        , *] str\t%x1, %0
+     [w , m ; neon_load1_1reg<q> , *        , *] ldr\t%<single_type>0, %1
+     [r , m ; load_8             , *        , *] ldr\t%<single_wx>0, %1
+     [m , Dz; store_8            , *        , *] str\t<single_wx>zr, %0
+     [m , w ; neon_store1_1reg<q>, *        , *] str\t%<single_type>1, %0
+     [m , r ; store_8            , *        , *] str\t%<single_wx>1, %0
      [w , w ; neon_logic<q>      , simd     , *] mov\t%0.<Vbtype>, %1.<Vbtype>
-     [w , w ; neon_logic<q>      , *        , *] fmov\t%d0, %d1
-     [?r, w ; neon_to_gp<q>      , base_simd, *] umov\t%0, %1.d[0]
-     [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%x0, %d1
-     [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
-     [?r, r ; mov_reg            , *        , *] mov\t%0, %1
+     [w , w ; neon_logic<q>      , *        , *] fmov\t%<single_type>0, %<single_type>1
+     [?r, w ; neon_to_gp<q>      , base_simd, *] umov\t%<single_wx>0, %1.<single_type>[0]
+     [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%<single_wx>0, %<single_type>1
+     [?w, r ; f_mcr              , *        , *] fmov\t%<single_type>0, %<single_wx>1
+     [?r, r ; mov_reg            , *        , *] mov\t%<single_wx>0, %<single_wx>1
      [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_immediate (operands[1], 64);
-     [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
+     [w , Dz; f_mcr              , *        , *] fmov\t%<single_type>0, <single_wx>zr
      [w , Dx; neon_move          , simd     , 8] #
   }
   "CONST_INT_P (operands[1])
@@ -323,45 +323,45 @@ (define_insn "aarch64_simd_mov_from_<mode>high"
 )
 
 (define_insn "orn<mode>3<vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (ior:VDQ_I (not:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w"))
-		(match_operand:VDQ_I 2 "register_operand" "w")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (ior:VDQS_I (not:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w"))
+		(match_operand:VDQS_I 2 "register_operand" "w")))]
  "TARGET_SIMD"
  "orn\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>"
   [(set_attr "type" "neon_logic<q>")]
 )
 
 (define_insn "bic<mode>3<vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (and:VDQ_I (not:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w"))
-		(match_operand:VDQ_I 2 "register_operand" "w")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (and:VDQS_I (not:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w"))
+		(match_operand:VDQS_I 2 "register_operand" "w")))]
  "TARGET_SIMD"
  "bic\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>"
   [(set_attr "type" "neon_logic<q>")]
 )
 
 (define_insn "add<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (plus:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		  (match_operand:VDQ_I 2 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+        (plus:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		  (match_operand:VDQS_I 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "add\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_add<q>")]
 )
 
 (define_insn "sub<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (minus:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		   (match_operand:VDQ_I 2 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+        (minus:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		   (match_operand:VDQS_I 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "sub\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_sub<q>")]
 )
 
 (define_insn "mul<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-        (mult:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")
-		   (match_operand:VDQ_BHSI 2 "register_operand" "w")))]
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+        (mult:VDQS_BHSI (match_operand:VDQS_BHSI 1 "register_operand" "w")
+		   (match_operand:VDQS_BHSI 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_mul_<Vetype><q>")]
@@ -778,16 +778,16 @@ (define_insn "*aarch64_mul3_elt_to_64v2df"
 )
 
 (define_insn "neg<mode>2<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-	(neg:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+	(neg:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "neg\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_neg<q>")]
 )
 
 (define_insn "abs<mode>2<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (abs:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+        (abs:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "abs\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_abs<q>")]
@@ -797,9 +797,9 @@ (define_insn "abs<mode>2<vczle><vczbe>"
 ;; combine with any operation with an integrated ABS step, such
 ;; as SABD.
 (define_insn "aarch64_abs<mode><vczle><vczbe>"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w")
-	  (unspec:VSDQ_I_DI
-	    [(match_operand:VSDQ_I_DI 1 "register_operand" "w")]
+  [(set (match_operand:VSDQS_I_DI 0 "register_operand" "=w")
+	  (unspec:VSDQS_I_DI
+	    [(match_operand:VSDQS_I_DI 1 "register_operand" "w")]
 	   UNSPEC_ABS))]
   "TARGET_SIMD"
   "abs\t%<v>0<Vmtype>, %<v>1<Vmtype>"
@@ -812,12 +812,12 @@ (define_insn "aarch64_abs<mode><vczle><vczbe>"
 ;; Whereas SABD would return 192 (-64 signed) on the above example.
 ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
 (define_insn "aarch64_<su>abd<mode><vczle><vczbe>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-	(minus:VDQ_BHSI
-	  (USMAX:VDQ_BHSI
-	    (match_operand:VDQ_BHSI 1 "register_operand" "w")
-	    (match_operand:VDQ_BHSI 2 "register_operand" "w"))
-	  (<max_opp>:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+	(minus:VDQS_BHSI
+	  (USMAX:VDQS_BHSI
+	    (match_operand:VDQS_BHSI 1 "register_operand" "w")
+	    (match_operand:VDQS_BHSI 2 "register_operand" "w"))
+	  (<max_opp>:VDQS_BHSI
 	    (match_dup 1)
 	    (match_dup 2))))]
   "TARGET_SIMD"
@@ -826,10 +826,10 @@ (define_insn "aarch64_<su>abd<mode><vczle><vczbe>"
 )
 
 (define_expand "<su>abd<mode>3"
-  [(match_operand:VDQ_BHSI 0 "register_operand")
-   (USMAX:VDQ_BHSI
-     (match_operand:VDQ_BHSI 1 "register_operand")
-     (match_operand:VDQ_BHSI 2 "register_operand"))]
+  [(match_operand:VDQS_BHSI 0 "register_operand")
+   (USMAX:VDQS_BHSI
+     (match_operand:VDQS_BHSI 1 "register_operand")
+     (match_operand:VDQS_BHSI 2 "register_operand"))]
   "TARGET_SIMD"
   {
     emit_insn (gen_aarch64_<su>abd<mode> (operands[0], operands[1], operands[2]));
@@ -1092,15 +1092,15 @@ (define_expand "<su>sadv16qi"
 )
 
 (define_insn "aarch64_<su>aba<mode><vczle><vczbe>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-	(plus:VDQ_BHSI (minus:VDQ_BHSI
-			 (USMAX:VDQ_BHSI
-			   (match_operand:VDQ_BHSI 2 "register_operand" "w")
-			   (match_operand:VDQ_BHSI 3 "register_operand" "w"))
-			 (<max_opp>:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+	(plus:VDQS_BHSI (minus:VDQS_BHSI
+			 (USMAX:VDQS_BHSI
+			   (match_operand:VDQS_BHSI 2 "register_operand" "w")
+			   (match_operand:VDQS_BHSI 3 "register_operand" "w"))
+			 (<max_opp>:VDQS_BHSI
 			   (match_dup 2)
 			   (match_dup 3)))
-		       (match_operand:VDQ_BHSI 1 "register_operand" "0")))]
+		       (match_operand:VDQS_BHSI 1 "register_operand" "0")))]
   "TARGET_SIMD"
   "<su>aba\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>"
   [(set_attr "type" "neon_arith_acc<q>")]
@@ -1119,9 +1119,9 @@ (define_insn "fabd<mode>3<vczle><vczbe>"
 
 ;; For AND (vector, register) and BIC (vector, immediate)
 (define_insn "and<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand")
-	(and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
-		   (match_operand:VDQ_I 2 "aarch64_reg_or_bic_imm")))]
+  [(set (match_operand:VDQS_I 0 "register_operand")
+	(and:VDQS_I (match_operand:VDQS_I 1 "register_operand")
+		   (match_operand:VDQS_I 2 "aarch64_reg_or_bic_imm")))]
   "TARGET_SIMD"
   {@ [ cons: =0 , 1 , 2   ]
      [ w        , w , w   ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
@@ -1132,9 +1132,9 @@ (define_insn "and<mode>3<vczle><vczbe>"
 
 ;; For ORR (vector, register) and ORR (vector, immediate)
 (define_insn "ior<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand")
-	(ior:VDQ_I (match_operand:VDQ_I 1 "register_operand")
-		   (match_operand:VDQ_I 2 "aarch64_orr_imm_sve_advsimd")))]
+  [(set (match_operand:VDQS_I 0 "register_operand")
+	(ior:VDQS_I (match_operand:VDQS_I 1 "register_operand")
+		   (match_operand:VDQS_I 2 "aarch64_orr_imm_sve_advsimd")))]
   "TARGET_SIMD"
   {@ [ cons: =0 , 1 , 2; attrs: arch ]
      [ w        , w , w  ; simd      ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
@@ -1147,28 +1147,28 @@ (define_insn "ior<mode>3<vczle><vczbe>"
 )
 
 (define_insn "xor<mode>3<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (xor:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		 (match_operand:VDQ_I 2 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+        (xor:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		 (match_operand:VDQS_I 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "eor\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"
   [(set_attr "type" "neon_logic<q>")]
 )
 
 (define_insn "one_cmpl<mode>2<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-        (not:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")))]
+  [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+        (not:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "not\t%0.<Vbtype>, %1.<Vbtype>"
   [(set_attr "type" "neon_logic<q>")]
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALLS_F16 0 "register_operand" "=w,w,w")
+	(vec_merge:VALLS_F16
+	    (vec_duplicate:VALLS_F16
 		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	    (match_operand:VALLS_F16 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
   {
@@ -1190,10 +1190,10 @@ (define_insn "aarch64_simd_vec_set<mode>"
 )
 
 (define_insn "aarch64_simd_vec_set_zero<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "")
-	    (match_operand:VALL_F16 3 "register_operand" "0")
+  [(set (match_operand:VALLS_F16 0 "register_operand" "=w")
+	(vec_merge:VALLS_F16
+	    (match_operand:VALLS_F16 1 "aarch64_simd_imm_zero" "")
+	    (match_operand:VALLS_F16 3 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
   {
@@ -1204,14 +1204,14 @@ (define_insn "aarch64_simd_vec_set_zero<mode>"
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALLS_F16 0 "register_operand" "=w")
+	(vec_merge:VALLS_F16
+	    (vec_duplicate:VALLS_F16
 	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VALLS_F16 3 "register_operand" "w")
 		(parallel
 		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VALLS_F16 1 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
   {
@@ -1262,18 +1262,18 @@ (define_expand "signbit<mode>2"
 })
 
 (define_insn "aarch64_simd_lshr<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		     (match_operand:VDQ_I  2 "aarch64_simd_rshift_imm" "Dr")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (lshiftrt:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		     (match_operand:VDQS_I  2 "aarch64_simd_rshift_imm" "Dr")))]
  "TARGET_SIMD"
  "ushr\t%0.<Vtype>, %1.<Vtype>, %2"
   [(set_attr "type" "neon_shift_imm<q>")]
 )
 
 (define_insn "aarch64_simd_ashr<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand")
-       (ashiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand")
-		     (match_operand:VDQ_I  2 "aarch64_simd_rshift_imm")))]
+ [(set (match_operand:VDQS_I 0 "register_operand")
+       (ashiftrt:VDQS_I (match_operand:VDQS_I 1 "register_operand")
+		     (match_operand:VDQS_I  2 "aarch64_simd_rshift_imm")))]
  "TARGET_SIMD"
  {@ [ cons: =0 , 1 , 2  ; attrs: type        ]
     [ w        , w , D1 ; neon_compare<q>    ] cmlt\t%0.<Vtype>, %1.<Vtype>, #0
@@ -1282,12 +1282,12 @@ (define_insn "aarch64_simd_ashr<mode><vczle><vczbe>"
 )
 
 (define_insn "aarch64_<sra_op>sra_n<mode>_insn"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-	(plus:VDQ_I
-	   (SHIFTRT:VDQ_I
-		(match_operand:VDQ_I 2 "register_operand" "w")
-		(match_operand:VDQ_I 3 "aarch64_simd_rshift_imm"))
-	   (match_operand:VDQ_I 1 "register_operand" "0")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+	(plus:VDQS_I
+	   (SHIFTRT:VDQS_I
+		(match_operand:VDQS_I 2 "register_operand" "w")
+		(match_operand:VDQS_I 3 "aarch64_simd_rshift_imm"))
+	   (match_operand:VDQS_I 1 "register_operand" "0")))]
   "TARGET_SIMD"
   "<sra_op>sra\t%<v>0<Vmtype>, %<v>2<Vmtype>, %3"
   [(set_attr "type" "neon_shift_acc<q>")]
@@ -1311,12 +1311,12 @@ (define_insn "aarch64_<sra_op>rsra_n<mode>_insn"
 )
 
 (define_expand "aarch64_<sra_op>sra_n<mode>"
- [(set (match_operand:VDQ_I 0 "register_operand")
-	(plus:VDQ_I
-	   (SHIFTRT:VDQ_I
-		(match_operand:VDQ_I 2 "register_operand")
+ [(set (match_operand:VDQS_I 0 "register_operand")
+	(plus:VDQS_I
+	   (SHIFTRT:VDQS_I
+		(match_operand:VDQS_I 2 "register_operand")
 		(match_operand:SI 3 "aarch64_simd_shift_imm_offset_<ve_mode>"))
-	   (match_operand:VDQ_I 1 "register_operand")))]
+	   (match_operand:VDQS_I 1 "register_operand")))]
   "TARGET_SIMD"
   {
     operands[3]
@@ -1352,27 +1352,27 @@ (define_expand "aarch64_<sra_op>rsra_n<mode>"
 )
 
 (define_insn "aarch64_simd_imm_shl<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		   (match_operand:VDQ_I  2 "aarch64_simd_lshift_imm" "Dl")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (ashift:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		   (match_operand:VDQS_I  2 "aarch64_simd_lshift_imm" "Dl")))]
  "TARGET_SIMD"
   "shl\t%0.<Vtype>, %1.<Vtype>, %2"
   [(set_attr "type" "neon_shift_imm<q>")]
 )
 
 (define_insn "aarch64_simd_reg_sshl<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		   (match_operand:VDQ_I 2 "register_operand" "w")))]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (ashift:VDQS_I (match_operand:VDQS_I 1 "register_operand" "w")
+		   (match_operand:VDQS_I 2 "register_operand" "w")))]
  "TARGET_SIMD"
  "sshl\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_shift_reg<q>")]
 )
 
 (define_insn "aarch64_simd_reg_shl<mode>_unsigned<vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")
-		    (match_operand:VDQ_I 2 "register_operand" "w")]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (unspec:VDQS_I [(match_operand:VDQS_I 1 "register_operand" "w")
+		    (match_operand:VDQS_I 2 "register_operand" "w")]
 		   UNSPEC_ASHIFT_UNSIGNED))]
  "TARGET_SIMD"
  "ushl\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
@@ -1380,9 +1380,9 @@ (define_insn "aarch64_simd_reg_shl<mode>_unsigned<vczle><vczbe>"
 )
 
 (define_insn "aarch64_simd_reg_shl<mode>_signed<vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")
-		    (match_operand:VDQ_I 2 "register_operand" "w")]
+ [(set (match_operand:VDQS_I 0 "register_operand" "=w")
+       (unspec:VDQS_I [(match_operand:VDQS_I 1 "register_operand" "w")
+		    (match_operand:VDQS_I 2 "register_operand" "w")]
 		   UNSPEC_ASHIFT_SIGNED))]
  "TARGET_SIMD"
  "sshl\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
@@ -1390,8 +1390,8 @@ (define_insn "aarch64_simd_reg_shl<mode>_signed<vczle><vczbe>"
 )
 
 (define_expand "ashl<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand")
-   (match_operand:VDQ_I 1 "register_operand")
+  [(match_operand:VDQS_I 0 "register_operand")
+   (match_operand:VDQS_I 1 "register_operand")
    (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
@@ -1423,8 +1423,8 @@ (define_expand "ashl<mode>3"
 })
 
 (define_expand "lshr<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand")
-   (match_operand:VDQ_I 1 "register_operand")
+  [(match_operand:VDQS_I 0 "register_operand")
+   (match_operand:VDQS_I 1 "register_operand")
    (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
@@ -1458,8 +1458,8 @@ (define_expand "lshr<mode>3"
 })
 
 (define_expand "ashr<mode>3"
-  [(match_operand:VDQ_I 0 "register_operand")
-   (match_operand:VDQ_I 1 "register_operand")
+  [(match_operand:VDQS_I 0 "register_operand")
+   (match_operand:VDQS_I 1 "register_operand")
    (match_operand:SI  2 "general_operand")]
  "TARGET_SIMD"
 {
@@ -1493,9 +1493,9 @@ (define_expand "ashr<mode>3"
 })
 
 (define_expand "vashl<mode>3"
- [(match_operand:VDQ_I 0 "register_operand")
-  (match_operand:VDQ_I 1 "register_operand")
-  (match_operand:VDQ_I 2 "register_operand")]
+ [(match_operand:VDQS_I 0 "register_operand")
+  (match_operand:VDQS_I 1 "register_operand")
+  (match_operand:VDQS_I 2 "register_operand")]
  "TARGET_SIMD"
 {
   emit_insn (gen_aarch64_simd_reg_sshl<mode> (operands[0], operands[1],
@@ -1504,9 +1504,9 @@ (define_expand "vashl<mode>3"
 })
 
 (define_expand "vashr<mode>3"
- [(match_operand:VDQ_I 0 "register_operand")
-  (match_operand:VDQ_I 1 "register_operand")
-  (match_operand:VDQ_I 2 "register_operand")]
+ [(match_operand:VDQS_I 0 "register_operand")
+  (match_operand:VDQS_I 1 "register_operand")
+  (match_operand:VDQS_I 2 "register_operand")]
  "TARGET_SIMD"
 {
   rtx neg = gen_reg_rtx (<MODE>mode);
@@ -1534,9 +1534,9 @@ (define_expand "aarch64_ashr_simddi"
 )
 
 (define_expand "vlshr<mode>3"
- [(match_operand:VDQ_I 0 "register_operand")
-  (match_operand:VDQ_I 1 "register_operand")
-  (match_operand:VDQ_I 2 "register_operand")]
+ [(match_operand:VDQS_I 0 "register_operand")
+  (match_operand:VDQS_I 1 "register_operand")
+  (match_operand:VDQS_I 2 "register_operand")]
  "TARGET_SIMD"
 {
   rtx neg = gen_reg_rtx (<MODE>mode);
@@ -1577,7 +1577,7 @@ (define_insn "vec_shr_<mode><vczle><vczbe>"
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALLS_F16 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
@@ -1591,11 +1591,11 @@ (define_expand "vec_set<mode>"
 
 
 (define_insn "aarch64_mla<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-       (plus:VDQ_BHSI (mult:VDQ_BHSI
-			(match_operand:VDQ_BHSI 2 "register_operand" "w")
-			(match_operand:VDQ_BHSI 3 "register_operand" "w"))
-		      (match_operand:VDQ_BHSI 1 "register_operand" "0")))]
+ [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+       (plus:VDQS_BHSI (mult:VDQS_BHSI
+			(match_operand:VDQS_BHSI 2 "register_operand" "w")
+			(match_operand:VDQS_BHSI 3 "register_operand" "w"))
+		      (match_operand:VDQS_BHSI 1 "register_operand" "0")))]
  "TARGET_SIMD"
  "mla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>"
   [(set_attr "type" "neon_mla_<Vetype><q>")]
@@ -1651,10 +1651,10 @@ (define_insn "aarch64_mla_n<mode><vczle><vczbe>"
 )
 
 (define_insn "aarch64_mls<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-       (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0")
-		   (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w")
-			      (match_operand:VDQ_BHSI 3 "register_operand" "w"))))]
+ [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+       (minus:VDQS_BHSI (match_operand:VDQS_BHSI 1 "register_operand" "0")
+		   (mult:VDQS_BHSI (match_operand:VDQS_BHSI 2 "register_operand" "w")
+			      (match_operand:VDQS_BHSI 3 "register_operand" "w"))))]
  "TARGET_SIMD"
  "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>"
   [(set_attr "type" "neon_mla_<Vetype><q>")]
@@ -1711,9 +1711,9 @@ (define_insn "aarch64_mls_n<mode><vczle><vczbe>"
 
 ;; Max/Min operations.
 (define_insn "<su><maxmin><mode>3<vczle><vczbe>"
- [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-       (MAXMIN:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")
-		    (match_operand:VDQ_BHSI 2 "register_operand" "w")))]
+ [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+       (MAXMIN:VDQS_BHSI (match_operand:VDQS_BHSI 1 "register_operand" "w")
+		    (match_operand:VDQS_BHSI 2 "register_operand" "w")))]
  "TARGET_SIMD"
  "<su><maxmin>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_minmax<q>")]
@@ -3480,24 +3480,24 @@ (define_insn "*aarch64_<su>addlp<mode><vczle><vczbe>_insn"
 )
 
 (define_insn "clrsb<mode>2<vczle><vczbe>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-        (clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+        (clrsb:VDQS_BHSI (match_operand:VDQS_BHSI 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "cls\\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_cls<q>")]
 )
 
 (define_insn "clz<mode>2<vczle><vczbe>"
- [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-       (clz:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
+ [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+       (clz:VDQS_BHSI (match_operand:VDQS_BHSI 1 "register_operand" "w")))]
  "TARGET_SIMD"
  "clz\\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_cls<q>")]
 )
 
 (define_insn "popcount<mode>2<vczle><vczbe>"
-  [(set (match_operand:VB 0 "register_operand" "=w")
-        (popcount:VB (match_operand:VB 1 "register_operand" "w")))]
+  [(set (match_operand:VB_WS 0 "register_operand" "=w")
+        (popcount:VB_WS (match_operand:VB_WS 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "cnt\\t%0.<Vbtype>, %1.<Vbtype>"
   [(set_attr "type" "neon_cnt<q>")]
@@ -3636,13 +3636,13 @@ (define_insn "aarch64_reduc_<optab>_internal<mode>"
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand")
-	(xor:VDQ_I
-	   (and:VDQ_I
-	     (xor:VDQ_I
+  [(set (match_operand:VDQS_I 0 "register_operand")
+	(xor:VDQS_I
+	   (and:VDQS_I
+	     (xor:VDQS_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand")
-	       (match_operand:VDQ_I 2 "register_operand"))
-	     (match_operand:VDQ_I 1 "register_operand"))
+	       (match_operand:VDQS_I 2 "register_operand"))
+	     (match_operand:VDQS_I 1 "register_operand"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -3661,13 +3661,13 @@ (define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>"
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt<vczle><vczbe>"
-  [(set (match_operand:VDQ_I 0 "register_operand")
-	(xor:VDQ_I
-	   (and:VDQ_I
-	     (xor:VDQ_I
-	       (match_operand:VDQ_I 3 "register_operand")
+  [(set (match_operand:VDQS_I 0 "register_operand")
+	(xor:VDQS_I
+	   (and:VDQS_I
+	     (xor:VDQS_I
+	       (match_operand:VDQS_I 3 "register_operand")
 	       (match_operand:<V_INT_EQUIV> 2 "register_operand"))
-	      (match_operand:VDQ_I 1 "register_operand"))
+	      (match_operand:VDQS_I 1 "register_operand"))
 	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   {@ [ cons: =0 , 1 , 2 , 3  ]
@@ -3771,10 +3771,10 @@ (define_insn_and_split "aarch64_simd_bsldi_alt"
 )
 
 (define_expand "aarch64_simd_bsl<mode>"
-  [(match_operand:VALLDIF 0 "register_operand")
+  [(match_operand:VALLSDIF 0 "register_operand")
    (match_operand:<V_INT_EQUIV> 1 "register_operand")
-   (match_operand:VALLDIF 2 "register_operand")
-   (match_operand:VALLDIF 3 "register_operand")]
+   (match_operand:VALLSDIF 2 "register_operand")
+   (match_operand:VALLSDIF 3 "register_operand")]
  "TARGET_SIMD"
 {
   /* We can't alias operands together if they have different modes.  */
@@ -3797,9 +3797,9 @@ (define_expand "aarch64_simd_bsl<mode>"
 })
 
 (define_expand "vcond_mask_<mode><v_int_equiv>"
-  [(match_operand:VALLDI 0 "register_operand")
-   (match_operand:VALLDI 1 "nonmemory_operand")
-   (match_operand:VALLDI 2 "nonmemory_operand")
+  [(match_operand:VALLSDI 0 "register_operand")
+   (match_operand:VALLSDI 1 "nonmemory_operand")
+   (match_operand:VALLSDI 2 "nonmemory_operand")
    (match_operand:<V_INT_EQUIV> 3 "register_operand")]
   "TARGET_SIMD"
 {
@@ -3831,8 +3831,8 @@ (define_expand "cbranch<mode>4"
   [(set (pc)
         (if_then_else
           (match_operator 0 "aarch64_equality_operator"
-            [(match_operand:VDQ_I 1 "register_operand")
-             (match_operand:VDQ_I 2 "aarch64_simd_reg_or_zero")])
+            [(match_operand:VDQS_I 1 "register_operand")
+             (match_operand:VDQS_I 2 "aarch64_simd_reg_or_zero")])
           (label_ref (match_operand 3 ""))
           (pc)))]
   "TARGET_SIMD"
@@ -3857,12 +3857,15 @@ (define_expand "cbranch<mode>4"
       emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
       emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
     }
+  auto mode = DImode;
+  if (known_eq (32, GET_MODE_BITSIZE (<MODE>mode)))
+    mode = SImode;
 
-  rtx val = gen_reg_rtx (DImode);
-  emit_move_insn (val, gen_lowpart (DImode, tmp));
+  rtx val = gen_reg_rtx (mode);
+  emit_move_insn (val, gen_lowpart (mode, tmp));
 
   rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
-  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+  rtx cmp_rtx = gen_rtx_fmt_ee (code, mode, cc_reg, const0_rtx);
   emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
   DONE;
 })
@@ -3870,10 +3873,10 @@ (define_expand "cbranch<mode>4"
 ;; Patterns comparing two vectors to produce a mask.
 
 (define_expand "vec_cmp<mode><mode>"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand")
+  [(set (match_operand:VSDQS_I_DI 0 "register_operand")
 	  (match_operator 1 "comparison_operator"
-	    [(match_operand:VSDQ_I_DI 2 "register_operand")
-	     (match_operand:VSDQ_I_DI 3 "nonmemory_operand")]))]
+	    [(match_operand:VSDQS_I_DI 2 "register_operand")
+	     (match_operand:VSDQS_I_DI 3 "nonmemory_operand")]))]
   "TARGET_SIMD"
 {
   rtx mask = operands[0];
@@ -4123,10 +4126,10 @@ (define_expand "vec_cmp<mode><v_int_equiv>"
 })
 
 (define_expand "vec_cmpu<mode><mode>"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand")
+  [(set (match_operand:VSDQS_I_DI 0 "register_operand")
 	  (match_operator 1 "comparison_operator"
-	    [(match_operand:VSDQ_I_DI 2 "register_operand")
-	     (match_operand:VSDQ_I_DI 3 "nonmemory_operand")]))]
+	    [(match_operand:VSDQS_I_DI 2 "register_operand")
+	     (match_operand:VSDQS_I_DI 3 "nonmemory_operand")]))]
   "TARGET_SIMD"
 {
   emit_insn (gen_vec_cmp<mode><mode> (operands[0], operands[1],
@@ -4135,13 +4138,13 @@ (define_expand "vec_cmpu<mode><mode>"
 })
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VALLDI 0 "register_operand")
-	(if_then_else:VALLDI
+  [(set (match_operand:VALLSDI 0 "register_operand")
+	(if_then_else:VALLSDI
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:VALLDI 4 "register_operand")
-	     (match_operand:VALLDI 5 "nonmemory_operand")])
-	  (match_operand:VALLDI 1 "nonmemory_operand")
-	  (match_operand:VALLDI 2 "nonmemory_operand")))]
+	    [(match_operand:VALLSDI 4 "register_operand")
+	     (match_operand:VALLSDI 5 "nonmemory_operand")])
+	  (match_operand:VALLSDI 1 "nonmemory_operand")
+	  (match_operand:VALLSDI 2 "nonmemory_operand")))]
   "TARGET_SIMD"
 {
   rtx mask = gen_reg_rtx (<V_INT_EQUIV>mode);
@@ -4295,7 +4298,7 @@ (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
 (define_insn_and_split "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
+	  (match_operand:VALLS_F16 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -4794,14 +4797,14 @@ (define_expand "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w2<mode>"
 ;; <su><r>h<addsub>.
 
 (define_expand "<su_optab>avg<mode>3_floor"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand")
-	(truncate:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand")
+	(truncate:VDQS_BHSI
 	  (ashiftrt:<V2XWIDE>
 	    (plus:<V2XWIDE>
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 1 "register_operand"))
+		(match_operand:VDQS_BHSI 1 "register_operand"))
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 2 "register_operand")))
+		(match_operand:VDQS_BHSI 2 "register_operand")))
 	    (match_dup 3))))]
   "TARGET_SIMD"
   {
@@ -4810,15 +4813,15 @@ (define_expand "<su_optab>avg<mode>3_floor"
 )
 
 (define_expand "<su_optab>avg<mode>3_ceil"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand")
-	(truncate:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand")
+	(truncate:VDQS_BHSI
 	  (ashiftrt:<V2XWIDE>
 	    (plus:<V2XWIDE>
 	      (plus:<V2XWIDE>
 		(ANY_EXTEND:<V2XWIDE>
-		  (match_operand:VDQ_BHSI 1 "register_operand"))
+		  (match_operand:VDQS_BHSI 1 "register_operand"))
 		(ANY_EXTEND:<V2XWIDE>
-		  (match_operand:VDQ_BHSI 2 "register_operand")))
+		  (match_operand:VDQS_BHSI 2 "register_operand")))
 	       (match_dup 3))
 	    (match_dup 3))))]
   "TARGET_SIMD"
@@ -4828,14 +4831,14 @@ (define_expand "<su_optab>avg<mode>3_ceil"
 )
 
 (define_expand "aarch64_<su>hsub<mode>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand")
-	(truncate:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand")
+	(truncate:VDQS_BHSI
 	  (ashiftrt:<V2XWIDE>
 	    (minus:<V2XWIDE>
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 1 "register_operand"))
+		(match_operand:VDQS_BHSI 1 "register_operand"))
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 2 "register_operand")))
+		(match_operand:VDQS_BHSI 2 "register_operand")))
 	    (match_dup 3))))]
   "TARGET_SIMD"
   {
@@ -4844,14 +4847,14 @@ (define_expand "aarch64_<su>hsub<mode>"
 )
 
 (define_insn "*aarch64_<su>h<ADDSUB:optab><mode><vczle><vczbe>_insn"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-	(truncate:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+	(truncate:VDQS_BHSI
 	  (ashiftrt:<V2XWIDE>
 	    (ADDSUB:<V2XWIDE>
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 1 "register_operand" "w"))
+		(match_operand:VDQS_BHSI 1 "register_operand" "w"))
 	      (ANY_EXTEND:<V2XWIDE>
-		(match_operand:VDQ_BHSI 2 "register_operand" "w")))
+		(match_operand:VDQS_BHSI 2 "register_operand" "w")))
 	    (match_operand:<V2XWIDE> 3 "aarch64_simd_imm_one"))))]
   "TARGET_SIMD"
   "<su>h<ADDSUB:optab>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
@@ -4859,15 +4862,15 @@ (define_insn "*aarch64_<su>h<ADDSUB:optab><mode><vczle><vczbe>_insn"
 )
 
 (define_insn "*aarch64_<su>rhadd<mode><vczle><vczbe>_insn"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-	(truncate:VDQ_BHSI
+  [(set (match_operand:VDQS_BHSI 0 "register_operand" "=w")
+	(truncate:VDQS_BHSI
 	  (ashiftrt:<V2XWIDE>
 	    (plus:<V2XWIDE>
 	      (plus:<V2XWIDE>
 		(ANY_EXTEND:<V2XWIDE>
-		  (match_operand:VDQ_BHSI 1 "register_operand" "w"))
+		  (match_operand:VDQS_BHSI 1 "register_operand" "w"))
 		(ANY_EXTEND:<V2XWIDE>
-		  (match_operand:VDQ_BHSI 2 "register_operand" "w")))
+		  (match_operand:VDQS_BHSI 2 "register_operand" "w")))
 	       (match_operand:<V2XWIDE> 3 "aarch64_simd_imm_one"))
 	    (match_dup 3))))]
   "TARGET_SIMD"
@@ -7028,8 +7031,8 @@ (define_insn "aarch64_cm<optab><mode><vczle><vczbe>"
   [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
 	(neg:<V_INT_EQUIV>
 	  (COMPARISONS:<V_INT_EQUIV>
-	    (match_operand:VDQ_I 1 "register_operand")
-	    (match_operand:VDQ_I 2 "aarch64_simd_reg_or_zero")
+	    (match_operand:VDQS_I 1 "register_operand")
+	    (match_operand:VDQS_I 2 "aarch64_simd_reg_or_zero")
 	  )))]
   "TARGET_SIMD"
   {@ [ cons: =0 , 1 , 2   ; attrs: type           ]
@@ -7093,8 +7096,8 @@ (define_insn "aarch64_cm<optab><mode><vczle><vczbe>"
   [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
 	(neg:<V_INT_EQUIV>
 	  (UCOMPARISONS:<V_INT_EQUIV>
-	    (match_operand:VDQ_I 1 "register_operand" "w")
-	    (match_operand:VDQ_I 2 "register_operand" "w")
+	    (match_operand:VDQS_I 1 "register_operand" "w")
+	    (match_operand:VDQS_I 2 "register_operand" "w")
 	  )))]
   "TARGET_SIMD"
   "cm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>"
@@ -7160,10 +7163,10 @@ (define_insn "aarch64_cmtst<mode><vczle><vczbe>"
   [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
 	(plus:<V_INT_EQUIV>
 	  (eq:<V_INT_EQUIV>
-	    (and:VDQ_I
-	      (match_operand:VDQ_I 1 "register_operand" "w")
-	      (match_operand:VDQ_I 2 "register_operand" "w"))
-	    (match_operand:VDQ_I 3 "aarch64_simd_imm_zero"))
+	    (and:VDQS_I
+	      (match_operand:VDQS_I 1 "register_operand" "w")
+	      (match_operand:VDQS_I 2 "register_operand" "w"))
+	    (match_operand:VDQS_I 3 "aarch64_simd_imm_zero"))
 	  (match_operand:<V_INT_EQUIV> 4 "aarch64_simd_imm_minus_one")))
   ]
   "TARGET_SIMD"
@@ -7179,8 +7182,8 @@ (define_insn "*aarch64_cmtst_same_<mode><vczle><vczbe>"
   [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
 	(plus:<V_INT_EQUIV>
 	  (eq:<V_INT_EQUIV>
-	    (match_operand:VDQ_I 1 "register_operand" "w")
-	    (match_operand:VDQ_I 2 "aarch64_simd_imm_zero"))
+	    (match_operand:VDQS_I 1 "register_operand" "w")
+	    (match_operand:VDQS_I 2 "aarch64_simd_imm_zero"))
 	  (match_operand:<V_INT_EQUIV> 3 "aarch64_simd_imm_minus_one")))
   ]
   "TARGET_SIMD"
@@ -8388,10 +8391,10 @@ (define_expand "aarch64_ld<nregs>_lane<vstruct_elt>"
 ;; vec_perm support
 
 (define_expand "vec_perm<mode>"
-  [(match_operand:VB 0 "register_operand")
-   (match_operand:VB 1 "register_operand")
-   (match_operand:VB 2 "register_operand")
-   (match_operand:VB 3 "register_operand")]
+  [(match_operand:VB_WS 0 "register_operand")
+   (match_operand:VB_WS 1 "register_operand")
+   (match_operand:VB_WS 2 "register_operand")
+   (match_operand:VB_WS 3 "register_operand")]
   "TARGET_SIMD"
 {
   aarch64_expand_vec_perm (operands[0], operands[1],
@@ -8675,7 +8678,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
 ;; Standard pattern name vec_init<mode><Vel>.
 
 (define_expand "vec_init<mode><Vel>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALLS_F16 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
@@ -8692,9 +8695,18 @@ (define_expand "vec_init<mode><Vhalf>"
   DONE;
 })
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:VD_NO2E 0 "register_operand")
+   (match_operand 1 "" "")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vector_init (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_simd_ld1r<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VALLS_F16 0 "register_operand" "=w")
+	(vec_duplicate:VALLS_F16
 	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
   "TARGET_SIMD"
   "ld1r\\t{%0.<Vtype>}, %1"
@@ -8754,7 +8766,7 @@ (define_insn "aarch64_urecpe<mode>"
 
 (define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:VALLS_F16 1 "register_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
@@ -9881,3 +9893,120 @@ (define_insn "aarch64_bfcvtsf"
   "shl\\t%d0, %d1, #16"
   [(set_attr "type" "neon_shift_imm")]
 )
+
+;; Special patterns for the V4QI and V2HI modes.
+
+;; TODO: vec_extract V8QI->V4QI/V4HI->V2HI
+
+;; Extends.
+;; V2HI -> V2SI and V4QI -> V4HI are implemented via the wider
+;; V4HI -> V4SI and V8QI -> V8HI patterns respectively.
+(define_expand "<optab><mode><Vwide>2"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(ANY_EXTEND:<VWIDE> (match_operand:VH_I 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  {
+    auto widemode = (<MODE>mode == V2HImode) ? V4SImode : V8HImode;
+    auto op1mode = (<MODE>mode == V2HImode) ? V4HImode : V8QImode;
+    rtx op1 = gen_lowpart (op1mode, operands[1]);
+    rtx op0 = gen_reg_rtx (widemode);
+    if (<MODE>mode == V2HImode)
+      emit_insn (gen_<optab>v4hiv4si2 (op0, op1));
+    else
+      emit_insn (gen_<optab>v8qiv8hi2 (op0, op1));
+    emit_move_insn (operands[0], gen_lowpart (<VWIDE>mode, op0));
+    DONE;
+  }
+)
+
+(define_expand "<optab>v4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(ANY_EXTEND:V4SI (match_operand:V4QI 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (V4HImode);
+    emit_insn (gen_<optab>v4qiv4hi2 (tmp, operands[1]));
+    emit_insn (gen_<optab>v4hiv4si2 (operands[0], tmp));
+    DONE;
+  }
+)
+
+;; Truncates.
+(define_insn "truncv4hiv4qi2"
+  [(set (match_operand:V4QI 0 "register_operand" "=w")
+	(truncate:V4QI (match_operand:V4HI 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "xtn\t%0.8b, %1.8h"
+  [(set_attr "type" "neon_move_narrow_q")]
+)
+
+(define_expand "truncv4siv4qi2"
+  [(set (match_operand:V4QI 0 "register_operand")
+	(truncate:V4QI (match_operand:V4SI 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (V4HImode);
+    emit_insn (gen_truncv4siv4hi2 (tmp, operands[1]));
+    emit_insn (gen_truncv4hiv4qi2 (operands[0], tmp));
+    DONE;
+  }
+)
+
+;; Widening sum.
+(define_insn "widen_<su>sumv4qi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=w")
+	(plus:V4HI (ANY_EXTEND:V4HI
+		    (match_operand:V4QI 1 "register_operand" "w"))
+		   (match_operand:V4HI 2 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<su>addw\t%0.8h, %2.8h, %1.8b"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+;; Reductions.
+(define_expand "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand")
+       (unspec:<VEL> [(match_operand:VH_I 1 "register_operand")]
+		    UNSPEC_ADDV))]
+ "TARGET_SIMD"
+ {
+   rtx double_reg = gen_reg_rtx (<VDBL>mode);
+   emit_insn (gen_aarch64_vec_concat<mode> (double_reg, operands[1], CONST0_RTX (<MODE>mode)));
+   emit_insn (gen_reduc_plus_scal_<Vdbl> (operands[0], double_reg));
+   DONE;
+ }
+)
+
+(define_expand "reduc_<optab>_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VH_I [(match_operand:VH_I 1 "register_operand")]
+		    MAXMINV)]
+  "TARGET_SIMD"
+  {
+    rtx double_reg = gen_reg_rtx (<VDBL>mode);
+    rtx other = operands[1];
+    /* For umax the upper half can simply be zero rather than a dup of
+       the input; this allows an fmov to be used in some cases.  */
+    if (<CODE> == UMAX)
+      other = CONST0_RTX (<MODE>mode);
+    emit_insn (gen_aarch64_vec_concat<mode> (double_reg, operands[1], other));
+    emit_insn (gen_reduc_<optab>_scal_<Vdbl> (operands[0], double_reg));
+    DONE;
+ }
+)
+;; For 32-bit modes we implement vec_shr with shl/ushr, as this does not
+;; require a SIMD zero.  The shift is done on the vN.2s view of the register.
+(define_insn "vec_shr_<mode>"
+  [(set (match_operand:VH_I 0 "register_operand" "=w")
+        (unspec:VH_I [(match_operand:VH_I 1 "register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+		   UNSPEC_VEC_SHR))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      return "shl %0.2s, %1.2s, %2";
+    else
+      return "ushr %0.2s, %1.2s, %2";
+  }
+  [(set_attr "type" "neon_shift_imm")]
+)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 7f0cc47d0f0..6f75e11ed0b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1628,6 +1628,12 @@ aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
     case E_V4x2DFmode:
       return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
 
+    /* 32-bit Advanced SIMD vectors.  */
+    case E_V4QImode:
+    case E_V2HImode:
+      if (BYTES_BIG_ENDIAN)
+	return 0;
+    /* FALLTHRU */
     /* 64-bit Advanced SIMD vectors.  */
     case E_V8QImode:
     case E_V4HImode:
@@ -1948,11 +1954,12 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
 	}
     }
 
-  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
+  /* Prefer to use 1 128-bit vector instead of 2 64-bit or 4 32-bit vectors.  */
   if (TARGET_SIMD
       && (vec_flags & VEC_ADVSIMD)
       && known_eq (nunits, 0U)
-      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
+      && (known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
+	  || known_eq (GET_MODE_BITSIZE (vector_mode), 32U))
       && maybe_ge (GET_MODE_BITSIZE (element_mode)
 		   * GET_MODE_NUNITS (vector_mode), 128U))
     {
@@ -1961,6 +1968,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
 	return res;
     }
 
+  /* Prefer to use 1 64-bit vector instead of 2 32-bit vectors.  */
+  if (TARGET_SIMD
+      && (vec_flags & VEC_ADVSIMD)
+      && known_eq (nunits, 0U)
+      && known_eq (GET_MODE_BITSIZE (vector_mode), 32U)
+      && maybe_ge (GET_MODE_BITSIZE (element_mode)
+		   * GET_MODE_NUNITS (vector_mode), 64U))
+    {
+      machine_mode res = aarch64_simd_container_mode (element_mode, 64);
+      if (VECTOR_MODE_P (res))
+	return res;
+    }
+
   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
 }
 
@@ -22237,9 +22257,23 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
       && known_eq (width, BITS_PER_SVE_VECTOR))
     return aarch64_full_sve_mode (mode).else_mode (word_mode);
 
-  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
+  gcc_assert (known_eq (width, 32)
+	      || known_eq (width, 64)
+	      || known_eq (width, 128));
   if (TARGET_BASE_SIMD)
     {
+      if (!BYTES_BIG_ENDIAN && known_eq (width, 32))
+	{
+	  switch (mode)
+	    {
+	    case E_HImode:
+	      return V2HImode;
+	    case E_QImode:
+	      return V4QImode;
+	    default:
+	      break;
+	    }
+	}
       if (known_eq (width, 128))
 	return aarch64_vq_mode (mode).else_mode (word_mode);
       else
@@ -22353,18 +22387,23 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
     V8QImode,
 
     /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
-       for wider elements.
-
-       TODO: We could support a limited form of V4QImode too, so that
-       we use 32-bit vectors for 8-bit elements.  */
+       for wider elements.  */
     V4HImode,
 
     /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
        for 64-bit elements.
 
-       TODO: We could similarly support limited forms of V2QImode and V2HImode
-       for this case.  */
-    V2SImode
+       TODO: We could similarly support limited forms of V2QImode for
+       this case.  */
+    V2SImode,
+
+    /* Try using 32-bit vectors for 8-bit elements and 128-bit vectors
+       for wider elements.  */
+    V4QImode,
+
+    /* Try using 32-bit vectors for 16-bit elements and 128-bit vectors
+       for wider elements.  */
+    V2HImode,
   };
 
   /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
@@ -22395,7 +22434,13 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
 					advsimd_modes[advsimd_i]))
 	modes->safe_push (sve_modes[sve_i++]);
       else
-	modes->safe_push (advsimd_modes[advsimd_i++]);
+	{
+	  if ((aarch64_32bit_auto_vec
+	       || (advsimd_modes[advsimd_i] != V4QImode
+		   && advsimd_modes[advsimd_i] != V2HImode)))
+	    modes->safe_push (advsimd_modes[advsimd_i]);
+	  advsimd_i++;
+	}
     }
   while (sve_i < ARRAY_SIZE (sve_modes))
    modes->safe_push (sve_modes[sve_i++]);
@@ -25427,7 +25472,7 @@ aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
   machine_mode vmode = GET_MODE (target);
   bool one_vector_p = rtx_equal_p (op0, op1);
 
-  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode || vmode == V4QImode);
   gcc_checking_assert (GET_MODE (op0) == vmode);
   gcc_checking_assert (GET_MODE (op1) == vmode);
   gcc_checking_assert (GET_MODE (sel) == vmode);
@@ -25435,7 +25480,16 @@ aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
 
   if (one_vector_p)
     {
-      if (vmode == V8QImode)
+      if (vmode == V4QImode)
+	{
+	  /* Expand the argument to a V16QI mode by duplicating it.  */
+	  rtx quad = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_simd_dupv4si (gen_lowpart (V4SImode, quad), gen_lowpart (SImode, op0)));
+	  sel = gen_lowpart (V8QImode, sel);
+	  target = gen_lowpart (V8QImode, target);
+	  emit_insn (gen_aarch64_qtbl1v8qi (target, quad, sel));
+	}
+      else if (vmode == V8QImode)
 	{
 	  /* Expand the argument to a V16QI mode by duplicating it.  */
 	  rtx pair = gen_reg_rtx (V16QImode);
@@ -25451,7 +25505,18 @@ aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
     {
       rtx pair;
 
-      if (vmode == V8QImode)
+      if (vmode == V4QImode)
+	{
+	  rtx p8 = gen_reg_rtx (V8QImode);
+	  emit_insn (gen_aarch64_vec_concatv4qi (p8, op0, op1));
+	  pair = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_aarch64_combinev8qi (pair, p8, p8));
+	  sel = gen_lowpart (V8QImode, sel);
+	  target = gen_lowpart (V8QImode, target);
+	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
+	}
+
+      else if (vmode == V8QImode)
 	{
 	  pair = gen_reg_rtx (V16QImode);
 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
@@ -25599,6 +25664,16 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
     }
   out = d->target;
 
+  /* For emulated 4-byte vectors, just use a paradoxical subreg
+     of the 8-byte vector.  */
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      vmode = vmode == V2HImode ? V4HImode : V8QImode;
+      out = gen_lowpart (vmode, out);
+      in0 = gen_lowpart (vmode, in0);
+      in1 = gen_lowpart (vmode, in1);
+    }
+
   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
   return true;
@@ -25677,6 +25752,20 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
       odd = !odd;
     }
   out = d->target;
+  /* For emulated 4-byte vectors, use 8-byte vectors: concatenate the
+     two input registers into one 8-byte register first and then do
+     the UZP on that instead.  */
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      vmode = d->vmode == V2HImode ? V4HImode : V8QImode;
+      out = gen_lowpart (vmode, out);
+      rtx double_reg = gen_reg_rtx (vmode);
+      in0 = gen_lowpart (V4QImode, in0);
+      in1 = gen_lowpart (V4QImode, in1);
+      emit_insn (gen_aarch64_vec_concatv4qi (gen_lowpart (V8QImode, double_reg), in0, in1));
+      in0 = double_reg;
+      in1 = double_reg;
+    }
 
   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
@@ -25691,6 +25780,7 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
   poly_uint64 nelt = d->perm.length ();
   rtx out, in0, in1;
   machine_mode vmode = d->vmode;
+  bool highpart32 = false;
 
   if (GET_MODE_UNIT_SIZE (vmode) > 8)
     return false;
@@ -25719,8 +25809,28 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
     }
   out = d->target;
 
+  /* For emulated 4-byte vectors, just use a paradoxical subreg of the 8-byte
+     vector.  ZIP2 is handled as ZIP1 followed by selecting the "top" half.  */
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      vmode = vmode == V2HImode ? V4HImode : V8QImode;
+      out = gen_reg_rtx (vmode);
+      in0 = gen_lowpart (vmode, in0);
+      in1 = gen_lowpart (vmode, in1);
+      highpart32 = high;
+      high = false;
+    }
+
   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
+
+
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      gcc_assert (!high);
+      gcc_assert (vmode == V4HImode || vmode == V8QImode);
+      emit_insn (gen_aarch64_get_lanev2si (gen_lowpart (SImode, d->target), gen_lowpart (V2SImode, out), highpart32 ? const1_rtx : const0_rtx));
+    }
   return true;
 }
 
@@ -25760,10 +25870,32 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
       location = d->perm.length ().to_constant () - location;
     }
 
+  rtx op0 = d->op0;
+  rtx op1 = d->op1;
+  rtx target = d->target;
+  auto vmode = d->vmode;
+  /* For emulated 4-byte vectors, use 8-byte vectors: concatenate the
+     two input registers into one 8-byte register first and then do
+     the EXT on that instead.  */
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      /* Convert V2HI locations to V4QI units; using V8QI here is simpler.  */
+      if (vmode == V2HImode)
+	location *= 2;
+      vmode = V8QImode;
+      target = gen_lowpart (V8QImode, target);
+      rtx double_reg = gen_reg_rtx (V8QImode);
+      op0 = gen_lowpart (V4QImode, op0);
+      op1 = gen_lowpart (V4QImode, op1);
+      emit_insn (gen_aarch64_vec_concatv4qi (double_reg, op0, op1));
+      op0 = double_reg;
+      op1 = double_reg;
+    }
+
   offset = GEN_INT (location);
-  emit_set_insn (d->target,
-		 gen_rtx_UNSPEC (d->vmode,
-				 gen_rtvec (3, d->op0, d->op1, offset),
+  emit_set_insn (target,
+		 gen_rtx_UNSPEC (vmode,
+				 gen_rtvec (3, op0, op1, offset),
 				 UNSPEC_EXT));
   return true;
 }
@@ -25822,8 +25954,20 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
 					 d->target, pred, d->op0));
       return true;
     }
-  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
-  emit_set_insn (d->target, src);
+
+  auto vmode = d->vmode;
+  rtx out = d->target;
+  rtx in0 = d->op0;
+  /* For emulated 4-byte vectors, just use a paradoxical subreg
+     of the 8-byte vector.  */
+  if (vmode == V2HImode || vmode == V4QImode)
+    {
+      vmode = vmode == V2HImode ? V4HImode : V8QImode;
+      out = gen_lowpart (vmode, out);
+      in0 = gen_lowpart (vmode, in0);
+    }
+  rtx src = gen_rtx_UNSPEC (vmode, gen_rtvec (1, in0), unspec);
+  emit_set_insn (out, src);
   return true;
 }
 
@@ -25904,7 +26048,7 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
   /* Generic code will try constant permutation twice.  Once with the
      original mode and again with the elements lowered to QImode.
      So wait and don't do the selector expansion ourselves.  */
-  if (vmode != V8QImode && vmode != V16QImode)
+  if (vmode != V8QImode && vmode != V16QImode && vmode != V4QImode)
     return false;
 
   /* to_constant is safe since this routine is specific to Advanced SIMD
@@ -25943,10 +26087,11 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
 	}
     }
 
+  rtx target = d->target;
   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
   sel = force_reg (vmode, sel);
 
-  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  aarch64_expand_vec_perm_1 (target, d->op0, d->op1, sel);
   return true;
 }
 
@@ -26109,9 +26254,21 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
     }
   gcc_assert (extractindex < nelt);
 
+  rtx target = d->target;
+
+  /* For emulated 4-byte vectors, just use a paradoxical subreg
+     of the 8-byte vector.  */
+  if (d->vmode == V2HImode || d->vmode == V4QImode)
+    {
+      mode = mode == V2HImode ? V4HImode : V8QImode;
+      target = gen_lowpart (mode, target);
+      insv = gen_lowpart (mode, insv);
+      extractv = gen_lowpart (mode, extractv);
+    }
+
   insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
   expand_operand ops[5];
-  create_output_operand (&ops[0], d->target, mode);
+  create_output_operand (&ops[0], target, mode);
   create_input_operand (&ops[1], insv, mode);
   create_integer_operand (&ops[2], 1 << idx);
   create_input_operand (&ops[3], extractv, mode);
@@ -26186,7 +26343,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  rtx target, rtx op0, rtx op1,
 				  const vec_perm_indices &sel)
 {
-  struct expand_vec_perm_d d;
+  expand_vec_perm_d d;
+  bool truncate = false;
 
   /* Check whether the mask can be applied to a single vector.  */
   if (sel.ninputs () == 1
@@ -26204,6 +26362,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
     }
   else
     d.one_vector_p = false;
+  rtx_insn *last = get_last_insn ();
 
   d.zero_op0_p = op0 == CONST0_RTX (op_mode);
   d.zero_op1_p = op1 == CONST0_RTX (op_mode);
@@ -26221,12 +26380,14 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
     d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
   d.testing_p = !target;
 
-  if (!d.testing_p)
-    return aarch64_expand_vec_perm_const_1 (&d);
-
-  rtx_insn *last = get_last_insn ();
   bool ret = aarch64_expand_vec_perm_const_1 (&d);
-  gcc_assert (last == get_last_insn ());
+  if (truncate && target)
+    {
+      gcc_assert (target != d.target);
+      emit_insn (gen_truncv4hiv4qi2 (target, gen_lowpart (V4HImode, d.target)));
+    }
+  if (d.testing_p)
+    gcc_assert (last == get_last_insn ());
 
   return ret;
 }
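
As an illustrative sketch (not taken from the testsuite; the type and
function names are made up), a GNU C generic-vector shuffle like the one
below is the kind of 4-byte permutation the EXT path above is meant to
handle on little-endian:

  typedef signed char v4qi __attribute__ ((vector_size (4)));

  /* Selects { a[1], a[2], a[3], b[0] }, i.e. an EXT with byte offset 1.  */
  v4qi
  take_ext (v4qi a, v4qi b)
  {
    return __builtin_shuffle (a, b, (v4qi) { 1, 2, 3, 4 });
  }
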
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 6356c419399..e3e8fc632e2 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -69,6 +69,10 @@ Enum(cmodel) String(small) Value(AARCH64_CMODEL_SMALL)
 EnumValue
 Enum(cmodel) String(large) Value(AARCH64_CMODEL_LARGE)
 
+mautovector32bits
+Target Optimization Var(aarch64_32bit_auto_vec) Init(1)
+Emulate 32-bit vectors (four 1-byte elements or two 2-byte elements) using the 64-bit SIMD registers.
+
 mbig-endian
 Target RejectNegative Mask(BIG_END)
 Assume target CPU is configured as big endian.
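
As a rough sketch of what this option is meant to enable (illustrative
only, not a testcase from this patch; the function name is made up), an
SLP group of four byte operations such as the one below should become a
single V4QI addition held in the low half of a D register, e.g. at -O3
with -mautovector32bits:

  void
  add4 (signed char *restrict a, signed char *restrict b,
        signed char *restrict c)
  {
    a[0] = b[0] + c[0];
    a[1] = b[1] + c[1];
    a[2] = b[2] + c[2];
    a[3] = b[3] + c[3];
  }
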
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f527b2cfeb8..c36be65225a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -86,6 +86,9 @@ (define_mode_iterator GPF_TF [SF DF TF SD DD TD])
 ;; Integer Advanced SIMD modes.
 (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
 
+;; Integer Advanced SIMD modes, plus the emulated 32-bit modes.
+(define_mode_iterator VDQS_I [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
+
 ;; Advanced SIMD and scalar, 64 & 128-bit container, all integer modes.
 (define_mode_iterator VSDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI QI HI SI DI])
 
@@ -93,12 +96,18 @@ (define_mode_iterator VSDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI QI HI SI DI])
 ;; integer modes; 64-bit scalar integer mode.
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
 
+;; Same as above, plus the emulated 32-bit modes.
+(define_mode_iterator VSDQS_I_DI [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
+
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
 
 ;; Double vector modes suitable for moving.  Includes BFmode.
 (define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
 
+;; Double and single (32-bit) vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDHMOV [V4QI V2HI V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; 64-bit modes for operations that implicitly clear the top bits of a Q reg.
 (define_mode_iterator VDZ [V8QI V4HI V4HF V4BF V2SI V2SF DI DF])
 
@@ -117,6 +126,9 @@ (define_mode_iterator VD_BHSI [V8QI V4HI V2SI])
 ;; 128 and 64-bit container; 8, 16, 32-bit vector integer modes
 (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
 
+;; 128, 64, and 32-bit container; 8, 16, 32-bit vector integer modes
+(define_mode_iterator VDQS_BHSI [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI])
+
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
 
@@ -132,6 +144,9 @@ (define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
 ;; Double integer vector modes.
 (define_mode_iterator VD_I [V8QI V4HI V2SI DI])
 
+;; Double vector integer modes with more than two elements.
+(define_mode_iterator VD_NO2E [V8QI V4HI])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
@@ -144,6 +159,9 @@ (define_mode_iterator VQ_2E [V2DI V2DF])
 ;; BFmode vector modes.
 (define_mode_iterator VBF [V4BF V8BF])
 
+;; 32-bit (emulated) vector integer modes.
+(define_mode_iterator VH_I [V4QI V2HI])
+
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; addresses in different modes.  In LP64, only DI will match, while in
 ;; ILP32, either can match.
@@ -200,13 +218,21 @@ (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; Includes the emulated 32-bit vectors V4QI and V2HI.
+(define_mode_iterator VALLS_F16 [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				 V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
-				V4HF V8HF V2SF V4SF])
+				       V4HF V8HF V2SF V4SF])
 
 ;; All Advanced SIMD modes barring HF modes, plus DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
 
+;; Same as VALLDI, but including the emulated 32-bit vectors V4QI and V2HI.
+(define_mode_iterator VALLSDI [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF DI])
+
 ;; All Advanced SIMD modes and DI.
 (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				  V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
@@ -215,6 +241,10 @@ (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
 			       V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
 
+;; All Advanced SIMD modes (including the emulated 32-bit vectors), plus DI and DF.
+(define_mode_iterator VALLSDIF [V4QI V2HI V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
+			       V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
+
 ;; All Advanced SIMD polynomial modes and DI.
 (define_mode_iterator VALLP [V8QI V16QI V4HI V8HI V2DI DI])
 
@@ -250,7 +280,7 @@ (define_mode_iterator VQW [V16QI V8HI V4SI])
 (define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF])
 
 ;; VDC plus SI and SF.
-(define_mode_iterator VDCSIF [V8QI V4HI V4BF V4HF V2SI V2SF SI SF DI DF])
+(define_mode_iterator VDCSIF [V4QI V2HI V8QI V4HI V4BF V4HF V2SI V2SF SI SF DI DF])
 
 ;; Polynomial modes for vector combines.
 (define_mode_iterator VDC_P [V8QI V4HI DI])
@@ -311,6 +341,9 @@ (define_mode_iterator VQ_HSI [V8HI V4SI])
 ;; All byte modes.
 (define_mode_iterator VB [V8QI V16QI])
 
+;; All byte modes, including the emulated 32-bit mode V4QI.
+(define_mode_iterator VB_WS [V4QI V8QI V16QI])
+
 ;; 1 and 2 lane DI and DF modes.
 (define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF])
 
@@ -1179,8 +1212,8 @@ (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
 ;; For scalar usage of vector/FP registers
 (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
 		    (HF  "h") (SF "s") (DF "d")
-		    (V8QI "") (V16QI "")
-		    (V4HI "") (V8HI "")
+		    (V4QI "") (V8QI "") (V16QI "")
+		    (V2HI "") (V4HI "") (V8HI "")
 		    (V2SI "") (V4SI  "")
 		    (V2DI "") (V2SF "")
 		    (V4SF "") (V4HF "")
@@ -1210,8 +1243,10 @@ (define_mode_attr vas [(DI "") (SI ".2s")])
 
 ;; Map a vector to the number of units in it, if the size of the mode
 ;; is constant.
-(define_mode_attr nunits [(V8QI "8") (V16QI "16")
-			  (V4HI "4") (V8HI "8")
+;; Note that V4QI/V2HI map to 8/4 rather than 4/2, since this counts the
+;; units in the underlying 64-bit container used to emulate them.
+(define_mode_attr nunits [(V4QI "8") (V8QI "8") (V16QI "16")
+			  (V2HI "4") (V4HI "4") (V8HI "8")
 			  (V2SI "2") (V4SI "4")
 			  (V1DI "1") (V2DI "2")
 			  (V4HF "4") (V8HF "8")
@@ -1222,9 +1257,10 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 			  (V8DI "8")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
-;; is constant.
-(define_mode_attr bitsize [(V8QI "64") (V16QI "128")
-			   (V4HI "64") (V8HI "128")
+;; is constant.  For V4QI and V2HI this is the size of the 64-bit
+;; container used to emulate the 32-bit modes.
+(define_mode_attr bitsize [(V4QI "64") (V8QI "64") (V16QI "128")
+			   (V2HI "64") (V4HI "64") (V8HI "128")
 			   (V2SI "64") (V4SI "128")
 				       (V2DI "128")])
 
@@ -1279,8 +1315,8 @@ (define_mode_attr cmode [(QI "q") (HI "h") (SI "s") (DI "d")])
 ;; Map modes to Usg and Usj constraints for SISD right shifts
 (define_mode_attr cmode_simd [(SI "g") (DI "j")])
 
-(define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
-			 (V4HI "4h") (V8HI  "8h")
+(define_mode_attr Vtype [(V4QI "8b") (V8QI "8b") (V16QI "16b")
+			 (V2HI "4h") (V4HI "4h") (V8HI  "8h")
 			 (V4BF "4h") (V8BF  "8h")
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
@@ -1322,8 +1358,8 @@ (define_mode_attr Qlane [(V4HI "_v4hi") (V8HI  "q_v4hi")
 (define_mode_attr Vrevsuff [(V4HI "16") (V8HI "16") (V2SI "32")
                             (V4SI "32") (V2DI "64")])
 
-(define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
-			 (V4HI ".4h") (V8HI  ".8h")
+(define_mode_attr Vmtype [(V4QI ".8b") (V8QI ".8b") (V16QI ".16b")
+			 (V2HI ".4h") (V4HI ".4h") (V8HI  ".8h")
 			 (V2SI ".2s") (V4SI  ".4s")
 			 (V2DI ".2d") (V4HF ".4h")
 			 (V8HF ".8h") (V4BF ".4h")
@@ -1341,8 +1377,8 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
 			   (HI   "")])
 
 ;; Mode-to-individual element type mapping.
-(define_mode_attr Vetype [(V8QI "b") (V16QI "b")
-			  (V4HI "h") (V8HI  "h")
+(define_mode_attr Vetype [(V4QI "b") (V8QI "b") (V16QI "b")
+			  (V2HI "h") (V4HI "h") (V8HI  "h")
 			  (V2SI "s") (V4SI  "s")
 			  (V2DI "d") (V1DI  "d")
 			  (V4HF "h") (V8HF  "h")
@@ -1451,8 +1487,8 @@ (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
 			 (SI "s") (DI "d")])
 
 ;; Mode-to-bitwise operation type mapping.
-(define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
-			  (V4HI "8b") (V8HI  "16b")
+(define_mode_attr Vbtype [(V4QI "8b")  (V8QI "8b")  (V16QI "16b")
+			  (V2HI "8b")  (V4HI "8b") (V8HI  "16b")
 			  (V2SI "8b") (V4SI  "16b")
 			  (V2DI "16b") (V4HF "8b")
 			  (V8HF "16b") (V2SF  "8b")
@@ -1515,8 +1551,8 @@ (define_mode_attr vstruct_elt [(V2x8QI "v8qi") (V2x4HI "v4hi")
 			       (V4x2DF "v2df") (V4x8BF "v8bf")])
 
 ;; Define element mode for each vector mode.
-(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
-		       (V4HI "HI") (V8HI  "HI")
+(define_mode_attr VEL [(V4QI "QI") (V8QI  "QI") (V16QI "QI")
+		       (V2HI "HI") (V4HI "HI") (V8HI  "HI")
 		       (V2SI "SI") (V4SI  "SI")
 		       (DI   "DI") (V1DI  "DI")
 		       (V2DI "DI")
@@ -1537,8 +1573,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
 		       (VNx2DF "DF")])
 
 ;; Define element mode for each vector mode (lower case).
-(define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
-		       (V4HI "hi") (V8HI "hi")
+(define_mode_attr Vel [(V4QI "qi") (V8QI "qi") (V16QI "qi")
+		       (V2HI "hi") (V4HI "hi") (V8HI "hi")
 		       (V2SI "si") (V4SI "si")
 		       (DI   "di") (V1DI "si")
 		       (V2DI "di")
@@ -1621,7 +1657,8 @@ (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
 (define_mode_attr V1half [(V2DI "v1di")  (V2DF  "v1df")])
 
 ;; Double modes of vector modes.
-(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
+(define_mode_attr VDBL [(V4QI "V8QI")  (V2HI "V4HI")
+			(V8QI "V16QI") (V4HI "V8HI")
 			(V4HF "V8HF")  (V4BF "V8BF")
 			(V2SI "V4SI")  (V2SF "V4SF")
 			(SI   "V2SI")  (SF   "V2SF")
@@ -1634,7 +1671,8 @@ (define_mode_attr VPAIR [(SI "V2x4QI") (DI "V2x8QI")])
 (define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")])
 
 ;; Double modes of vector modes (lower case).
-(define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
+(define_mode_attr Vdbl [(V4QI "v8qi")  (V2HI "v4hi")
+			(V8QI "v16qi") (V4HI "v8hi")
 			(V4HF "v8hf")  (V4BF "v8bf")
 			(V2SI "v4si")  (V2SF "v4sf")
 			(SI   "v2si")  (DI   "v2di")
@@ -1684,7 +1722,8 @@ (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
 			   (V2DI "4s")])
 
 ;; Widened modes of vector modes.
-(define_mode_attr VWIDE [(V8QI  "V8HI")  (V4HI  "V4SI")
+(define_mode_attr VWIDE [(V4QI "V4HI")   (V2HI "V2SI")
+			 (V8QI  "V8HI")  (V4HI  "V4SI")
 			 (V2SI  "V2DI")  (V16QI "V8HI")
 			 (V8HI  "V4SI")  (V4SI  "V2DI")
 			 (HI    "SI")    (SI    "DI")
@@ -1697,7 +1736,8 @@ (define_mode_attr VWIDE [(V8QI  "V8HI")  (V4HI  "V4SI")
 			 (VNx4BI  "VNx2BI")])
 
 ;; Modes with the same number of elements but strictly 2x the width.
-(define_mode_attr V2XWIDE [(V8QI "V8HI") (V4HI "V4SI")
+(define_mode_attr V2XWIDE [(V4QI "V4HI") (V2HI "V2SI")
+			   (V8QI "V8HI") (V4HI "V4SI")
 			   (V16QI "V16HI") (V8HI "V8SI")
 			   (V2SI "V2DI") (V4SI "V4DI")
 			   (V2DI "V2TI") (DI "TI")
@@ -1725,7 +1765,8 @@ (define_mode_attr v2xwide [(V8QI "v8hi") (V4HI "v4si")
 (define_mode_attr VWIDE_PRED [(VNx8HF "VNx4BI") (VNx4SF "VNx2BI")])
 
 ;; Widened modes of vector modes, lowercase
-(define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")
+(define_mode_attr Vwide [(V4QI "v4hi") (V2HI "v2si")
+			 (V2SF "v2df") (V4HF "v4sf")
 			 (VNx16QI "vnx8hi") (VNx8HI "vnx4si")
 			 (VNx4SI  "vnx2di")
 			 (VNx8HF  "vnx4sf") (VNx4SF "vnx2df")
@@ -1748,7 +1789,8 @@ (define_mode_attr Vwsuf [(V8QI "") (V4HI "")
 			  (V8HI "") (V4SI "")])
 
 ;; Scalar mode of widened vector reduction.
-(define_mode_attr VWIDE_S [(V8QI "HI") (V4HI "SI")
+(define_mode_attr VWIDE_S [(V4QI "HI") (V2HI "SI")
+			  (V8QI "HI") (V4HI "SI")
 			  (V2SI "DI") (V16QI "HI")
 			  (V8HI "SI") (V4SI "DI")])
 
@@ -1774,7 +1816,8 @@ (define_mode_attr Vewtype [(VNx16QI "h")
 			   (VNx2DI  "q")])
 
 ;; Widened mode register suffixes for VDW/VQW.
-(define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
+(define_mode_attr Vmwtype [(V4QI ".8h") (V2HI ".4s")
+			   (V8QI ".8h") (V4HI ".4s")
 			   (V2SI ".2d") (V16QI ".8h")
 			   (V8HI ".4s") (V4SI ".2d")
 			   (V4HF ".4s") (V2SF ".2d")
@@ -1788,6 +1831,7 @@ (define_mode_attr Vhalftype [(V16QI "8b") (V8HI "4h")
 ;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes
 ;; and "x" for 64-bit modes).
 (define_mode_attr single_wx [(SI   "w") (SF   "w")
+			     (V4QI "w") (V2HI "w")
 			     (V8QI "x") (V4HI "x")
 			     (V4HF "x") (V4BF "x")
 			     (V2SI "x") (V2SF "x")
@@ -1796,6 +1840,7 @@ (define_mode_attr single_wx [(SI   "w") (SF   "w")
 ;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes
 ;; and "d" for 64-bit modes).
 (define_mode_attr single_type [(SI   "s") (SF   "s")
+			       (V4QI "s") (V2HI "s")
 			       (V8QI "d") (V4HI "d")
 			       (V4HF "d") (V4BF "d")
 			       (V2SI "d") (V2SF "d")
@@ -1804,6 +1849,7 @@ (define_mode_attr single_type [(SI   "s") (SF   "s")
 ;; Whether a double-width mode fits in D or Q registers (i.e. "d" for
 ;; 32-bit modes and "q" for 64-bit modes).
 (define_mode_attr single_dtype [(SI   "d") (SF   "d")
+			        (V4QI "q") (V2HI "q")
 			        (V8QI "q") (V4HI "q")
 			        (V4HF "q") (V4BF "q")
 			        (V2SI "q") (V2SF "q")
@@ -1819,8 +1865,8 @@ (define_mode_attr vw [(V8QI "w") (V16QI "w")
 
 ;; Corresponding core element mode for each vector mode.  This is a
 ;; variation on <vw> mapping FP modes to GP regs.
-(define_mode_attr vwcore [(V8QI "w") (V16QI "w")
-			  (V4HI "w") (V8HI "w")
+(define_mode_attr vwcore [(V4QI "w") (V8QI "w") (V16QI "w")
+			  (V2HI "w") (V4HI "w") (V8HI "w")
 			  (V2SI "w") (V4SI "w")
 			  (DI   "x") (V2DI "x")
 			  (V4HF "w") (V8HF "w")
@@ -1846,8 +1892,8 @@ (define_mode_attr vccore [(VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "x")
 (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")])
 
 ;; Mode with floating-point values replaced by like-sized integers.
-(define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI "V16QI")
-			       (V4HI "V4HI") (V8HI  "V8HI")
+(define_mode_attr V_INT_EQUIV [(V4QI "V4QI") (V8QI "V8QI") (V16QI "V16QI")
+			       (V2HI "V2HI") (V4HI "V4HI") (V8HI  "V8HI")
 			       (V2SI "V2SI") (V4SI  "V4SI")
 			       (DI   "DI")   (V2DI  "V2DI")
 			       (V4HF "V4HI") (V8HF  "V8HI")
@@ -1865,8 +1911,8 @@ (define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI "V16QI")
 ])
 
 ;; Lower case mode with floating-point values replaced by like-sized integers.
-(define_mode_attr v_int_equiv [(V8QI "v8qi") (V16QI "v16qi")
-			       (V4HI "v4hi") (V8HI  "v8hi")
+(define_mode_attr v_int_equiv [(V4QI "v4qi") (V8QI "v8qi") (V16QI "v16qi")
+			       (V2HI "v2hi") (V4HI "v4hi") (V8HI  "v8hi")
 			       (V2SI "v2si") (V4SI  "v4si")
 			       (DI   "di")   (V2DI  "v2di")
 			       (V4HF "v4hi") (V8HF  "v8hi")
@@ -1932,8 +1978,8 @@ (define_mode_attr v_cmp_mixed [(V2SI "v2sf") (V4SI "v4sf")
 			       (V4SF "v4si") (V2DF "v2di")])
 
 ;; Lower case element modes (as used in shift immediate patterns).
-(define_mode_attr ve_mode [(V8QI "qi") (V16QI "qi")
-			   (V4HI "hi") (V8HI  "hi")
+(define_mode_attr ve_mode [(V4QI "qi") (V8QI "qi") (V16QI "qi")
+			   (V2HI "hi") (V4HI "hi") (V8HI  "hi")
 			   (V2SI "si") (V4SI  "si")
 			   (DI   "di") (V2DI  "di")
 			   (QI   "qi") (HI    "hi")
@@ -2068,8 +2114,8 @@ (define_mode_attr fp [(V8QI "")  (V16QI "")
 		      (SF "_fp")])
 
 ;; Defined to '_q' for 128-bit types.
-(define_mode_attr q [(V8QI "") (V16QI "_q")
-		     (V4HI "") (V8HI  "_q")
+(define_mode_attr q [(V4QI "") (V8QI "") (V16QI "_q")
+		     (V2HI "") (V4HI "") (V8HI  "_q")
 		     (V4BF "") (V8BF  "_q")
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
@@ -2105,6 +2151,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
 
 ;; Equivalent of the "q" attribute for the <VDBL> mode.
 (define_mode_attr dblq [(SI   "") (SF   "")
+		        (V4QI "") (V2HI "")
 		        (V8QI "_q") (V4HI "_q")
 		        (V4HF "_q") (V4BF "_q")
 		        (V2SI "_q") (V2SF "_q")
@@ -3743,6 +3790,12 @@ (define_int_attr  maxmin_uns_op [(UNSPEC_UMAXV "umax")
 				 (UNSPEC_FMAXNM "fmaxnm")
 				 (UNSPEC_FMINNM "fminnm")])
 
+;; Map the across-vector reduction UNSPECs to the equivalent RTL code.
+(define_int_attr CODE [(UNSPEC_UMAXV "UMAX")
+		       (UNSPEC_UMINV "UMIN")
+		       (UNSPEC_SMAXV "SMAX")
+		       (UNSPEC_SMINV "SMIN")])
+
 (define_code_attr binqops_op [(ss_plus "sqadd")
 			      (us_plus "uqadd")
 			      (ss_minus "sqsub")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c
index 1290772216e..13092b6a3f1 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c
@@ -14,6 +14,8 @@ f (int16_t *x, int16_t *y, int8_t *z, int n)
     }
 }
 
+/* The main loop uses .8h (Q) for the 16-bit and .8b (D) for the 8-bit additions;
+   the peeled epilogue uses the emulated 32-bit vectors V2HI (.4h) and V4QI (.8b).  */
 /* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
-/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 1 } } */
-/* { dg-final { scan-assembler-not {\tadd\tv[0-9]+\.4h,} } } */
+/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4h,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c
index 6c09b5b146b..defb564650e 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c
@@ -14,5 +14,6 @@ f (int32_t *x, int32_t *y, int16_t *z, int n)
     }
 }
 
-/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */
+/* The second sxtl comes from the peeled epilogue, which does the V2HI->V2SI conversion.  */
+/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c
index 94a66c545ef..e885f2973dc 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c
@@ -14,5 +14,6 @@ f (int16_t *x, int16_t *y, int8_t *z, int n)
     }
 }
 
-/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */
+/* The second sxtl comes from the peeled epilogue, which does the V4QI->V4HI conversion.  */
+/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index d3edc7d839e..5fa4b46811f 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9205,7 +9205,7 @@ proc available_vector_sizes { } {
 	if { [check_effective_target_aarch64_sve] } {
 	    lappend result [aarch64_sve_bits]
 	}
-	lappend result 128 64
+	lappend result 128 64 32
     } elseif { [istarget arm*-*-*]
 		&& [check_effective_target_arm_neon_ok] } {
 	lappend result 128 64
-- 
2.43.0

