Hi Srinath,

> -----Original Message-----
> From: Srinath Parvathaneni <srinath.parvathan...@arm.com>
> Sent: 02 June 2020 15:00
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: [GCC][PATCH][ARM]: Correct the grouping of operands in MVE
> vector scatter store intrinsics (PR94735).
> 
> Hello,
> 
> The operands in the RTL patterns of the MVE vector scatter store intrinsics are
> grouped incorrectly, causing some vector load and store instructions to be
> wrongly optimized out at -O2.
> 
> This patch defines a new predicate, "mve_scatter_memory", which returns TRUE
> when it matches (mem (reg)) for the MVE scatter store intrinsics.
> The issue is fixed by changing each pattern to a define_expand that uses the
> "mve_scatter_memory" predicate and calls the corresponding define_insn,
> passing as its first argument the register_operand extracted from the operand
> matched by "mve_scatter_memory" in the define_expand pattern.
> 
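> As an illustration (a minimal sketch modelled on the new tests below; the
> function and variable names are only for the example), this is the kind of
> sequence that was affected, where some of the scatter stores could be wrongly
> optimized out at -O2:
> 
> #include "arm_mve.h"
> 
> void
> scatter_copy (int32_t *dst, int32x4_t val)
> {
>   const uint32x4_t offs1 = { 0, 3, 6, 1 };
>   const uint32x4_t offs2 = { 4, 7, 2, 5 };
>   /* Two scatter stores to the same base; neither may be removed.  */
>   vstrwq_scatter_offset_s32 (dst, offs1, val);
>   vstrwq_scatter_offset_s32 (dst, offs2, val);
> }
> 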
> Please refer to the M-profile Vector Extension (MVE) intrinsics [1] for more
> details.
> [1] https://developer.arm.com/architectures/instruction-sets/simd-isas/helium/mve-intrinsics
> 
> Regression tested on arm-none-eabi and found no regressions.
> 
> Ok for trunk?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Srinath.
> 
> gcc/ChangeLog:
> 
> 2020-06-02    Srinath Parvathaneni    <srinath.parvathan...@arm.com>
> 
>       PR target/94735
>       * config/arm/predicates.md (mve_scatter_memory): Define to
>       match (mem (reg)) for scatter store memory.
>       * config/arm/mve.md (mve_vstrbq_scatter_offset_<supf><mode>): Modify
>       define_insn to define_expand.
>       (mve_vstrbq_scatter_offset_p_<supf><mode>): Likewise.
>       (mve_vstrhq_scatter_offset_<supf><mode>): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_p_<supf><mode>): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_<supf><mode>): Likewise.
>       (mve_vstrdq_scatter_offset_p_<supf>v2di): Likewise.
>       (mve_vstrdq_scatter_offset_<supf>v2di): Likewise.
>       (mve_vstrdq_scatter_shifted_offset_p_<supf>v2di): Likewise.
>       (mve_vstrdq_scatter_shifted_offset_<supf>v2di): Likewise.
>       (mve_vstrhq_scatter_offset_fv8hf): Likewise.
>       (mve_vstrhq_scatter_offset_p_fv8hf): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_fv8hf): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_p_fv8hf): Likewise.
>       (mve_vstrwq_scatter_offset_fv4sf): Likewise.
>       (mve_vstrwq_scatter_offset_p_fv4sf): Likewise.
>       (mve_vstrwq_scatter_offset_p_<supf>v4si): Likewise.
>       (mve_vstrwq_scatter_offset_<supf>v4si): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_fv4sf): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_p_fv4sf): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_p_<supf>v4si): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_<supf>v4si): Likewise.
>       (mve_vstrbq_scatter_offset_<supf><mode>_insn): Define insn for
>       scatter stores.
>       (mve_vstrbq_scatter_offset_p_<supf><mode>_insn): Likewise.
>       (mve_vstrhq_scatter_offset_<supf><mode>_insn): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn): Likewise.
>       (mve_vstrdq_scatter_offset_p_<supf>v2di_insn): Likewise.
>       (mve_vstrdq_scatter_offset_<supf>v2di_insn): Likewise.
>       (mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn): Likewise.
>       (mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn): Likewise.
>       (mve_vstrhq_scatter_offset_fv8hf_insn): Likewise.
>       (mve_vstrhq_scatter_offset_p_fv8hf_insn): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_fv8hf_insn): Likewise.
>       (mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn): Likewise.
>       (mve_vstrwq_scatter_offset_fv4sf_insn): Likewise.
>       (mve_vstrwq_scatter_offset_p_fv4sf_insn): Likewise.
>       (mve_vstrwq_scatter_offset_p_<supf>v4si_insn): Likewise.
>       (mve_vstrwq_scatter_offset_<supf>v4si_insn): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_fv4sf_insn): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn): Likewise.
>       (mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 2020-06-02    Srinath Parvathaneni    <srinath.parvathan...@arm.com>
> 
>       PR target/94735
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c: New test.
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c: Likewise.
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c: Likewise.
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c: Likewise.
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c:
>       Likewise.
>       * gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c:
>       Likewise.
> 
> 
> ###############     Attachment also inlined for ease of reply
> ###############
> 
> 
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index
> 986fbfe2abae5f1e91e65f1ff5c84709c43c4617..3a57901bd5bcd770832d59dc7
> 7cd92b6d9b5ecb4 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -8102,22 +8102,29 @@
>  ;;
>  ;; [vstrbq_scatter_offset_s vstrbq_scatter_offset_u]
>  ;;
> -(define_insn "mve_vstrbq_scatter_offset_<supf><mode>"
> -  [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
> -     (unspec:<MVE_B_ELEM>
> -             [(match_operand:MVE_2 1 "s_register_operand" "w")
> -              (match_operand:MVE_2 2 "s_register_operand" "w")]
> -      VSTRBSOQ))
> -  ]
> +(define_expand "mve_vstrbq_scatter_offset_<supf><mode>"
> +  [(match_operand:<MVE_B_ELEM> 0 "mve_scatter_memory")
> +   (match_operand:MVE_2 1 "s_register_operand")
> +   (match_operand:MVE_2 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn("vstrb.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrbq_scatter_offset_<supf><mode>_insn (ind,
> operands[1],
> +                                                           operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrbq_scatter_offset_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_2 1 "s_register_operand" "w")
> +        (match_operand:MVE_2 2 "s_register_operand" "w")]
> +       VSTRBSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrb.<V_sz_elem>\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
> @@ -8210,23 +8217,33 @@
>  ;;
>  ;; [vstrbq_scatter_offset_p_s vstrbq_scatter_offset_p_u]
>  ;;
> -(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>"
> -  [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
> -     (unspec:<MVE_B_ELEM>
> -             [(match_operand:MVE_2 1 "s_register_operand" "w")
> -              (match_operand:MVE_2 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRBSOQ))
> -  ]
> +(define_expand "mve_vstrbq_scatter_offset_p_<supf><mode>"
> +  [(match_operand:<MVE_B_ELEM>  0 "mve_scatter_memory")
> +   (match_operand:MVE_2 1 "s_register_operand")
> +   (match_operand:MVE_2 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand" "Up")
> +   (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrbt.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrbq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
> +                                                    operands[2],
> +                                                    operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_2 1 "s_register_operand" "w")
> +        (match_operand:MVE_2 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRBSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrbt.<V_sz_elem>\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
> @@ -9097,87 +9114,122 @@
>  ;;
>  ;; [vstrhq_scatter_offset_p_s vstrhq_scatter_offset_p_u]
>  ;;
> -(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>"
> -  [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
> -     (unspec:<MVE_H_ELEM>
> -             [(match_operand:MVE_6 1 "s_register_operand" "w")
> -              (match_operand:MVE_6 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRHSOQ))
> -  ]
> +(define_expand "mve_vstrhq_scatter_offset_p_<supf><mode>"
> +  [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
> +   (match_operand:MVE_6 1 "s_register_operand")
> +   (match_operand:MVE_6 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrhq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
> +                                                    operands[2],
> +                                                    operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_6 1 "s_register_operand" "w")
> +        (match_operand:MVE_6 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRHSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrhq_scatter_offset_s vstrhq_scatter_offset_u]
>  ;;
> -(define_insn "mve_vstrhq_scatter_offset_<supf><mode>"
> -  [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
> -     (unspec:<MVE_H_ELEM>
> -             [(match_operand:MVE_6 1 "s_register_operand" "w")
> -              (match_operand:MVE_6 2 "s_register_operand" "w")]
> -      VSTRHSOQ))
> -  ]
> +(define_expand "mve_vstrhq_scatter_offset_<supf><mode>"
> +  [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
> +   (match_operand:MVE_6 1 "s_register_operand")
> +   (match_operand:MVE_6 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrhq_scatter_offset_<supf><mode>_insn (ind,
> operands[1],
> +                                                           operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_offset_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_6 1 "s_register_operand" "w")
> +        (match_operand:MVE_6 2 "s_register_operand" "w")]
> +       VSTRHSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrh.<V_sz_elem>\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrhq_scatter_shifted_offset_p_s vstrhq_scatter_shifted_offset_p_u]
>  ;;
> -(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
> -  [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Ux")
> -     (unspec:<MVE_H_ELEM>
> -             [(match_operand:MVE_6 1 "s_register_operand" "w")
> -              (match_operand:MVE_6 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRHSSOQ))
> -  ]
> +(define_expand "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
> +  [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
> +   (match_operand:MVE_6 1 "s_register_operand")
> +   (match_operand:MVE_6 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1, uxtw
> #1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn (ind,
> operands[1],
> +                                                            operands[2],
> +                                                            operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_6 1 "s_register_operand" "w")
> +        (match_operand:MVE_6 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRHSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrhq_scatter_shifted_offset_s vstrhq_scatter_shifted_offset_u]
>  ;;
> -(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
> -  [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
> -     (unspec:<MVE_H_ELEM>
> -             [(match_operand:MVE_6 1 "s_register_operand" "w")
> -              (match_operand:MVE_6 2 "s_register_operand" "w")]
> -      VSTRHSSOQ))
> -  ]
> +(define_expand "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
> +  [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
> +   (match_operand:MVE_6 1 "s_register_operand")
> +   (match_operand:MVE_6 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1, uxtw #1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn (ind,
> operands[1],
> +                                                          operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:MVE_6 1 "s_register_operand" "w")
> +        (match_operand:MVE_6 2 "s_register_operand" "w")]
> +       VSTRHSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrh.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
>    [(set_attr "length" "4")])
> 
>  ;;
> @@ -9345,173 +9397,240 @@
>  ;;
>  ;; [vstrdq_scatter_offset_p_s vstrdq_scatter_offset_p_u]
>  ;;
> -(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di"
> -  [(set (match_operand:V2DI 0 "memory_operand" "=Us")
> -     (unspec:V2DI
> -             [(match_operand:V2DI 1 "s_register_operand" "w")
> -              (match_operand:V2DI 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRDSOQ))
> -  ]
> +(define_expand "mve_vstrdq_scatter_offset_p_<supf>v2di"
> +  [(match_operand:V2DI 0 "mve_scatter_memory")
> +   (match_operand:V2DI 1 "s_register_operand")
> +   (match_operand:V2DI 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrdq_scatter_offset_p_<supf>v2di_insn (ind,
> operands[1],
> +                                                           operands[2],
> +                                                           operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V2DI 1 "s_register_operand" "w")
> +        (match_operand:V2DI 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRDSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrdt.64\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrdq_scatter_offset_s vstrdq_scatter_offset_u]
>  ;;
> -(define_insn "mve_vstrdq_scatter_offset_<supf>v2di"
> -  [(set (match_operand:V2DI 0 "memory_operand" "=Us")
> -     (unspec:V2DI
> -             [(match_operand:V2DI 1 "s_register_operand" "w")
> -              (match_operand:V2DI 2 "s_register_operand" "w")]
> -      VSTRDSOQ))
> -  ]
> +(define_expand "mve_vstrdq_scatter_offset_<supf>v2di"
> +  [(match_operand:V2DI 0 "mve_scatter_memory")
> +   (match_operand:V2DI 1 "s_register_operand")
> +   (match_operand:V2DI 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrd.64\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrdq_scatter_offset_<supf>v2di_insn (ind,
> operands[1],
> +                                                         operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrdq_scatter_offset_<supf>v2di_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V2DI 1 "s_register_operand" "w")
> +        (match_operand:V2DI 2 "s_register_operand" "w")]
> +       VSTRDSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrd.64\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrdq_scatter_shifted_offset_p_s vstrdq_scatter_shifted_offset_p_u]
>  ;;
> -(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
> -  [(set (match_operand:V2DI 0 "memory_operand" "=Us")
> -     (unspec:V2DI
> -             [(match_operand:V2DI 1 "s_register_operand" "w")
> -              (match_operand:V2DI 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRDSSOQ))
> -  ]
> +(define_expand "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
> +  [(match_operand:V2DI 0 "mve_scatter_memory")
> +   (match_operand:V2DI 1 "s_register_operand")
> +   (match_operand:V2DI 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1, UXTW #3]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn (ind,
> operands[1],
> +                                                          operands[2],
> +                                                          operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V2DI 1 "s_register_operand" "w")
> +        (match_operand:V2DI 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRDSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrdt.64\t%q2, [%0, %q1, UXTW #3]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrdq_scatter_shifted_offset_s vstrdq_scatter_shifted_offset_u]
>  ;;
> -(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
> -  [(set (match_operand:V2DI 0 "memory_operand" "=Us")
> -     (unspec:V2DI
> -             [(match_operand:V2DI 1 "s_register_operand" "w")
> -              (match_operand:V2DI 2 "s_register_operand" "w")]
> -      VSTRDSSOQ))
> -  ]
> +(define_expand "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
> +  [(match_operand:V2DI 0 "mve_scatter_memory")
> +   (match_operand:V2DI 1 "s_register_operand")
> +   (match_operand:V2DI 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrd.64\t%q2, [%m0, %q1, UXTW #3]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn (ind,
> operands[1],
> +                                                        operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V2DI 1 "s_register_operand" "w")
> +        (match_operand:V2DI 2 "s_register_operand" "w")]
> +       VSTRDSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrd.64\t%q2, [%0, %q1, UXTW #3]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrhq_scatter_offset_f]
>  ;;
> -(define_insn "mve_vstrhq_scatter_offset_fv8hf"
> -  [(set (match_operand:V8HI 0 "memory_operand" "=Us")
> -     (unspec:V8HI
> -             [(match_operand:V8HI 1 "s_register_operand" "w")
> -              (match_operand:V8HF 2 "s_register_operand" "w")]
> -      VSTRHQSO_F))
> -  ]
> +(define_expand "mve_vstrhq_scatter_offset_fv8hf"
> +  [(match_operand:V8HI 0 "mve_scatter_memory")
> +   (match_operand:V8HI 1 "s_register_operand")
> +   (match_operand:V8HF 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrh.16\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrhq_scatter_offset_fv8hf_insn (ind, operands[1],
> +                                                    operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_offset_fv8hf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V8HI 1 "s_register_operand" "w")
> +        (match_operand:V8HF 2 "s_register_operand" "w")]
> +       VSTRHQSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vstrh.16\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrhq_scatter_offset_p_f]
>  ;;
> -(define_insn "mve_vstrhq_scatter_offset_p_fv8hf"
> -  [(set (match_operand:V8HI 0 "memory_operand" "=Us")
> -     (unspec:V8HI
> -             [(match_operand:V8HI 1 "s_register_operand" "w")
> -              (match_operand:V8HF 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRHQSO_F))
> -  ]
> +(define_expand "mve_vstrhq_scatter_offset_p_fv8hf"
> +  [(match_operand:V8HI 0 "mve_scatter_memory")
> +   (match_operand:V8HI 1 "s_register_operand")
> +   (match_operand:V8HF 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrhq_scatter_offset_p_fv8hf_insn (ind,
> operands[1],
> +                                                      operands[2],
> +                                                      operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_offset_p_fv8hf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V8HI 1 "s_register_operand" "w")
> +        (match_operand:V8HF 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRHQSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vpst\;vstrht.16\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrhq_scatter_shifted_offset_f]
>  ;;
> -(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf"
> -  [(set (match_operand:V8HI 0 "memory_operand" "=Us")
> -     (unspec:V8HI
> -             [(match_operand:V8HI 1 "s_register_operand" "w")
> -              (match_operand:V8HF 2 "s_register_operand" "w")]
> -      VSTRHQSSO_F))
> -  ]
> +(define_expand "mve_vstrhq_scatter_shifted_offset_fv8hf"
> +  [(match_operand:V8HI 0 "memory_operand" "=Us")
> +   (match_operand:V8HI 1 "s_register_operand" "w")
> +   (match_operand:V8HF 2 "s_register_operand" "w")
> +   (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrh.16\t%q2, [%m0, %q1, uxtw #1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrhq_scatter_shifted_offset_fv8hf_insn (ind,
> operands[1],
> +                                                            operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V8HI 1 "s_register_operand" "w")
> +        (match_operand:V8HF 2 "s_register_operand" "w")]
> +       VSTRHQSSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vstrh.16\t%q2, [%0, %q1, uxtw #1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrhq_scatter_shifted_offset_p_f]
>  ;;
> -(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
> -  [(set (match_operand:V8HI 0 "memory_operand" "=Us")
> -     (unspec:V8HI
> -             [(match_operand:V8HI 1 "s_register_operand" "w")
> -              (match_operand:V8HF 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRHQSSO_F))
> -  ]
> +(define_expand "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
> +  [(match_operand:V8HI 0 "memory_operand" "=Us")
> +   (match_operand:V8HI 1 "s_register_operand" "w")
> +   (match_operand:V8HF 2 "s_register_operand" "w")
> +   (match_operand:HI 3 "vpr_register_operand" "Up")
> +   (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1, uxtw #1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn (ind, operands[1],
> +                                                     operands[2],
> +                                                     operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V8HI 1 "s_register_operand" "w")
> +        (match_operand:V8HF 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRHQSSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vpst\;vstrht.16\t%q2, [%0, %q1, uxtw #1]"
>    [(set_attr "length" "8")])
> 
>  ;;
> @@ -9562,173 +9681,240 @@
>  ;;
>  ;; [vstrwq_scatter_offset_f]
>  ;;
> -(define_insn "mve_vstrwq_scatter_offset_fv4sf"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SF 2 "s_register_operand" "w")]
> -      VSTRWQSO_F))
> -  ]
> +(define_expand "mve_vstrwq_scatter_offset_fv4sf"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SF 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrwq_scatter_offset_fv4sf_insn (ind, operands[1],
> +                                                    operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_offset_fv4sf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SF 2 "s_register_operand" "w")]
> +       VSTRWQSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vstrw.32\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrwq_scatter_offset_p_f]
>  ;;
> -(define_insn "mve_vstrwq_scatter_offset_p_fv4sf"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SF 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRWQSO_F))
> -  ]
> +(define_expand "mve_vstrwq_scatter_offset_p_fv4sf"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SF 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrwq_scatter_offset_p_fv4sf_insn (ind,
> operands[1],
> +                                                      operands[2],
> +                                                      operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_offset_p_fv4sf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SF 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRWQSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vpst\;vstrwt.32\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
> -;; [vstrwq_scatter_offset_p_s vstrwq_scatter_offset_p_u]
> +;; [vstrwq_scatter_offset_s vstrwq_scatter_offset_u]
>  ;;
> -(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SI 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRWSOQ))
> -  ]
> +(define_expand "mve_vstrwq_scatter_offset_p_<supf>v4si"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SI 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrwq_scatter_offset_p_<supf>v4si_insn (ind,
> operands[1],
> +                                                           operands[2],
> +                                                           operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SI 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRWSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrwt.32\t%q2, [%0, %q1]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrwq_scatter_offset_s vstrwq_scatter_offset_u]
>  ;;
> -(define_insn "mve_vstrwq_scatter_offset_<supf>v4si"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SI 2 "s_register_operand" "w")]
> -      VSTRWSOQ))
> -  ]
> +(define_expand "mve_vstrwq_scatter_offset_<supf>v4si"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SI 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrwq_scatter_offset_<supf>v4si_insn (ind,
> operands[1],
> +                                                         operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_offset_<supf>v4si_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SI 2 "s_register_operand" "w")]
> +       VSTRWSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrw.32\t%q2, [%0, %q1]"
>    [(set_attr "length" "4")])
> 
>  ;;
>  ;; [vstrwq_scatter_shifted_offset_f]
>  ;;
> -(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SF 2 "s_register_operand" "w")]
> -      VSTRWQSSO_F))
> -  ]
> +(define_expand "mve_vstrwq_scatter_shifted_offset_fv4sf"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SF 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
> -   return "";
> -}
> -  [(set_attr "length" "4")])
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (gen_mve_vstrwq_scatter_shifted_offset_fv4sf_insn (ind,
> operands[1],
> +                                                            operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SF 2 "s_register_operand" "w")]
> +      VSTRWQSSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
> +  [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrwq_scatter_shifted_offset_p_f]
>  ;;
> -(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SF 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRWQSSO_F))
> -  ]
> +(define_expand "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SF 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
>    "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn (ind, operands[1],
> +                                                     operands[2],
> +                                                     operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SF 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRWQSSO_F))]
> +  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
> +  "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrwq_scatter_shifted_offset_p_s vstrwq_scatter_shifted_offset_p_u]
>  ;;
> -(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SI 2 "s_register_operand" "w")
> -              (match_operand:HI 3 "vpr_register_operand" "Up")]
> -      VSTRWSSOQ))
> -  ]
> +(define_expand "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SI 2 "s_register_operand")
> +   (match_operand:HI 3 "vpr_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn (ind,
> operands[1],
> +                                                          operands[2],
> +                                                          operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SI 2 "s_register_operand" "w")
> +        (match_operand:HI 3 "vpr_register_operand" "Up")]
> +       VSTRWSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
>    [(set_attr "length" "8")])
> 
>  ;;
>  ;; [vstrwq_scatter_shifted_offset_s vstrwq_scatter_shifted_offset_u]
>  ;;
> -(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
> -  [(set (match_operand:V4SI 0 "memory_operand" "=Us")
> -     (unspec:V4SI
> -             [(match_operand:V4SI 1 "s_register_operand" "w")
> -              (match_operand:V4SI 2 "s_register_operand" "w")]
> -      VSTRWSSOQ))
> -  ]
> +(define_expand "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
> +  [(match_operand:V4SI 0 "mve_scatter_memory")
> +   (match_operand:V4SI 1 "s_register_operand")
> +   (match_operand:V4SI 2 "s_register_operand")
> +   (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
>    "TARGET_HAVE_MVE"
>  {
> -   rtx ops[3];
> -   ops[0] = operands[0];
> -   ops[1] = operands[1];
> -   ops[2] = operands[2];
> -   output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
> -   return "";
> -}
> +  rtx ind = XEXP (operands[0], 0);
> +  gcc_assert (REG_P (ind));
> +  emit_insn (
> +    gen_mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn (ind,
> operands[1],
> +                                                        operands[2]));
> +  DONE;
> +})
> +
> +(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn"
> +  [(set (mem:BLK (scratch))
> +     (unspec:BLK
> +       [(match_operand:SI 0 "register_operand" "r")
> +        (match_operand:V4SI 1 "s_register_operand" "w")
> +        (match_operand:V4SI 2 "s_register_operand" "w")]
> +       VSTRWSSOQ))]
> +  "TARGET_HAVE_MVE"
> +  "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
>    [(set_attr "length" "4")])
> 
>  ;;
> diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
> index
> c57ad73577e1eebebc8951ed5b4fb544dd3381f8..9e9bca4d87fdc31e045b2b5
> bb03b996f082079bd 100644
> --- a/gcc/config/arm/predicates.md
> +++ b/gcc/config/arm/predicates.md
> @@ -37,6 +37,12 @@
>                   && mve_vector_mem_operand (GET_MODE (op), XEXP (op,
> 0),
>                                              false)")))
> 
> +(define_predicate "mve_scatter_memory"
> +  (and (match_code "mem")
> +       (match_test "TARGET_HAVE_MVE && REG_P (XEXP (op, 0))
> +                 && mve_vector_mem_operand (GET_MODE (op), XEXP (op,
> 0),
> +                                            false)")))
> +
>  ;; True for immediates in the range of 1 to 16 for MVE.
>  (define_predicate "mve_imm_16"
>    (match_test "satisfies_constraint_Rd (op)"))
> diff --git
> a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..21b9e12d57e064688e6d52
> 493deffc1c2c39761d
> --- /dev/null
> +++
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c
> @@ -0,0 +1,67 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +int
> +foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_s32 (pDataDest, 4, value);
> +    vstrwq_scatter_base_s32 (pDataDest, 132, value);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
> +    return 0;
> +}
> +
> +int
> +foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_u32 (pDataDest, 4, value);
> +    vstrwq_scatter_base_u32 (pDataDest, 132, value);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
> +    return 0;
> +}
> +
> +int
> +foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_f32 (pDataDest, 4, value);
> +    vstrwq_scatter_base_f32 (pDataDest, 132, value);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
> +    return 0;
> +}
> +
> +int
> +foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrdq_scatter_base_s64 (pDataDest, 256, value);
> +    vstrdq_scatter_base_s64 (pDataDest, 512, value);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
> +    return 0;
> +}
> +
> +int
> +foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrdq_scatter_base_u64 (pDataDest, 256, value);
> +    vstrdq_scatter_base_u64 (pDataDest, 512, value);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
> +    vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
> diff --git
> a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..15c6496732a31259ebcceeb
> eb8ac65e071a04b20
> --- /dev/null
> +++
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c
> @@ -0,0 +1,69 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +mve_pred16_t __p;
> +
> +int
> +foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_p_s32 (pDataDest, 4, value, __p);
> +    vstrwq_scatter_base_p_s32 (pDataDest, 132, value, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
> +    return 0;
> +}
> +
> +int
> +foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_p_u32 (pDataDest, 4, value, __p);
> +    vstrwq_scatter_base_p_u32 (pDataDest, 132, value, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
> +    return 0;
> +}
> +
> +int
> +foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrwq_scatter_base_p_f32 (pDataDest, 4, value, __p);
> +    vstrwq_scatter_base_p_f32 (pDataDest, 132, value, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
> +    return 0;
> +}
> +
> +int
> +foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrdq_scatter_base_p_s64 (pDataDest, 256, value, __p);
> +    vstrdq_scatter_base_p_s64 (pDataDest, 512, value, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
> +    return 0;
> +}
> +
> +int
> +foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    vstrdq_scatter_base_p_u64 (pDataDest, 256, value, __p);
> +    vstrdq_scatter_base_p_u64 (pDataDest, 512, value, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
> +    vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */
> diff --git
> a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6d123669c13f168e651b7aa
> 3344c4324fd4afe50
> --- /dev/null
> +++
> b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c
> @@ -0,0 +1,215 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +int
> +foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 
> 15, 8,
> 14};
> +    const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 
> 17, 22,
> 16, 20, 18, 30};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
> +    vstrbq_scatter_offset_u8 (pDataDest, vecOffs1, (uint8x16_t) vecIn1);
> +    vstrbq_scatter_offset_u8 (pDataDest, vecOffs2, (uint8x16_t) vecIn2);
> +    pDataDest[32] = pDataSrc[32];
> +    return 0;
> +}
> +
> +int
> +foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrbq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
> +    vstrbq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrbq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
> +    vstrbq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foobs8( int8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 
> 15, 8,
> 14};
> +    const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 
> 17, 22,
> 16, 20, 18, 30};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
> +    vstrbq_scatter_offset_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1);
> +    vstrbq_scatter_offset_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2);
> +    pDataDest[32] = pDataSrc[32];
> +    return 0;
> +}
> +
> +int
> +foobs16( int8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
> +    vstrbq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
> +    vstrbq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
> +    vstrbq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
> +    vstrbq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
> +    vstrhq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
> +    vstrhq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohs16( int16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
> +    vstrhq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
> +    vstrhq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohf16( float16_t * pDataSrc, float16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1);
> +    vstrhq_scatter_offset_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
> +    vstrwq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foows32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
> +    vstrwq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowf32( float32_t * pDataSrc, float32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrwq_scatter_offset_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1);
> +    vstrwq_scatter_offset_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 3};
> +    const uint64x2_t vecOffs2 = { 1, 2};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
> +    vstrdq_scatter_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
> +    vstrdq_scatter_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
> +    pDataDest[4] = pDataSrc[4];
> +    return 0;
> +}
> +
> +int
> +foows64( int64_t * pDataSrc, int64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 3};
> +    const uint64x2_t vecOffs2 = { 1, 2};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
> +    vstrdq_scatter_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
> +    vstrdq_scatter_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
> +    pDataDest[4] = pDataSrc[4];
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]" 32 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..cd2e1ee80f9dfe35955468a822bd202679039831
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c
> @@ -0,0 +1,216 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +mve_pred16_t __p;
> +int
> +foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
> +    const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
> +    vstrbq_scatter_offset_p_u8(pDataDest, vecOffs1, (uint8x16_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_u8(pDataDest, vecOffs2, (uint8x16_t) vecIn2, __p);
> +    pDataDest[32] = pDataSrc[32];
> +    return 0;
> +}
> +
> +int
> +foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foobs8( int8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
> +    const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
> +    vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2, __p);
> +    pDataDest[32] = pDataSrc[32];
> +    return 0;
> +}
> +
> +int
> +foobs16( int8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
> +    vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
> +    vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
> +    vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
> +    vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
> +    vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohs16( int16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
> +    vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
> +    vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohf16( float16_t * pDataSrc, float16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1, __p);
> +    vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
> +    vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foows32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
> +    vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowf32( float32_t * pDataSrc, float32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
> +    vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1, __p);
> +    vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 3};
> +    const uint64x2_t vecOffs2 = { 1, 2};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
> +    vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
> +    vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
> +    pDataDest[4] = pDataSrc[4];
> +    return 0;
> +}
> +
> +int
> +foows64( int64_t * pDataSrc, int64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 3};
> +    const uint64x2_t vecOffs2 = { 1, 2};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
> +    vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p);
> +    vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p);
> +    pDataDest[4] = pDataSrc[4];
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 32 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..62dfb450a6d30312472f5c8bb2d41e98fe6b6a32
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c
> @@ -0,0 +1,141 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +int
> +foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs1, vecIn1);
> +    vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowf32( float32_t * pDataSrc, float32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    float32x4_t vecIn1 = vldrwq_f32 ((float32_t const *) pDataSrc);
> +    float32x4_t vecIn2 = vldrwq_f32 ((float32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs1, vecIn1);
> +    vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    uint16x8_t vecIn1 = vldrhq_u16 ((uint16_t const *) pDataSrc);
> +    uint16x8_t vecIn2 = vldrhq_u16 ((uint16_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs1, vecIn1);
> +    vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrhq_u32 ((uint16_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrhq_u32 ((uint16_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1);
> +    vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohf16( float16_t * pDataSrc, float16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    float16x8_t vecIn1 = vldrhq_f16 ((float16_t const *) pDataSrc);
> +    float16x8_t vecIn2 = vldrhq_f16 ((float16_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs1, vecIn1);
> +    vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 1};
> +    const uint64x2_t vecOffs2 = { 2, 3};
> +    uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
> +    uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
> +
> +    vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
> +    vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
> +
> +    pDataDest[2] = pDataSrc[2];
> +    return 0;
> +}
> +
> +int
> +foows32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
> +    vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs1, vecIn1);
> +    vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohs16( int16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    int16x8_t vecIn1 = vldrhq_s16 ((int16_t const *) pDataSrc);
> +    int16x8_t vecIn2 = vldrhq_s16 ((int16_t const *) &pDataSrc[8]);
> +    vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs1, vecIn1);
> +    vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs2, vecIn2);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohs32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrhq_s32 ((int16_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrhq_s32 ((int16_t const *) &pDataSrc[4]);
> +    vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1);
> +    vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foods64( int64_t * pDataSrc, int64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 1};
> +    const uint64x2_t vecOffs2 = { 2, 3};
> +    int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
> +    int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[2]);
> +
> +    vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
> +    vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
> +
> +    pDataDest[2] = pDataSrc[2];
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
> diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a51d3a211672e74e99f571ef362445d13f2e2368
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c
> @@ -0,0 +1,142 @@
> +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
> +/* { dg-add-options arm_v8_1m_mve_fp } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include "arm_mve.h"
> +
> +mve_pred16_t __p;
> +int
> +foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
> +    uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[4], __p);
> +    vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foowf32( float32_t * pDataSrc, float32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    float32x4_t vecIn1 = vldrwq_z_f32 ((float32_t const *) pDataSrc, __p);
> +    float32x4_t vecIn2 = vldrwq_z_f32 ((float32_t const *) &pDataSrc[4], __p);
> +    vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    uint16x8_t vecIn1 = vldrhq_z_u16 ((uint16_t const *) pDataSrc, __p);
> +    uint16x8_t vecIn2 = vldrhq_z_u16 ((uint16_t const *) &pDataSrc[8], __p);
> +    vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    uint32x4_t vecIn1 = vldrhq_z_u32 ((uint16_t const *) pDataSrc, __p);
> +    uint32x4_t vecIn2 = vldrhq_z_u32 ((uint16_t const *) &pDataSrc[4], __p);
> +    vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1, __p);
> +    vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohf16( float16_t * pDataSrc, float16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    float16x8_t vecIn1 = vldrhq_z_f16 ((float16_t const *) pDataSrc, __p);
> +    float16x8_t vecIn2 = vldrhq_z_f16 ((float16_t const *) &pDataSrc[8], __p);
> +    vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 1};
> +    const uint64x2_t vecOffs2 = { 2, 3};
> +    uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
> +    uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[2], __p);
> +
> +    vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
> +    vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
> +
> +    pDataDest[2] = pDataSrc[2];
> +    return 0;
> +}
> +
> +int
> +foows32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p);
> +    int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[4], __p);
> +    vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foohs16( int16_t * pDataSrc, int16_t * pDataDest)
> +{
> +    const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
> +    const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
> +    int16x8_t vecIn1 = vldrhq_z_s16 ((int16_t const *) pDataSrc, __p);
> +    int16x8_t vecIn2 = vldrhq_z_s16 ((int16_t const *) &pDataSrc[8], __p);
> +    vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs1, vecIn1, __p);
> +    vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[16] = pDataSrc[16];
> +    return 0;
> +}
> +
> +int
> +foohs32( int32_t * pDataSrc, int32_t * pDataDest)
> +{
> +    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
> +    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
> +    int32x4_t vecIn1 = vldrhq_z_s32 ((int16_t const *) pDataSrc, __p);
> +    int32x4_t vecIn2 = vldrhq_z_s32 ((int16_t const *) &pDataSrc[4], __p);
> +    vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1, __p);
> +    vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2, __p);
> +    pDataDest[8] = pDataSrc[8];
> +    return 0;
> +}
> +
> +int
> +foods64( int64_t * pDataSrc, int64_t * pDataDest)
> +{
> +    const uint64x2_t vecOffs1 = { 0, 1};
> +    const uint64x2_t vecOffs2 = { 2, 3};
> +    int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p);
> +    int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[2], __p);
> +
> +    vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p);
> +    vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p);
> +
> +    pDataDest[2] = pDataSrc[2];
> +    return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */