Re: [PATCH 1/2] aarch64: Add -msimd-memops option controlling SIMD usage in memset/memcpy

Andrew Pinski Wed, 06 Aug 2025 11:50:28 -0700

On Wed, Aug 6, 2025 at 11:20 AM Keith Packard <kei...@keithp.com> wrote:
>
> This option (enabled by default) preserves existing behavior by
> allowing use of Advanced SIMD registers while expanding
> memset/memcpy/memmove operations into inline instructions.
>
> Disabling this option prevents use of these registers for environments
> where the FPU may be disabled to reduce the cost of saving/restoring
> the processor state, such as in interrupt handlers.


I am trying to understand the difference between this option and
-mgeneral-regs-only, since you mentioned that the FPU will be disabled in
those cases.
Also, I am not sure aarch64-elf should turn this off by default, since
there are some bare-metal environments that support saving/restoring the
full SIMD registers in interrupt handlers.

Thanks,
Andrew

>
> Signed-off-by: Keith Packard <kei...@keithp.com>
> ---
>  gcc/common/config/aarch64/aarch64-common.cc |  4 ++++
>  gcc/config/aarch64/aarch64.cc               |  8 +++++---
>  gcc/config/aarch64/aarch64.h                |  7 +++++++
>  gcc/config/aarch64/aarch64.opt              |  4 ++++
>  gcc/config/aarch64/aarch64.opt.urls         |  3 +++
>  gcc/doc/invoke.texi                         | 10 +++++++++-
>  6 files changed, 32 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/common/config/aarch64/aarch64-common.cc 
> b/gcc/common/config/aarch64/aarch64-common.cc
> index 1488697c6ce..b6b60b0fdfb 100644
> --- a/gcc/common/config/aarch64/aarch64-common.cc
> +++ b/gcc/common/config/aarch64/aarch64-common.cc
> @@ -146,6 +146,10 @@ aarch64_handle_option (struct gcc_options *opts,
>        opts->x_flag_aarch64_max_vectorization = val;
>        return true;
>
> +    case OPT_msimd_memops:
> +      opts->x_aarch64_flag_simd_memops = val;
> +      return true;
> +
>      default:
>        return true;
>      }
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d30c9c75e42..19e6973a5e3 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -19906,6 +19906,8 @@ static const struct aarch64_attribute_info 
> aarch64_attributes[] =
>       OPT_moutline_atomics},
>    { "max-vectorization", aarch64_attr_bool, false, NULL,
>       OPT_mmax_vectorization},
> +  { "simd-memops", aarch64_attr_bool, true, NULL,
> +     OPT_msimd_memops},
>    { NULL, aarch64_attr_custom, false, NULL, OPT____ }
>  };
>
> @@ -27788,7 +27790,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>
>    /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  
> */
> -  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
> +  unsigned max_copy_size = TARGET_SIMD_MEMOPS ? 256 : 128;
>    unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
>                                        : aarch64_mops_memcpy_size_threshold;
>
> @@ -27805,7 +27807,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>       ??? Although it would be possible to use LDP/STP Qn in streaming mode
>       (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
>       whether that would improve performance.  */
> -  bool use_qregs = size > 24 && TARGET_SIMD;
> +  bool use_qregs = size > 24 && TARGET_SIMD_MEMOPS;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -27905,7 +27907,7 @@ aarch64_expand_setmem (rtx *operands)
>    machine_mode mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align memset may use the MOPS expansion.  */
> -  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
> +  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD_MEMOPS
>        || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 096c853af7f..fc6fd6bf869 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -121,6 +121,13 @@
>     of LSE instructions.  */
>  #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
>
> +#ifndef AARCH64_SIMD_MEMOPS_DEFAULT
> +#define AARCH64_SIMD_MEMOPS_DEFAULT 1
> +#endif
> +
> +/* Allow use of SIMD registers for memory copy and set expansions */
> +#define TARGET_SIMD_MEMOPS (TARGET_SIMD && aarch64_flag_simd_memops)
> +
>  /* Align global data as an optimization.  */
>  #define DATA_ALIGNMENT(EXP, ALIGN) aarch64_data_alignment (EXP, ALIGN)
>
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index 9ca753e6a88..1d77d2048f2 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -352,6 +352,10 @@ moutline-atomics
>  Target Var(aarch64_flag_outline_atomics) Init(2) Save
>  Generate local calls to out-of-line atomic operations.
>
> +msimd-memops
> +Target Var(aarch64_flag_simd_memops) Init(AARCH64_SIMD_MEMOPS_DEFAULT) Save
> +Allow use of SIMD registers in memory set/copy expansions.
> +
>  -param=aarch64-vect-compare-costs=
>  Target Joined UInteger Var(aarch64_vect_compare_costs) Init(1) 
> IntegerRange(0, 1) Param
>  When vectorizing, consider using multiple different approaches and use
> diff --git a/gcc/config/aarch64/aarch64.opt.urls 
> b/gcc/config/aarch64/aarch64.opt.urls
> index 7ec14a94381..709fc86a6c1 100644
> --- a/gcc/config/aarch64/aarch64.opt.urls
> +++ b/gcc/config/aarch64/aarch64.opt.urls
> @@ -92,6 +92,9 @@ 
> UrlSuffix(gcc/AArch64-Options.html#index-mstack-protector-guard-reg)
>  mstack-protector-guard-offset=
>  UrlSuffix(gcc/AArch64-Options.html#index-mstack-protector-guard-offset)
>
> +msimd-memops
> +UrlSuffix(gcc/AArch64-Options.html#index-msimd-memops)
> +
>  Wexperimental-fmv-target
>  UrlSuffix(gcc/AArch64-Options.html#index-Wexperimental-fmv-target)
>
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 00468a72ada..4d518c28049 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -835,7 +835,7 @@ Objective-C and Objective-C++ Dialects}.
>  -moverride=@var{string}  -mverbose-cost-dump
>  -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{sysreg}
>  -mstack-protector-guard-offset=@var{offset} -mtrack-speculation
> --moutline-atomics -mearly-ldp-fusion -mlate-ldp-fusion
> +-moutline-atomics -mearly-ldp-fusion -mlate-ldp-fusion -msimd-memops
>  -Wexperimental-fmv-target}
>
>  @emph{Adapteva Epiphany Options} (@ref{Adapteva Epiphany Options})
> @@ -22182,6 +22182,14 @@ For best performance it is highly recommended to use 
> @option{-mcpu} or
>  @option{-mtune} instead.  This parameter should only be used for code
>  exploration.
>
> +@item -msimd-memops
> +@itemx -mno-simd-memops
> +Enable or disable use of Advanced SIMD registers when expanding memory
> +copy and memory set operations. Use of these registers can improve
> +performance and reduce instruction count for these operations. This
> +option is ignored unless Advanced SIMD registers are available.
> +This option is on by default.
> +
>  @opindex march
>  @item -march=@var{name}
>  Specify the name of the target architecture and, optionally, one or
> --
> 2.49.0
>

Re: [PATCH 1/2] aarch64: Add -msimd-memops option controlling SIMD usage in memset/memcpy

Reply via email to