MOVS instructions allow segment override of their source operand, e.g.:

    rep movsq %gs:(%rsi), (%rdi)

where %rsi is the address of the source location (with %gs segment override)
and %rdi is the address of the destination location.

The testcase improves from (-O2 -mno-sse -mtune=generic):

    xorl    %eax, %eax
.L2:
    movl    %eax, %edx
    addl    $8, %eax
    movq    %gs:m(%rdx), %rcx
    movq    %rcx, (%rdi,%rdx)
    cmpl    $240, %eax
    jb    .L2
    ret

to:
    movl    $m, %esi
    movl    $30, %ecx
    rep movsq %gs:(%rsi), (%rdi)
    ret

    PR target/111657

gcc/ChangeLog:

    * config/i386/i386-expand.cc (alg_usable_p): Remove have_as bool
    argument and add dst_as and src_as address space arguments.  Reject
    libcall algorithm with dst_as and src_as in the non-default address
    spaces.  Reject rep_prefix_{1,4,8}_byte algorithms with dst_as in
    the non-default address space.
    (decide_alg): Remove have_as bool argument and add dst_as and src_as
    address space arguments.  Update calls to alg_usable_p.
    (ix86_expand_set_or_cpymem): Update call to decide_alg.
    * config/i386/i386.md (strmov): Do not fail if operand[3] (source)
    is in the non-default address space.  Expand with gen_strmov_singleop
    only when operand[1] (destination) is in the default address space.
    (*strmovdi_rex_1): Determine memory operands from insn pattern.
    Allow only when destination is in the default address space.
    Rewrite asm template to use explicit operands.
    (*strmovsi_1): Ditto.
    (*strmovhi_1): Ditto.
    (*strmovqi_1): Ditto.
    (*rep_movdi_rex64): Ditto.
    (*rep_movsi): Ditto.
    (*rep_movqi): Ditto.
    (*strsetdi_rex_1): Determine memory operands from insn pattern.
    Allow only when destination is in the default address space.
    (*strsetsi_1): Ditto.
    (*strsethi_1): Ditto.
    (*strsetqi_1): Ditto.
    (*rep_stosdi_rex64): Ditto.
    (*rep_stossi): Ditto.
    (*rep_stosqi): Ditto.

gcc/testsuite/ChangeLog:

    * gcc.target/i386/pr111657-1.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 388e65192e4..f1cc85b4531 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -8907,31 +8907,33 @@ expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
 /* Return true if ALG can be used in current context.
    Assume we expand memset if MEMSET is true.  */
 static bool
-alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
+alg_usable_p (enum stringop_alg alg, bool memset,
+             addr_space_t dst_as, addr_space_t src_as)
 {
   if (alg == no_stringop)
     return false;
   /* It is not possible to use a library call if we have non-default
      address space.  We can do better than the generic byte-at-a-time
      loop, used as a fallback.  */
-  if (alg == libcall && have_as)
+  if (alg == libcall &&
+      !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
     return false;
   if (alg == vector_loop)
     return TARGET_SSE || TARGET_AVX;
   /* Algorithms using the rep prefix want at least edi and ecx;
      additionally, memset wants eax and memcpy wants esi.  Don't
      consider such algorithms if the user has appropriated those
-     registers for their own purposes, or if we have a non-default
-     address space, since some string insns cannot override the segment.  */
+     registers for their own purposes, or if we have the destination
+     in the non-default address space, since string insns cannot
+     override the destination segment.  */
   if (alg == rep_prefix_1_byte
       || alg == rep_prefix_4_byte
       || alg == rep_prefix_8_byte)
     {
-      if (have_as)
-       return false;
       if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
-         || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
+         || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
+         || !ADDR_SPACE_GENERIC_P (dst_as))
        return false;
     }
   return true;
@@ -8941,8 +8943,8 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
 static enum stringop_alg
 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
-           bool memset, bool zero_memset, bool have_as,
-           int *dynamic_check, bool *noalign, bool recur)
+           bool memset, bool zero_memset, addr_space_t dst_as,
+           addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
 {
   const struct stringop_algs *algs;
   bool optimize_for_speed;
@@ -8974,7 +8976,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
     {
       enum stringop_alg candidate = algs->size[i].alg;
-      bool usable = alg_usable_p (candidate, memset, have_as);
+      bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
       any_alg_usable_p |= usable;
 
       if (candidate != libcall && candidate && usable)
@@ -8990,17 +8992,17 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
 
   /* If user specified the algorithm, honor it if possible.  */
   if (ix86_stringop_alg != no_stringop
-      && alg_usable_p (ix86_stringop_alg, memset, have_as))
+      && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
     return ix86_stringop_alg;
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (!optimize_for_speed)
     {
       *noalign = true;
       if (!count || (count & 3) || (memset && !zero_memset))
-       return alg_usable_p (rep_prefix_1_byte, memset, have_as)
+       return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
               ? rep_prefix_1_byte : loop_1_byte;
       else
-       return alg_usable_p (rep_prefix_4_byte, memset, have_as)
+       return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
               ? rep_prefix_4_byte : loop;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to
@@ -9024,7 +9026,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
              enum stringop_alg candidate = algs->size[i].alg;
 
              if (candidate != libcall
-                 && alg_usable_p (candidate, memset, have_as))
+                 && alg_usable_p (candidate, memset, dst_as, src_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
@@ -9044,7 +9046,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
                  else if (!any_alg_usable_p)
                    break;
                }
-             else if (alg_usable_p (candidate, memset, have_as)
+             else if (alg_usable_p (candidate, memset, dst_as, src_as)
                       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
                            && candidate == rep_prefix_1_byte
                            /* NB: If min_size != max_size, size is
@@ -9066,7 +9068,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
      choice in ix86_costs.  */
   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
       && (algs->unknown_size == libcall
-         || !alg_usable_p (algs->unknown_size, memset, have_as)))
+         || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
     {
       enum stringop_alg alg;
       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
@@ -9081,8 +9083,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            *dynamic_check = 128;
          return loop_1_byte;
        }
-      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
-                       zero_memset, have_as, dynamic_check, noalign, true);
+      alg = decide_alg (count, new_expected_size, min_size, max_size,
+                       memset, zero_memset, dst_as, src_as,
+                       dynamic_check, noalign, true);
       gcc_assert (*dynamic_check == -1);
       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
@@ -9094,7 +9097,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
   /* Try to use some reasonable fallback algorithm.  Note that for
      non-default address spaces we default to a loop instead of
      a libcall.  */
-  return (alg_usable_p (algs->unknown_size, memset, have_as)
+
+  bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
+                  && ADDR_SPACE_GENERIC_P (src_as));
+
+  return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
          ? algs->unknown_size : have_as ? loop : libcall);
 }
 
@@ -9320,7 +9327,7 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   unsigned HOST_WIDE_INT max_size = -1;
   unsigned HOST_WIDE_INT probable_max_size = -1;
   bool misaligned_prologue_used = false;
-  bool have_as;
+  addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -9358,16 +9365,15 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   if (count > (HOST_WIDE_INT_1U << 30))
     return false;
 
-  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
+  dst_as = MEM_ADDR_SPACE (dst);
   if (!issetmem)
-    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
+    src_as = MEM_ADDR_SPACE (src);
 
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
   alg = decide_alg (count, expected_size, min_size, probable_max_size,
-                   issetmem,
-                   issetmem && val_exp == const0_rtx, have_as,
-                   &dynamic_check, &noalign, false);
+                   issetmem, issetmem && val_exp == const0_rtx,
+                   dst_as, src_as, &dynamic_check, &noalign, false);
 
   if (dump_file)
     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e170da3b0e6..962e7ab92aa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -25587,10 +25587,6 @@ (define_expand "strmov"
              (clobber (reg:CC FLAGS_REG))])]
   ""
 {
-  /* Can't use this for non-default address spaces.  */
-  if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (operands[3])))
-    FAIL;
-
   int piece_size = GET_MODE_SIZE (GET_MODE (operands[1]));
 
   /* If .md ever supports :P for Pmode, these can be directly
@@ -25598,9 +25594,12 @@ (define_expand "strmov"
   operands[5] = plus_constant (Pmode, operands[0], piece_size);
   operands[6] = plus_constant (Pmode, operands[2], piece_size);
 
-  /* Can't use this if the user has appropriated esi or edi.  */
+  /* Can't use this if the user has appropriated esi or edi,
+   * or if we have the destination in the non-default address space,
+   * since string insns cannot override the destination segment.  */
   if ((TARGET_SINGLE_STRINGOP || optimize_insn_for_size_p ())
-      && !(fixed_regs[SI_REG] || fixed_regs[DI_REG]))
+      && !(fixed_regs[SI_REG] || fixed_regs[DI_REG])
+      && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (operands[1])))
     {
       emit_insn (gen_strmov_singleop (operands[0], operands[1],
                                      operands[2], operands[3],
@@ -25635,8 +25634,16 @@ (define_insn "*strmovdi_rex_1"
                (const_int 8)))]
   "TARGET_64BIT
    && !(fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^movsq"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 0);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^movsq\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "memory" "both")
    (set_attr "mode" "DI")])
@@ -25651,8 +25658,16 @@ (define_insn "*strmovsi_1"
        (plus:P (match_dup 3)
                (const_int 4)))]
   "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^movs{l|d}"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 0);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^movs{l|d}\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "memory" "both")
    (set_attr "mode" "SI")])
@@ -25667,8 +25682,16 @@ (define_insn "*strmovhi_1"
        (plus:P (match_dup 3)
                (const_int 2)))]
   "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^movsw"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 0);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^movsw\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "memory" "both")
    (set_attr "mode" "HI")])
@@ -25683,8 +25706,16 @@ (define_insn "*strmovqi_1"
        (plus:P (match_dup 3)
                (const_int 1)))]
   "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^movsb"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 0);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^movsb\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "memory" "both")
    (set (attr "prefix_rex")
@@ -25723,8 +25754,16 @@ (define_insn "*rep_movdi_rex64"
    (use (match_dup 5))]
   "TARGET_64BIT
    && !(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^rep{%;} movsq"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 3))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 3);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^rep{%;} movsq\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
    (set_attr "memory" "both")
@@ -25743,8 +25782,16 @@ (define_insn "*rep_movsi"
        (mem:BLK (match_dup 4)))
    (use (match_dup 5))]
   "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^rep{%;} movs{l|d}"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 3))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 3);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^rep{%;} movs{l|d}\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
    (set_attr "memory" "both")
@@ -25761,8 +25808,16 @@ (define_insn "*rep_movqi"
        (mem:BLK (match_dup 4)))
    (use (match_dup 5))]
   "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
-  "%^rep{%;} movsb"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 3))))"
+{
+  rtx exp = XVECEXP (PATTERN (insn), 0, 3);
+
+  operands[0] = SET_DEST (exp);
+  operands[1] = SET_SRC (exp);
+
+  return "%^rep{%;} movsb\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
    (set_attr "memory" "both")
@@ -25844,7 +25899,8 @@ (define_insn "*strsetdi_rex_1"
    (unspec [(const_int 0)] UNSPEC_STOS)]
   "TARGET_64BIT
    && !(fixed_regs[AX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
   "%^stosq"
   [(set_attr "type" "str")
    (set_attr "memory" "store")
@@ -25858,7 +25914,8 @@ (define_insn "*strsetsi_1"
                (const_int 4)))
    (unspec [(const_int 0)] UNSPEC_STOS)]
   "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
   "%^stos{l|d}"
   [(set_attr "type" "str")
    (set_attr "memory" "store")
@@ -25872,7 +25929,8 @@ (define_insn "*strsethi_1"
                (const_int 2)))
    (unspec [(const_int 0)] UNSPEC_STOS)]
   "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
   "%^stosw"
   [(set_attr "type" "str")
    (set_attr "memory" "store")
@@ -25886,7 +25944,8 @@ (define_insn "*strsetqi_1"
                (const_int 1)))
    (unspec [(const_int 0)] UNSPEC_STOS)]
   "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 0))))"
   "%^stosb"
   [(set_attr "type" "str")
    (set_attr "memory" "store")
@@ -25922,7 +25981,8 @@ (define_insn "*rep_stosdi_rex64"
    (use (match_dup 4))]
   "TARGET_64BIT
    && !(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 2))))"
   "%^rep{%;} stosq"
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
@@ -25940,7 +26000,8 @@ (define_insn "*rep_stossi"
    (use (match_operand:SI 2 "register_operand" "a"))
    (use (match_dup 4))]
   "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 2))))"
   "%^rep{%;} stos{l|d}"
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
@@ -25957,7 +26018,8 @@ (define_insn "*rep_stosqi"
    (use (match_operand:QI 2 "register_operand" "a"))
    (use (match_dup 4))]
   "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
-   && ix86_check_no_addr_space (insn)"
+   && ADDR_SPACE_GENERIC_P
+       (MEM_ADDR_SPACE (SET_DEST (XVECEXP (PATTERN (insn), 0, 2))))"
   "%^rep{%;} stosb"
   [(set_attr "type" "str")
    (set_attr "prefix_rep" "1")
diff --git a/gcc/testsuite/gcc.target/i386/pr111657-1.c b/gcc/testsuite/gcc.target/i386/pr111657-1.c
new file mode 100644
index 00000000000..2e0084aa67f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111657-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mtune=skylake -masm=att" } */
+
+typedef unsigned long uword __attribute__ ((mode (word)));
+
+struct a { uword arr[30]; };
+
+__seg_gs struct a m;
+void bar (struct a *dst) { *dst = m; }
+
+/* { dg-final { scan-assembler "rep\[; \t\]+movs(l|q)\[ \t\]+%gs:" } } */

Reply via email to