This patch changes the sequence that gcc generates for inline expansion of
strcmp/strncmp using scalar (gpr) instructions. The new sequence is one
instruction shorter and uses cmpb/cmpb/orc./bne which I also have been
told that valgrind should be able to understand as the defined/undefined
info can be propagated and should show that the branch is not based on
any undefined data past the end of the string.

Performance is mostly a wash. The new sequence is faster if there is a
difference in the string, the old sequence is sligntly faster for short strings
that do not differ. The new sequence is faster for long strings that do not
differ, but that isn't important because if vsx is enabled, the gpr
sequence is only used for 15 bytes or less.

Bootstrap/regtest passes on ppc64le (power8, power9), ppc64 (power8)
and ppc32 (power8). Ok for trunk?

Thanks,
    Aaron


2018-10-25  Aaron Sawdey  <acsaw...@linux.ibm.com>

        * config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to
        a shorter sequence with fewer branches.
        (emit_final_str_compare_gpr): Ditto.


Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c   (revision 265393)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -259,7 +259,7 @@
       gcc_assert (mode == E_QImode);
       emit_move_insn (reg, mem);
       break;
-
+
     default:
       gcc_unreachable ();
       break;
@@ -726,7 +726,7 @@
     {
       if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
        /* Do not expect length longer than word_mode.  */
-       return false;
+       return false;
       else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE 
(word_mode))
        {
          bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
@@ -770,7 +770,7 @@
   rtx j;

   /* Example of generated code for 35 bytes aligned 1 byte.
-
+
             mtctr 8
             li 6,0
             li 5,8
@@ -798,7 +798,7 @@
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10
-
+
      Compiled with -fno-reorder-blocks for clarity.  */

   /* Structure of what we're going to do:
@@ -1041,7 +1041,7 @@
       if (!bytes_is_const)
        {
          /* If we're dealing with runtime length, we have to check if
-            it's zero after the loop. When length is known at compile
+            it's zero after the loop.  When length is known at compile
             time the no-remainder condition is dealt with above.  By
             doing this after cleanup_label, we also deal with the
             case where length is 0 at the start and we bypass the
@@ -1411,7 +1411,7 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
-     it for cmpld which needs CCUNSmode. */
+     it for cmpld which needs CCUNSmode.  */
   rtx cond;
   if (TARGET_P9_MISC)
     cond = gen_reg_rtx (CCUNSmode);
@@ -1655,7 +1655,7 @@
        emit_label (convert_label);

       /* We need to produce DI result from sub, then convert to target SI
-        while maintaining <0 / ==0 / >0 properties. This sequence works:
+        while maintaining <0 / ==0 / >0 properties.  This sequence works:
         subfc L,A,B
         subfe H,H,H
         popcntd L,L
@@ -1740,7 +1740,7 @@
    to strcmp/strncmp if we have equality at the end of the inline comparison.
    P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
    to clean up and generate the final comparison result.
-   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
    set the final result.  */
 static void
 expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
@@ -1763,12 +1763,9 @@
   while (bytes_to_compare > 0)
     {
       /* GPR compare sequence:
-         check each 8B with: ld/ld cmpd bne
-        If equal, use rldicr/cmpb to check for zero byte.
+         check each 8B with: ld/ld/cmpb/cmpb/orc./bne
+
          cleanup code at end:
-         cmpb          get byte that differs
-         cmpb          look for zero byte
-         orc           combine
          cntlzd        get bit of first zero/diff byte
          subfic        convert for rldcl use
          rldcl rldcl   extract diff/zero byte
@@ -1776,7 +1773,7 @@

          The last compare can branch around the cleanup code if the
          result is zero because the strings are exactly equal.  */
-
+
       unsigned int align = compute_current_alignment (base_align, offset);
       load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
       load_mode_size = GET_MODE_SIZE (load_mode);
@@ -1801,34 +1798,49 @@
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;

-      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
+      rtx offset_reg = gen_reg_rtx (Pmode);
+      emit_move_insn (offset_reg, GEN_INT (offset));
+
+      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg);
       do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, 
orig_src1);
-      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
+      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg);
       do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, 
orig_src2);

       /* We must always left-align the data we read, and
         clear any bytes to the right that are beyond the string.
         Otherwise the cmpb sequence won't produce the correct
-        results.  The beginning of the compare will be done
-        with word_mode so will not have any extra shifts or
-        clear rights.  */
+        results.  However if there is only one byte left, we
+        can just subtract to get the final result so the shifts
+        and clears are not needed.  */

-      if (load_mode_size < word_mode_size)
+      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
+
+      /* Loading just a single byte is a special case.  If we are
+        loading more than that, we have to check whether we are
+        looking at the entire chunk of data.  If not, rotate left and
+        clear right so that bytes we aren't supposed to look at are
+        zeroed, and the first byte we are supposed to compare is
+        leftmost.  */
+      if (load_mode_size != 1)
        {
-         /* Rotate left first. */
-         rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
-         do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
-         do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
-       }
+         if (load_mode_size < word_mode_size)
+           {
+             /* Rotate left first.  */
+             rtx sh = GEN_INT (BITS_PER_UNIT
+                               * (word_mode_size - load_mode_size));
+             do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
+             do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
+           }

-      if (cmp_bytes < word_mode_size)
-       {
-         /* Now clear right.  This plus the rotate can be
-            turned into a rldicr instruction. */
-         HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
-         rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
-         do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
-         do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
+         if (cmp_bytes < word_mode_size)
+           {
+             /* Now clear right.  This plus the rotate can be
+                turned into a rldicr instruction.  */
+             HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+             rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+             do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
+             do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
+           }
        }

       /* Cases to handle.  A and B are chunks of the two strings.
@@ -1842,8 +1854,6 @@
         A == B: branch to result 0.
         A != B: cleanup code to compute result.  */

-      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
-
       rtx dst_label;
       if (remain > 0 || equality_compare_rest)
        {
@@ -1857,54 +1867,89 @@
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

-      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
-      rtx cond = gen_reg_rtx (CCmode);
+      if (load_mode_size == 1)
+       {
+         /* Special case for comparing just single byte.  */
+         if (equality_compare_rest)
+           {
+             /* Use subf./bne to branch to final_move_label if the
+                byte differs, otherwise fall through to the strncmp
+                call.  We must also check for a zero byte here as we
+                must not make the library call if this is the end of
+                the string.  */

-      /* Always produce the 0 result, it is needed if
-        cmpb finds a 0 byte in this chunk.  */
-      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
-      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
+             rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
+             rtx cond = gen_reg_rtx (CCmode);
+             rtx diff_rtx = gen_rtx_MINUS (word_mode,
+                                           tmp_reg_src1, tmp_reg_src2);
+             rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
+             rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

-      rtx cmp_rtx;
-      if (remain == 0 && !equality_compare_rest)
-       cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
-      else
-       cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+             rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+                                                lab_ref, pc_rtx);
+             rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+             JUMP_LABEL (j) = final_move_label;
+             LABEL_NUSES (final_move_label) += 1;

-      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
-                                        lab_ref, pc_rtx);
-      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-      JUMP_LABEL (j) = dst_label;
-      LABEL_NUSES (dst_label) += 1;
+             /* Check for zero byte here before fall through to
+                library call.  This catches the case where the
+                strings are equal and end in a zero byte at this
+                position.  */

-      if (remain > 0 || equality_compare_rest)
+             rtx cond0 = gen_reg_rtx (CCmode);
+             emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
+                                                     const0_rtx));
+
+             rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
+
+             rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
+                                                lab_ref, pc_rtx);
+             rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
+             JUMP_LABEL (j0) = final_move_label;
+             LABEL_NUSES (final_move_label) += 1;
+           }
+         else
+           {
+             /* This is the last byte to be compared so we can use
+                subf to compute the final result and branch
+                unconditionally to final_move_label.  */
+
+             do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
+
+             rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
+             rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+             JUMP_LABEL (j) = final_move_label;
+             LABEL_NUSES (final_move_label) += 1;
+             emit_barrier ();
+           }
+       }
+      else
        {
-         /* Generate a cmpb to test for a 0 byte and branch
-            to final result if found.  */
          rtx cmpb_zero = gen_reg_rtx (word_mode);
-         rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
-         rtx condz = gen_reg_rtx (CCmode);
+         rtx cmpb_diff = gen_reg_rtx (word_mode);
          rtx zero_reg = gen_reg_rtx (word_mode);
+         rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
+         rtx cond = gen_reg_rtx (CCmode);
+
          emit_move_insn (zero_reg, GEN_INT (0));
+         do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
          do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
+         rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
+         rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

-         if (cmp_bytes < word_mode_size)
-           {
-             /* Don't want to look at zero bytes past end.  */
-             HOST_WIDE_INT mb =
-               BITS_PER_UNIT * (word_mode_size - cmp_bytes);
-             rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
-             do_and3 (cmpb_zero, cmpb_zero, mask);
-           }
+         rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

-         emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
-         rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
-         rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
-                                            lab_ref_fin, pc_rtx);
-         rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-         JUMP_LABEL (j2) = final_move_label;
-         LABEL_NUSES (final_move_label) += 1;
+         rtx cmp_rtx;
+         if (remain == 0 && !equality_compare_rest)
+           cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
+         else
+           cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

+         rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+                                            lab_ref, pc_rtx);
+         rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+         JUMP_LABEL (j) = dst_label;
+         LABEL_NUSES (dst_label) += 1;
        }

       offset += cmp_bytes;
@@ -1915,7 +1960,7 @@
   return;
 }

-/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
    instructions.

    BYTES_TO_COMPARE is the number of bytes to be compared.
@@ -1931,7 +1976,7 @@
    to strcmp/strncmp if we have equality at the end of the inline comparison.
    P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code 
to clean up
    and generate the final comparison result.
-   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
    set the final result.  */
 static void
 expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
@@ -1982,12 +2027,12 @@
         bne 6,.Lmismatch

         Use the overlapping compare trick for the last block if it is
-        less than 16 bytes.
+        less than 16 bytes.
       */

       load_mode = V16QImode;
       load_mode_size = GET_MODE_SIZE (load_mode);
-
+
       if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
       else
@@ -2046,10 +2091,10 @@
       if (branch_to_cleanup)
        {
          /* Branch to cleanup code, otherwise fall through to do more
-            compares. P8 and P9 use different CR bits because on P8
+            compares.  P8 and P9 use different CR bits because on P8
             we are looking at the result of a comparsion vs a
             register of zeroes so the all-true condition means no
-            difference or zero was found. On P9, vcmpnezb sets a byte
+            difference or zero was found.  On P9, vcmpnezb sets a byte
             to 0xff if there is a mismatch or zero, so the all-false
             condition indicates we found no difference or zero.  */
          if (!cleanup_label)
@@ -2062,7 +2107,7 @@
        }
       else
        {
-         /* Branch to final return or fall through to cleanup,
+         /* Branch to final return or fall through to cleanup,
             result is already set to 0.  */
          dst_label = final_move_label;
          if (TARGET_P9_VECTOR)
@@ -2088,10 +2133,7 @@
 /* Generate the final sequence that identifies the differing
    byte and generates the final result, taking into account
    zero bytes:
-
-   cmpb              cmpb_result1, src1, src2
-   cmpb              cmpb_result2, src1, zero
-   orc               cmpb_result1, cmp_result1, cmpb_result2
+
    cntlzd            get bit of first zero/diff byte
    addi              convert for rldcl use
    rldcl rldcl       extract diff/zero byte
@@ -2105,10 +2147,7 @@
 emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
 {
   machine_mode m = GET_MODE (str1);
-  rtx cmpb_diff = gen_reg_rtx (m);
-  rtx cmpb_zero = gen_reg_rtx (m);
   rtx rot_amt = gen_reg_rtx (m);
-  rtx zero_reg = gen_reg_rtx (m);

   rtx rot1_1 = gen_reg_rtx (m);
   rtx rot1_2 = gen_reg_rtx (m);
@@ -2117,12 +2156,7 @@

   if (m == SImode)
     {
-      emit_insn (gen_cmpbsi3 (cmpb_diff, str1, str2));
-      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
-      emit_insn (gen_cmpbsi3 (cmpb_zero, str1, zero_reg));
-      emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff));
-      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
-      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
+      emit_insn (gen_clzsi2 (rot_amt, result));
       emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
       emit_insn (gen_rotlsi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
@@ -2134,12 +2168,7 @@
     }
   else if (m == DImode)
     {
-      emit_insn (gen_cmpbdi3 (cmpb_diff, str1, str2));
-      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
-      emit_insn (gen_cmpbdi3 (cmpb_zero, str1, zero_reg));
-      emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff));
-      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
-      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
+      emit_insn (gen_clzdi2 (rot_amt, result));
       emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
       emit_insn (gen_rotldi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
@@ -2151,7 +2180,7 @@
     }
   else
     gcc_unreachable ();
-
+
   return;
 }

@@ -2169,10 +2198,10 @@
         lbzx 10,28,9    # use that offset to load differing byte
         lbzx 3,29,9
         subf 3,3,10     # subtract for final result
-
+
    P9:
         vclzlsbb            # counts trailing bytes with lsb=0
-        vextublx            # extract differing byte
+        vextublx            # extract differing byte

    STR1 is the reg rtx for data from string 1.
    STR2 is the reg rtx for data from string 2.
@@ -2208,7 +2237,7 @@
       gcc_assert (TARGET_P8_VECTOR);
       rtx diffix = gen_reg_rtx (DImode);
       rtx result_gbbd = gen_reg_rtx (V16QImode);
-      /* Since each byte of the input is either 00 or FF, the bytes in
+      /* Since each byte of the input is either 00 or FF, the bytes in
         dw0 and dw1 after vgbbd are all identical to each other.  */
       emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
       /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
@@ -2226,7 +2255,7 @@
       else
        emit_insn (gen_ctzdi2 (count, diffix));

-      /* P8 doesn't have a good solution for extracting one byte from
+      /* P8 doesn't have a good solution for extracting one byte from
         a vsx reg like vextublx on P9 so we just compute the offset
         of the differing byte and load it from each string.  */
       do_add3 (off_reg, off_reg, count);
@@ -2247,7 +2276,7 @@
 }

 /* Expand a string compare operation with length, and return
-   true if successful. Return false if we should let the
+   true if successful.  Return false if we should let the
    compiler generate normal code, probably a strncmp call.

    OPERANDS[0] is the target (result).
@@ -2279,9 +2308,9 @@
   rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
   rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

-  /* If we have a length, it must be constant. This simplifies things
+  /* If we have a length, it must be constant.  This simplifies things
      a bit as we don't have to generate code to check if we've exceeded
-     the length. Later this could be expanded to handle this case.  */
+     the length.  Later this could be expanded to handle this case.  */
   if (!no_length && !CONST_INT_P (bytes_rtx))
     return false;

@@ -2311,7 +2340,7 @@
   else
     bytes = UINTVAL (bytes_rtx);

-  /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at
+  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
      least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
      at least POWER8.  That way we can rely on overlapping compares to
      do the final comparison of less than 16 bytes.  Also I do not
@@ -2363,7 +2392,7 @@
   rtx final_move_label = gen_label_rtx ();
   rtx final_label = gen_label_rtx ();
   rtx begin_compare_label = NULL;
-
+
   if (base_align < required_align)
     {
       /* Generate code that checks distance to 4k boundary for this case.  */
@@ -2472,7 +2501,7 @@
                                 &cleanup_label, final_move_label);

   offset = compare_length;
-
+
   if (equality_compare_rest)
     {
       /* Update pointers past what has been compared already.  */


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Reply via email to