Hi Aaron, On Wed, Aug 22, 2018 at 12:31:51PM -0500, Aaron Sawdey wrote: > This patch teaches rs6000 inline expansion of strcmp/strncmp how to > generate vector/vsx code for power8/power9 processors. Compares 16 > bytes and longer are generated using the vector code, which is > considerably faster than the gpr based code.
:-) > (expand_strn_compare): Support for vector strncmp. "Add support"? > -(define_insn "*altivec_vcmpequ<VI_char>_p" > +(define_insn "altivec_vcmpequ<VI_char>_p" > [(set (reg:CC CR6_REGNO) > (unspec:CC [(eq:CC (match_operand:VI2 1 "register_operand" "v") > (match_operand:VI2 2 "register_operand" "v"))] You missed this one in the changelog. > --- gcc/config/rs6000/rs6000-string.c (revision 263753) > +++ gcc/config/rs6000/rs6000-string.c (working copy) > @@ -157,6 +157,29 @@ > { > switch (GET_MODE (reg)) > { > + case E_V16QImode: > + switch (mode) { > + case E_V16QImode: > + if (!BYTES_BIG_ENDIAN) This has a trailing space. > + if (TARGET_P9_VECTOR) > + emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem)); > + else > + { > + rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, V16QImode, 0); > + gcc_assert (MEM_P (mem)); > + rtx addr = XEXP (mem, 0); > + rtx mem_v2di = gen_rtx_MEM (V2DImode, addr); > + MEM_COPY_ATTRIBUTES (mem_v2di, mem); > + set_mem_size (mem, GET_MODE_SIZE (V2DImode)); > + emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di)); > + } > + else If you have to outdent more than one, you usually should put the inner if in a block of its own. Another option is to make a helper function, if you can come up with something for which you can make a sane and short name. > + emit_insn (gen_vsx_movv2di_64bit (reg, mem)); > + break; > + default: > + gcc_unreachable (); > + } > + break; This cannot be correct (same indent for break and preceding } ). > + switch (mode) > + { > + case E_QImode: > + emit_move_insn (reg, mem); > + break; > + default: > + debug_rtx(reg); > + gcc_unreachable (); > + break; > + } Unless you have a reason you want to keep debug_rtx here, why not just do gcc_assert (mode == E_QImode); emit_move_insn (reg, mem); > +static void > +expand_strncmp_vec_sequence(unsigned HOST_WIDE_INT bytes_to_compare, > + rtx orig_src1, rtx orig_src2, > + rtx s1addr, rtx s2addr, rtx off_reg, > + rtx s1data, rtx s2data, > + rtx vec_result, bool equality_compare_rest, > + rtx &cleanup_label, rtx final_move_label) Space before (. Wow what a monster :-) Why use a reference instead of a pointer for cleanup_label? Will that work even, it seems to allow NULL? > + unsigned int i; > + rtx zr[16]; > + for (i = 0; i < 16; i++) > + zr[i] = GEN_INT (0); > + rtvec zv = gen_rtvec_v (16, zr); > + rtx zero_reg = gen_reg_rtx (V16QImode); > + rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv)); Is there no helper for this already? Should there be? > + if (extra_bytes < offset) > + { > + offset -= extra_bytes; > + cmp_bytes = load_mode_size; > + bytes_to_compare = cmp_bytes; > + } > + else > + /* We do not call this with less than 16 bytes. */ > + gcc_unreachable (); gcc_assert (offset > extra_bytes); offset -= extra_bytes; cmp_bytes = load_mode_size; bytes_to_compare = cmp_bytes; > + /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at > + least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is > + at least POWER8. That way we can rely on overlapping compares to > + do the final comparison of less than 16 bytes. Also I do not want > + to deal with making this work for 32 bits. */ Two spaces after full stop. > + if (use_vec) > + { > + s1addr = gen_reg_rtx (Pmode); > + s2addr = gen_reg_rtx (Pmode); > + off_reg = gen_reg_rtx (Pmode); > + vec_result = gen_reg_rtx (load_mode); > + emit_move_insn (result_reg, GEN_INT (0)); > + expand_strncmp_vec_sequence(compare_length, Space before (. Please fix these nits, and then it is okay for trunk. Thanks! Segher