On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu...@intel.com> wrote:
> Since mov<mode>_internal patterns handle both aligned/unaligned load
> and store, we can simplify ix86_avx256_split_vector_move_misalign and
> ix86_expand_vector_move_misalign.
>
> Tested on x86-64.  OK for trunk?
>
> H.J.
> ---
>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>         Short-cut unaligned load and store cases.  Handle all integer
>         vector modes.
>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>         directly without checking mode class.
LGTM, but it is hard to review interwoven code movements and deletions...
Hopefully OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c | 252 ++++++++++++++++---------------------------------
>  1 file changed, 81 insertions(+), 171 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 4e48572..e056f68 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18820,7 +18820,39 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
>    rtx (*extract) (rtx, rtx, rtx);
>    machine_mode mode;
>
> -  switch (GET_MODE (op0))
> +  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> +      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
> +    {
> +      emit_insn (gen_rtx_SET (op0, op1));
> +      return;
> +    }
> +
> +  rtx orig_op0 = NULL_RTX;
> +  mode = GET_MODE (op0);
> +  switch (GET_MODE_CLASS (mode))
> +    {
> +    case MODE_VECTOR_INT:
> +    case MODE_INT:
> +      if (mode != V32QImode)
> +        {
> +          if (!MEM_P (op0))
> +            {
> +              orig_op0 = op0;
> +              op0 = gen_reg_rtx (V32QImode);
> +            }
> +          else
> +            op0 = gen_lowpart (V32QImode, op0);
> +          op1 = gen_lowpart (V32QImode, op1);
> +          mode = V32QImode;
> +        }
> +      break;
> +    case MODE_VECTOR_FLOAT:
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  switch (mode)
>      {
>      default:
>        gcc_unreachable ();
> @@ -18840,34 +18872,25 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
>
>    if (MEM_P (op1))
>      {
> -      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
> -          && optimize_insn_for_speed_p ())
> -        {
> -          rtx r = gen_reg_rtx (mode);
> -          m = adjust_address (op1, mode, 0);
> -          emit_move_insn (r, m);
> -          m = adjust_address (op1, mode, 16);
> -          r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
> -          emit_move_insn (op0, r);
> -        }
> -      else
> -        emit_insn (gen_rtx_SET (op0, op1));
> +      rtx r = gen_reg_rtx (mode);
> +      m = adjust_address (op1, mode, 0);
> +      emit_move_insn (r, m);
> +      m = adjust_address (op1, mode, 16);
> +      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
> +      emit_move_insn (op0, r);
>      }
>    else if (MEM_P (op0))
>      {
> -      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
> -          && optimize_insn_for_speed_p ())
> -        {
> -          m = adjust_address (op0, mode, 0);
> -          emit_insn (extract (m, op1, const0_rtx));
> -          m = adjust_address (op0, mode, 16);
> -          emit_insn (extract (m, op1, const1_rtx));
> -        }
> -      else
> -        emit_insn (gen_rtx_SET (op0, op1));
> +      m = adjust_address (op0, mode, 0);
> +      emit_insn (extract (m, op1, const0_rtx));
> +      m = adjust_address (op0, mode, 16);
> +      emit_insn (extract (m, op1, const1_rtx));
>      }
>    else
>      gcc_unreachable ();
> +
> +  if (orig_op0)
> +    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
>  }
>
>  /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
> @@ -18925,118 +18948,50 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
>  void
>  ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
>  {
> -  rtx op0, op1, orig_op0 = NULL_RTX, m;
> +  rtx op0, op1, m;
>
>    op0 = operands[0];
>    op1 = operands[1];
>
> -  if (GET_MODE_SIZE (mode) == 64)
> +  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
> +  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
>      {
> -      switch (GET_MODE_CLASS (mode))
> -        {
> -        case MODE_VECTOR_INT:
> -        case MODE_INT:
> -          if (GET_MODE (op0) != V16SImode)
> -            {
> -              if (!MEM_P (op0))
> -                {
> -                  orig_op0 = op0;
> -                  op0 = gen_reg_rtx (V16SImode);
> -                }
> -              else
> -                op0 = gen_lowpart (V16SImode, op0);
> -            }
> -          op1 = gen_lowpart (V16SImode, op1);
> -          /* FALLTHRU */
> -
> -        case MODE_VECTOR_FLOAT:
> -
> -          emit_insn (gen_rtx_SET (op0, op1));
> -          if (orig_op0)
> -            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> -          break;
> -
> -        default:
> -          gcc_unreachable ();
> -        }
> -
> +      emit_insn (gen_rtx_SET (op0, op1));
>        return;
>      }
>
> -  if (TARGET_AVX
> -      && GET_MODE_SIZE (mode) == 32)
> +  if (TARGET_AVX)
>      {
> -      switch (GET_MODE_CLASS (mode))
> -        {
> -        case MODE_VECTOR_INT:
> -        case MODE_INT:
> -          if (GET_MODE (op0) != V32QImode)
> -            {
> -              if (!MEM_P (op0))
> -                {
> -                  orig_op0 = op0;
> -                  op0 = gen_reg_rtx (V32QImode);
> -                }
> -              else
> -                op0 = gen_lowpart (V32QImode, op0);
> -            }
> -          op1 = gen_lowpart (V32QImode, op1);
> -          /* FALLTHRU */
> -
> -        case MODE_VECTOR_FLOAT:
> -          ix86_avx256_split_vector_move_misalign (op0, op1);
> -          if (orig_op0)
> -            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> -          break;
> +      if (GET_MODE_SIZE (mode) == 32)
> +        ix86_avx256_split_vector_move_misalign (op0, op1);
> +      else
> +        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
> +        emit_insn (gen_rtx_SET (op0, op1));
> +      return;
> +    }
>
> -        default:
> -          gcc_unreachable ();
> -        }
> +  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> +      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +    {
> +      emit_insn (gen_rtx_SET (op0, op1));
> +      return;
> +    }
>
> +  /* ??? If we have typed data, then it would appear that using
> +     movdqu is the only way to get unaligned data loaded with
> +     integer type.  */
> +  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> +    {
> +      emit_insn (gen_rtx_SET (op0, op1));
>        return;
>      }
>
>    if (MEM_P (op1))
>      {
> -      /* Normal *mov<mode>_internal pattern will handle
> -         unaligned loads just fine if misaligned_operand
> -         is true, and without the UNSPEC it can be combined
> -         with arithmetic instructions.  */
> -      if (TARGET_AVX
> -          && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> -              || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
> -          && misaligned_operand (op1, GET_MODE (op1)))
> -        emit_insn (gen_rtx_SET (op0, op1));
> -      /* ??? If we have typed data, then it would appear that using
> -         movdqu is the only way to get unaligned data loaded with
> -         integer type.  */
> -      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> -        {
> -          if (GET_MODE (op0) != V16QImode)
> -            {
> -              orig_op0 = op0;
> -              op0 = gen_reg_rtx (V16QImode);
> -            }
> -          op1 = gen_lowpart (V16QImode, op1);
> -          /* We will eventually emit movups based on insn attributes.  */
> -          emit_insn (gen_rtx_SET (op0, op1));
> -          if (orig_op0)
> -            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> -        }
> -      else if (TARGET_SSE2 && mode == V2DFmode)
> +      if (TARGET_SSE2 && mode == V2DFmode)
>          {
>            rtx zero;
>
> -          if (TARGET_AVX
> -              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> -              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> -              || optimize_insn_for_size_p ())
> -            {
> -              /* We will eventually emit movups based on insn attributes.  */
> -              emit_insn (gen_rtx_SET (op0, op1));
> -              return;
> -            }
> -
>            /* When SSE registers are split into halves, we can avoid
>               writing to the top half twice.  */
>            if (TARGET_SSE_SPLIT_REGS)
> @@ -19066,24 +19021,6 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
>          {
>            rtx t;
>
> -          if (TARGET_AVX
> -              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> -              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> -              || optimize_insn_for_size_p ())
> -            {
> -              if (GET_MODE (op0) != V4SFmode)
> -                {
> -                  orig_op0 = op0;
> -                  op0 = gen_reg_rtx (V4SFmode);
> -                }
> -              op1 = gen_lowpart (V4SFmode, op1);
> -              emit_insn (gen_rtx_SET (op0, op1));
> -              if (orig_op0)
> -                emit_move_insn (orig_op0,
> -                                gen_lowpart (GET_MODE (orig_op0), op0));
> -              return;
> -            }
> -
>            if (mode != V4SFmode)
>              t = gen_reg_rtx (V4SFmode);
>            else
> @@ -19104,49 +19041,22 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
>      }
>    else if (MEM_P (op0))
>      {
> -      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> -        {
> -          op0 = gen_lowpart (V16QImode, op0);
> -          op1 = gen_lowpart (V16QImode, op1);
> -          /* We will eventually emit movups based on insn attributes.  */
> -          emit_insn (gen_rtx_SET (op0, op1));
> -        }
> -      else if (TARGET_SSE2 && mode == V2DFmode)
> -        {
> -          if (TARGET_AVX
> -              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> -              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> -              || optimize_insn_for_size_p ())
> -            /* We will eventually emit movups based on insn attributes.  */
> -            emit_insn (gen_rtx_SET (op0, op1));
> -          else
> -            {
> -              m = adjust_address (op0, DFmode, 0);
> -              emit_insn (gen_sse2_storelpd (m, op1));
> -              m = adjust_address (op0, DFmode, 8);
> -              emit_insn (gen_sse2_storehpd (m, op1));
> -            }
> +      if (TARGET_SSE2 && mode == V2DFmode)
> +        {
> +          m = adjust_address (op0, DFmode, 0);
> +          emit_insn (gen_sse2_storelpd (m, op1));
> +          m = adjust_address (op0, DFmode, 8);
> +          emit_insn (gen_sse2_storehpd (m, op1));
>          }
>        else
>          {
>            if (mode != V4SFmode)
>              op1 = gen_lowpart (V4SFmode, op1);
>
> -          if (TARGET_AVX
> -              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> -              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> -              || optimize_insn_for_size_p ())
> -            {
> -              op0 = gen_lowpart (V4SFmode, op0);
> -              emit_insn (gen_rtx_SET (op0, op1));
> -            }
> -          else
> -            {
> -              m = adjust_address (op0, V2SFmode, 0);
> -              emit_insn (gen_sse_storelps (m, op1));
> -              m = adjust_address (op0, V2SFmode, 8);
> -              emit_insn (gen_sse_storehps (m, op1));
> -            }
> +          m = adjust_address (op0, V2SFmode, 0);
> +          emit_insn (gen_sse_storelps (m, op1));
> +          m = adjust_address (op0, V2SFmode, 8);
> +          emit_insn (gen_sse_storehps (m, op1));
>          }
>      }
>    else
> --
> 2.5.5
>
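Since the change relies on the *mov<mode>_internal patterns doing the right
thing for unaligned operands, here is a minimal test-case sketch for anyone
who wants to eyeball the generated code.  It is illustrative only, not part
of the patch; the type and function names and the -mavx2 /
-mavx256-split-unaligned-load / -mavx256-split-unaligned-store options are
illustrative assumptions rather than anything taken from the submission.

/* Hypothetical example, not from the patch.  Compiled with -O2 -mavx2,
   the under-aligned vector accesses below should go through the
   movmisalign<mode> expander, i.e. ix86_expand_vector_move_misalign.
   Adding -mavx256-split-unaligned-load/-mavx256-split-unaligned-store
   should instead take the ix86_avx256_split_vector_move_misalign path,
   splitting each 32-byte access into two 16-byte halves.  */

/* In a typedef, the aligned attribute may lower alignment, giving an
   under-aligned 256-bit vector type.  */
typedef int v8si_u __attribute__ ((vector_size (32), aligned (1)));

void
add_misaligned (v8si_u *dst, const v8si_u *a, const v8si_u *b)
{
  /* Two misaligned 256-bit loads and one misaligned 256-bit store.  */
  *dst = *a + *b;
}

With default tuning I would expect plain vmovdqu loads and stores here; with
the split tuning the halves should show up as vinserti128/vextracti128 (or
the f128 variants for float modes), matching the VEC_CONCAT/extract path the
patch keeps in ix86_avx256_split_vector_move_misalign.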