On Wed, Sep 23, 2015 at 12:19 PM, Ilya Enkovich <enkovich....@gmail.com> wrote: > On 14 Sep 17:50, Uros Bizjak wrote: >> >> +(define_insn_and_split "*zext<mode>_doubleword" >> + [(set (match_operand:DI 0 "register_operand" "=r") >> + (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))] >> + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" >> + "#" >> + "&& reload_completed && GENERAL_REG_P (operands[0])" >> + [(set (match_dup 0) (zero_extend:SI (match_dup 1))) >> + (set (match_dup 2) (const_int 0))] >> + "split_double_mode (DImode, &operands[0], 1, &operands[0], >> &operands[2]);") >> + >> +(define_insn_and_split "*zextqi_doubleword" >> + [(set (match_operand:DI 0 "register_operand" "=r") >> + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))] >> + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" >> + "#" >> + "&& reload_completed && GENERAL_REG_P (operands[0])" >> + [(set (match_dup 0) (zero_extend:SI (match_dup 1))) >> + (set (match_dup 2) (const_int 0))] >> + "split_double_mode (DImode, &operands[0], 1, &operands[0], >> &operands[2]);") >> + >> >> Please put the above patterns together with other zero_extend >> patterns. You can also merge these two patterns using SWI124 mode >> iterator with <r> mode attribute as a register constraint. Also, no >> need to check for GENERAL_REG_P after reload, when "r" constraint is >> in effect: >> >> (define_insn_and_split "*zext<mode>_doubleword" >> [(set (match_operand:DI 0 "register_operand" "=r") >> (zero_extend:DI (match_operand:SWI124 1 "nonimmediate_operand" "<r>m")))] >> "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" >> "#" >> "&& reload_completed" >> [(set (match_dup 0) (zero_extend:SI (match_dup 1))) >> (set (match_dup 2) (const_int 0))] >> "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);") > > Register constraint doesn't affect split and I need GENERAL_REG_P to filter > other registers case.
OK. > I merged QI and HI cases of zext but made a separate pattern for SI case > because it doesn't need zero_extend in resulting code. Bootstrapped and > regtested for x86_64-unknown-linux-gnu. This change is OK. The patch LGTM, but please wait a couple of days if Jeff has some comment on the algorithmic aspect of the patch. Thanks, Uros. > > Thanks, > Ilya > -- > gcc/ > > 2015-09-23 Ilya Enkovich <enkovich....@gmail.com> > > * config/i386/i386.c: Include dbgcnt.h. > (has_non_address_hard_reg): New. > (convertible_comparison_p): New. > (scalar_to_vector_candidate_p): New. > (remove_non_convertible_regs): New. > (scalar_chain): New. > (scalar_chain::scalar_chain): New. > (scalar_chain::~scalar_chain): New. > (scalar_chain::add_to_queue): New. > (scalar_chain::mark_dual_mode_def): New. > (scalar_chain::analyze_register_chain): New. > (scalar_chain::add_insn): New. > (scalar_chain::build): New. > (scalar_chain::compute_convert_gain): New. > (scalar_chain::replace_with_subreg): New. > (scalar_chain::replace_with_subreg_in_insn): New. > (scalar_chain::emit_conversion_insns): New. > (scalar_chain::make_vector_copies): New. > (scalar_chain::convert_reg): New. > (scalar_chain::convert_op): New. > (scalar_chain::convert_insn): New. > (scalar_chain::convert): New. > (convert_scalars_to_vector): New. > (pass_data_stv): New. > (pass_stv): New. > (make_pass_stv): New. > (ix86_option_override): Created and register stv pass. > (flag_opts): Add -mstv. > (ix86_option_override_internal): Likewise. > * config/i386/i386.md (SWIM1248x): New. > (*movdi_internal): Add xmm to mem alternative for TARGET_STV. > (and<mode>3): Use SWIM1248x iterator instead of SWIM. > (*anddi3_doubleword): New. > (*zext<mode>_doubleword): New. > (*zextsi_doubleword): New. > (<code><mode>3): Use SWIM1248x iterator instead of SWIM. > (*<code>di3_doubleword): New. > * config/i386/i386.opt (mstv): New. > * dbgcnt.def (stv_conversion): New. 
> > gcc/testsuite/ > > 2015-09-23 Ilya Enkovich <enkovich....@gmail.com> > > * gcc.target/i386/pr65105-1.c: New. > * gcc.target/i386/pr65105-2.c: New. > * gcc.target/i386/pr65105-3.c: New. > * gcc.target/i386/pr65105-4.C: New. > * gcc.dg/lower-subreg-1.c: Add -mno-stv options for ia32. > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index d547cfd..2663f85 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -87,6 +87,7 @@ along with GCC; see the file COPYING3. If not see > #include "tree-iterator.h" > #include "tree-chkp.h" > #include "rtl-chkp.h" > +#include "dbgcnt.h" > > /* This file should be included last. */ > #include "target-def.h" > @@ -2600,6 +2601,908 @@ rest_of_handle_insert_vzeroupper (void) > return 0; > } > > +/* Return 1 if INSN uses or defines a hard register. > + Hard register uses in a memory address are ignored. > + Clobbers and flags definitions are ignored. */ > + > +static bool > +has_non_address_hard_reg (rtx_insn *insn) > +{ > + df_ref ref; > + FOR_EACH_INSN_DEF (ref, insn) > + if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) > + && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) > + && DF_REF_REGNO (ref) != FLAGS_REG) > + return true; > + > + FOR_EACH_INSN_USE (ref, insn) > + if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) > + return true; > + > + return false; > +} > + > +/* Check if comparison INSN may be transformed > + into vector comparison. 
Currently we transform > + zero checks only which look like: > + > + (set (reg:CCZ 17 flags) > + (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) > + (subreg:SI (reg:DI x) 0)) > + (const_int 0 [0]))) */ > + > +static bool > +convertible_comparison_p (rtx_insn *insn) > +{ > + if (!TARGET_SSE4_1) > + return false; > + > + rtx def_set = single_set (insn); > + > + gcc_assert (def_set); > + > + rtx src = SET_SRC (def_set); > + rtx dst = SET_DEST (def_set); > + > + gcc_assert (GET_CODE (src) == COMPARE); > + > + if (GET_CODE (dst) != REG > + || REGNO (dst) != FLAGS_REG > + || GET_MODE (dst) != CCZmode) > + return false; > + > + rtx op1 = XEXP (src, 0); > + rtx op2 = XEXP (src, 1); > + > + if (op2 != CONST0_RTX (GET_MODE (op2))) > + return false; > + > + if (GET_CODE (op1) != IOR) > + return false; > + > + op2 = XEXP (op1, 1); > + op1 = XEXP (op1, 0); > + > + if (!SUBREG_P (op1) > + || !SUBREG_P (op2) > + || GET_MODE (op1) != SImode > + || GET_MODE (op2) != SImode > + || ((SUBREG_BYTE (op1) != 0 > + || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) > + && (SUBREG_BYTE (op2) != 0 > + || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) > + return false; > + > + op1 = SUBREG_REG (op1); > + op2 = SUBREG_REG (op2); > + > + if (op1 != op2 > + || !REG_P (op1) > + || GET_MODE (op1) != DImode) > + return false; > + > + return true; > +} > + > +/* Return 1 if INSN may be converted into vector > + instruction. */ > + > +static bool > +scalar_to_vector_candidate_p (rtx_insn *insn) > +{ > + rtx def_set = single_set (insn); > + > + if (!def_set) > + return false; > + > + if (has_non_address_hard_reg (insn)) > + return false; > + > + rtx src = SET_SRC (def_set); > + rtx dst = SET_DEST (def_set); > + > + if (GET_CODE (src) == COMPARE) > + return convertible_comparison_p (insn); > + > + /* We are interested in DImode promotion only. 
*/ > + if (GET_MODE (src) != DImode > + || GET_MODE (dst) != DImode) > + return false; > + > + if (!REG_P (dst) && !MEM_P (dst)) > + return false; > + > + switch (GET_CODE (src)) > + { > + case PLUS: > + case MINUS: > + case IOR: > + case XOR: > + case AND: > + break; > + > + case REG: > + return true; > + > + case MEM: > + return REG_P (dst); > + > + default: > + return false; > + } > + > + if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0))) > + return false; > + > + if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1))) > + return false; > + > + if (GET_MODE (XEXP (src, 0)) != DImode > + || GET_MODE (XEXP (src, 1)) != DImode) > + return false; > + > + return true; > +} > + > +/* For a given bitmap of insn UIDs scans all instruction and > + remove insn from CANDIDATES in case it has both convertible > + and not convertible definitions. > + > + All insns in a bitmap are conversion candidates according to > + scalar_to_vector_candidate_p. Currently it implies all insns > + are single_set. 
*/ > + > +static void > +remove_non_convertible_regs (bitmap candidates) > +{ > + bitmap_iterator bi; > + unsigned id; > + bitmap regs = BITMAP_ALLOC (NULL); > + > + EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) > + { > + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); > + rtx reg = SET_DEST (def_set); > + > + if (!REG_P (reg) > + || bitmap_bit_p (regs, REGNO (reg)) > + || HARD_REGISTER_P (reg)) > + continue; > + > + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); > + def; > + def = DF_REF_NEXT_REG (def)) > + { > + if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) > + { > + if (dump_file) > + fprintf (dump_file, > + "r%d has non convertible definition in insn %d\n", > + REGNO (reg), DF_REF_INSN_UID (def)); > + > + bitmap_set_bit (regs, REGNO (reg)); > + break; > + } > + } > + } > + > + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) > + { > + for (df_ref def = DF_REG_DEF_CHAIN (id); > + def; > + def = DF_REF_NEXT_REG (def)) > + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) > + { > + if (dump_file) > + fprintf (dump_file, "Removing insn %d from candidates list\n", > + DF_REF_INSN_UID (def)); > + > + bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); > + } > + } > + > + BITMAP_FREE (regs); > +} > + > +class scalar_chain > +{ > + public: > + scalar_chain (); > + ~scalar_chain (); > + > + static unsigned max_id; > + > + /* ID of a chain. */ > + unsigned int chain_id; > + /* A queue of instructions to be included into a chain. */ > + bitmap queue; > + /* Instructions included into a chain. */ > + bitmap insns; > + /* All registers defined by a chain. */ > + bitmap defs; > + /* Registers used in both vector and sclar modes. 
*/ > + bitmap defs_conv; > + > + void build (bitmap candidates, unsigned insn_uid); > + int compute_convert_gain (); > + int convert (); > + > + private: > + void add_insn (bitmap candidates, unsigned insn_uid); > + void add_to_queue (unsigned insn_uid); > + void mark_dual_mode_def (df_ref def); > + void analyze_register_chain (bitmap candidates, df_ref ref); > + rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); > + void emit_conversion_insns (rtx insns, rtx_insn *pos); > + void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); > + void convert_insn (rtx_insn *insn); > + void convert_op (rtx *op, rtx_insn *insn); > + void convert_reg (unsigned regno); > + void make_vector_copies (unsigned regno); > +}; > + > +unsigned scalar_chain::max_id = 0; > + > +/* Initialize new chain. */ > + > +scalar_chain::scalar_chain () > +{ > + chain_id = ++max_id; > + > + if (dump_file) > + fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); > + > + bitmap_obstack_initialize (NULL); > + insns = BITMAP_ALLOC (NULL); > + defs = BITMAP_ALLOC (NULL); > + defs_conv = BITMAP_ALLOC (NULL); > + queue = NULL; > +} > + > +/* Free chain's data. */ > + > +scalar_chain::~scalar_chain () > +{ > + BITMAP_FREE (insns); > + BITMAP_FREE (defs); > + BITMAP_FREE (defs_conv); > + bitmap_obstack_release (NULL); > +} > + > +/* Add instruction into chains' queue. */ > + > +void > +scalar_chain::add_to_queue (unsigned insn_uid) > +{ > + if (bitmap_bit_p (insns, insn_uid) > + || bitmap_bit_p (queue, insn_uid)) > + return; > + > + if (dump_file) > + fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", > + insn_uid, chain_id); > + bitmap_set_bit (queue, insn_uid); > +} > + > +/* Mark register defined by DEF as requiring conversion. 
*/ > + > +void > +scalar_chain::mark_dual_mode_def (df_ref def) > +{ > + gcc_assert (DF_REF_REG_DEF_P (def)); > + > + if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) > + return; > + > + if (dump_file) > + fprintf (dump_file, > + " Mark r%d def in insn %d as requiring both modes in chain > #%d\n", > + DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); > + > + bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); > +} > + > +/* Check REF's chain to add new insns into a queue > + and find registers requiring conversion. */ > + > +void > +scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) > +{ > + df_link *chain; > + > + gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) > + || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); > + add_to_queue (DF_REF_INSN_UID (ref)); > + > + for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) > + { > + unsigned uid = DF_REF_INSN_UID (chain->ref); > + if (!DF_REF_REG_MEM_P (chain->ref)) > + { > + if (bitmap_bit_p (insns, uid)) > + continue; > + > + if (bitmap_bit_p (candidates, uid)) > + { > + add_to_queue (uid); > + continue; > + } > + } > + > + if (DF_REF_REG_DEF_P (chain->ref)) > + { > + if (dump_file) > + fprintf (dump_file, " r%d def in insn %d isn't convertible\n", > + DF_REF_REGNO (chain->ref), uid); > + mark_dual_mode_def (chain->ref); > + } > + else > + { > + if (dump_file) > + fprintf (dump_file, " r%d use in insn %d isn't convertible\n", > + DF_REF_REGNO (chain->ref), uid); > + mark_dual_mode_def (ref); > + } > + } > +} > + > +/* Add instruction into a chain. 
*/ > + > +void > +scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) > +{ > + if (bitmap_bit_p (insns, insn_uid)) > + return; > + > + if (dump_file) > + fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, > chain_id); > + > + bitmap_set_bit (insns, insn_uid); > + > + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; > + rtx def_set = single_set (insn); > + if (def_set && REG_P (SET_DEST (def_set)) > + && !HARD_REGISTER_P (SET_DEST (def_set))) > + bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); > + > + df_ref ref; > + df_ref def; > + for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) > + if (!HARD_REGISTER_P (DF_REF_REG (ref))) > + for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); > + def; > + def = DF_REF_NEXT_REG (def)) > + analyze_register_chain (candidates, def); > + for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) > + if (!DF_REF_REG_MEM_P (ref)) > + analyze_register_chain (candidates, ref); > +} > + > +/* Build new chain starting from insn INSN_UID recursively > + adding all dependent uses and definitions. 
*/ > + > +void > +scalar_chain::build (bitmap candidates, unsigned insn_uid) > +{ > + queue = BITMAP_ALLOC (NULL); > + bitmap_set_bit (queue, insn_uid); > + > + if (dump_file) > + fprintf (dump_file, "Building chain #%d...\n", chain_id); > + > + while (!bitmap_empty_p (queue)) > + { > + insn_uid = bitmap_first_set_bit (queue); > + bitmap_clear_bit (queue, insn_uid); > + bitmap_clear_bit (candidates, insn_uid); > + add_insn (candidates, insn_uid); > + } > + > + if (dump_file) > + { > + fprintf (dump_file, "Collected chain #%d...\n", chain_id); > + fprintf (dump_file, " insns: "); > + dump_bitmap (dump_file, insns); > + if (!bitmap_empty_p (defs_conv)) > + { > + bitmap_iterator bi; > + unsigned id; > + const char *comma = ""; > + fprintf (dump_file, " defs to convert: "); > + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) > + { > + fprintf (dump_file, "%sr%d", comma, id); > + comma = ", "; > + } > + fprintf (dump_file, "\n"); > + } > + } > + > + BITMAP_FREE (queue); > +} > + > +/* Compute a gain for chain conversion. 
*/ > + > +int > +scalar_chain::compute_convert_gain () > +{ > + bitmap_iterator bi; > + unsigned insn_uid; > + int gain = 0; > + int cost = 0; > + > + if (dump_file) > + fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); > + > + EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) > + { > + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; > + rtx def_set = single_set (insn); > + rtx src = SET_SRC (def_set); > + rtx dst = SET_DEST (def_set); > + > + if (REG_P (src) && REG_P (dst)) > + gain += COSTS_N_INSNS (2) - ix86_cost->sse_move; > + else if (REG_P (src) && MEM_P (dst)) > + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; > + else if (MEM_P (src) && REG_P (dst)) > + gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; > + else if (GET_CODE (src) == PLUS > + || GET_CODE (src) == MINUS > + || GET_CODE (src) == IOR > + || GET_CODE (src) == XOR > + || GET_CODE (src) == AND) > + gain += ix86_cost->add; > + else if (GET_CODE (src) == COMPARE) > + { > + /* Assume comparison cost is the same. */ > + } > + else > + gcc_unreachable (); > + } > + > + if (dump_file) > + fprintf (dump_file, " Instruction convertion gain: %d\n", gain); > + > + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) > + cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; > + > + if (dump_file) > + fprintf (dump_file, " Registers convertion cost: %d\n", cost); > + > + gain -= cost; > + > + if (dump_file) > + fprintf (dump_file, " Total gain: %d\n", gain); > + > + return gain; > +} > + > +/* Replace REG in X with a V2DI subreg of NEW_REG. 
*/ > + > +rtx > +scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) > +{ > + if (x == reg) > + return gen_rtx_SUBREG (V2DImode, new_reg, 0); > + > + const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); > + int i, j; > + for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) > + { > + if (fmt[i] == 'e') > + XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); > + else if (fmt[i] == 'E') > + for (j = XVECLEN (x, i) - 1; j >= 0; j--) > + XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), > + reg, new_reg); > + } > + > + return x; > +} > + > +/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ > + > +void > +scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx > new_reg) > +{ > + replace_with_subreg (single_set (insn), reg, new_reg); > +} > + > +/* Insert generated conversion instruction sequence INSNS > + after instruction AFTER. New BB may be required in case > + instruction has EH region attached. */ > + > +void > +scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) > +{ > + if (!control_flow_insn_p (after)) > + { > + emit_insn_after (insns, after); > + return; > + } > + > + basic_block bb = BLOCK_FOR_INSN (after); > + edge e = find_fallthru_edge (bb->succs); > + gcc_assert (e); > + > + basic_block new_bb = split_edge (e); > + emit_insn_after (insns, BB_HEAD (new_bb)); > +} > + > +/* Make vector copies for all register REGNO definitions > + and replace its uses in a chain. 
*/ > + > +void > +scalar_chain::make_vector_copies (unsigned regno) > +{ > + rtx reg = regno_reg_rtx[regno]; > + rtx vreg = gen_reg_rtx (DImode); > + df_ref ref; > + > + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) > + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) > + { > + rtx_insn *insn = DF_REF_INSN (ref); > + > + start_sequence (); > + if (TARGET_SSE4_1) > + { > + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), > + CONST0_RTX (V4SImode), > + gen_rtx_SUBREG (SImode, reg, 0))); > + emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), > + gen_rtx_SUBREG (V4SImode, vreg, 0), > + gen_rtx_SUBREG (SImode, reg, 4), > + GEN_INT (2))); > + } > + else if (TARGET_INTER_UNIT_MOVES_TO_VEC) > + { > + rtx tmp = gen_reg_rtx (DImode); > + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), > + CONST0_RTX (V4SImode), > + gen_rtx_SUBREG (SImode, reg, 0))); > + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), > + CONST0_RTX (V4SImode), > + gen_rtx_SUBREG (SImode, reg, 4))); > + emit_insn (gen_vec_interleave_lowv4si > + (gen_rtx_SUBREG (V4SImode, vreg, 0), > + gen_rtx_SUBREG (V4SImode, vreg, 0), > + gen_rtx_SUBREG (V4SImode, tmp, 0))); > + } > + else > + { > + rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP); > + emit_move_insn (adjust_address (tmp, SImode, 0), > + gen_rtx_SUBREG (SImode, reg, 0)); > + emit_move_insn (adjust_address (tmp, SImode, 4), > + gen_rtx_SUBREG (SImode, reg, 4)); > + emit_move_insn (vreg, tmp); > + } > + emit_conversion_insns (get_insns (), insn); > + end_sequence (); > + > + if (dump_file) > + fprintf (dump_file, > + " Copied r%d to a vector register r%d for insn %d\n", > + regno, REGNO (vreg), DF_REF_INSN_UID (ref)); > + } > + > + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) > + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) > + { > + replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg); > + > + if (dump_file) > + fprintf (dump_file, " 
Replaced r%d with r%d in insn %d\n", > + regno, REGNO (vreg), DF_REF_INSN_UID (ref)); > + } > +} > + > +/* Convert all definitions of register REGNO > + and fix its uses. Scalar copies may be created > + in case register is used in not convertible insn. */ > + > +void > +scalar_chain::convert_reg (unsigned regno) > +{ > + bool scalar_copy = bitmap_bit_p (defs_conv, regno); > + rtx reg = regno_reg_rtx[regno]; > + rtx scopy = NULL_RTX; > + df_ref ref; > + bitmap conv; > + > + conv = BITMAP_ALLOC (NULL); > + bitmap_copy (conv, insns); > + > + if (scalar_copy) > + scopy = gen_reg_rtx (DImode); > + > + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) > + { > + rtx_insn *insn = DF_REF_INSN (ref); > + rtx def_set = single_set (insn); > + rtx src = SET_SRC (def_set); > + rtx reg = DF_REF_REG (ref); > + > + if (!MEM_P (src)) > + { > + replace_with_subreg_in_insn (insn, reg, reg); > + bitmap_clear_bit (conv, INSN_UID (insn)); > + } > + > + if (scalar_copy) > + { > + rtx vcopy = gen_reg_rtx (V2DImode); > + > + start_sequence (); > + if (TARGET_INTER_UNIT_MOVES_FROM_VEC) > + { > + emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); > + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), > + gen_rtx_SUBREG (SImode, vcopy, 0)); > + emit_move_insn (vcopy, > + gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT > (32))); > + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), > + gen_rtx_SUBREG (SImode, vcopy, 0)); > + } > + else > + { > + rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP); > + emit_move_insn (tmp, reg); > + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), > + adjust_address (tmp, SImode, 0)); > + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), > + adjust_address (tmp, SImode, 4)); > + } > + emit_conversion_insns (get_insns (), insn); > + end_sequence (); > + > + if (dump_file) > + fprintf (dump_file, > + " Copied r%d to a scalar register r%d for insn %d\n", > + regno, REGNO (scopy), INSN_UID (insn)); > + } > + } > + > + for (ref = 
DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) > + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) > + { > + if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) > + { > + rtx def_set = single_set (DF_REF_INSN (ref)); > + if (!MEM_P (SET_DEST (def_set)) > + || !REG_P (SET_SRC (def_set))) > + replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg); > + bitmap_clear_bit (conv, DF_REF_INSN_UID (ref)); > + } > + } > + else > + { > + replace_rtx (DF_REF_INSN (ref), reg, scopy); > + df_insn_rescan (DF_REF_INSN (ref)); > + } > + > + BITMAP_FREE (conv); > +} > + > +/* Convert operand OP in INSN. All register uses > + are converted during registers conversion. > + Therefore we should just handle memory operands. */ > + > +void > +scalar_chain::convert_op (rtx *op, rtx_insn *insn) > +{ > + *op = copy_rtx_if_shared (*op); > + > + if (MEM_P (*op)) > + { > + rtx tmp = gen_reg_rtx (DImode); > + > + emit_insn_before (gen_move_insn (tmp, *op), insn); > + *op = gen_rtx_SUBREG (V2DImode, tmp, 0); > + > + if (dump_file) > + fprintf (dump_file, " Preloading operand for insn %d into r%d\n", > + INSN_UID (insn), REGNO (tmp)); > + } > + else > + { > + gcc_assert (SUBREG_P (*op)); > + gcc_assert (GET_MODE (*op) == V2DImode); > + } > +} > + > +/* Convert INSN to vector mode. */ > + > +void > +scalar_chain::convert_insn (rtx_insn *insn) > +{ > + rtx def_set = single_set (insn); > + rtx src = SET_SRC (def_set); > + rtx dst = SET_DEST (def_set); > + rtx subreg; > + > + if (MEM_P (dst) && !REG_P (src)) > + { > + /* There are no scalar integer instructions and therefore > + temporary register usage is required. 
*/ > + rtx tmp = gen_reg_rtx (DImode); > + emit_conversion_insns (gen_move_insn (dst, tmp), insn); > + dst = gen_rtx_SUBREG (V2DImode, tmp, 0); > + } > + > + switch (GET_CODE (src)) > + { > + case PLUS: > + case MINUS: > + case IOR: > + case XOR: > + case AND: > + convert_op (&XEXP (src, 0), insn); > + convert_op (&XEXP (src, 1), insn); > + PUT_MODE (src, V2DImode); > + break; > + > + case MEM: > + if (!REG_P (dst)) > + convert_op (&src, insn); > + break; > + > + case REG: > + break; > + > + case SUBREG: > + gcc_assert (GET_MODE (src) == V2DImode); > + break; > + > + case COMPARE: > + src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); > + > + gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) > + || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); > + > + if (REG_P (src)) > + subreg = gen_rtx_SUBREG (V2DImode, src, 0); > + else > + subreg = copy_rtx_if_shared (src); > + emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared > (subreg), > + copy_rtx_if_shared > (subreg), > + copy_rtx_if_shared > (subreg)), > + insn); > + dst = gen_rtx_REG (CCmode, FLAGS_REG); > + src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), > + copy_rtx_if_shared (src)), > + UNSPEC_PTEST); > + break; > + > + default: > + gcc_unreachable (); > + } > + > + SET_SRC (def_set) = src; > + SET_DEST (def_set) = dst; > + > + /* Drop possible dead definitions. */ > + PATTERN (insn) = def_set; > + > + INSN_CODE (insn) = -1; > + recog_memoized (insn); > + df_insn_rescan (insn); > +} > + > +/* Convert whole chain creating required register > + conversions and copies. 
*/ > + > +int > +scalar_chain::convert () > +{ > + bitmap_iterator bi; > + unsigned id; > + int converted_insns = 0; > + > + if (!dbg_cnt (stv_conversion)) > + return 0; > + > + if (dump_file) > + fprintf (dump_file, "Converting chain #%d...\n", chain_id); > + > + EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) > + convert_reg (id); > + > + EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) > + make_vector_copies (id); > + > + EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) > + { > + convert_insn (DF_INSN_UID_GET (id)->insn); > + converted_insns++; > + } > + > + return converted_insns; > +} > + > +/* Main STV pass function. Find and convert scalar > + instructions into vector mode when profitable. */ > + > +static unsigned int > +convert_scalars_to_vector () > +{ > + basic_block bb; > + bitmap candidates; > + int converted_insns = 0; > + > + bitmap_obstack_initialize (NULL); > + candidates = BITMAP_ALLOC (NULL); > + > + calculate_dominance_info (CDI_DOMINATORS); > + df_set_flags (DF_DEFER_INSN_RESCAN); > + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); > + df_md_add_problem (); > + df_analyze (); > + > + /* Find all instructions we want to convert into vector mode. */ > + if (dump_file) > + fprintf (dump_file, "Searching for mode convertion candidates...\n"); > + > + FOR_EACH_BB_FN (bb, cfun) > + { > + rtx_insn *insn; > + FOR_BB_INSNS (bb, insn) > + if (scalar_to_vector_candidate_p (insn)) > + { > + if (dump_file) > + fprintf (dump_file, " insn %d is marked as a candidate\n", > + INSN_UID (insn)); > + > + bitmap_set_bit (candidates, INSN_UID (insn)); > + } > + } > + > + remove_non_convertible_regs (candidates); > + > + if (bitmap_empty_p (candidates)) > + if (dump_file) > + fprintf (dump_file, "There are no candidates for optimization.\n"); > + > + while (!bitmap_empty_p (candidates)) > + { > + unsigned uid = bitmap_first_set_bit (candidates); > + scalar_chain chain; > + > + /* Find instructions chain we want to convert to vector mode. 
> + Check all uses and definitions to estimate all required > + conversions. */ > + chain.build (candidates, uid); > + > + if (chain.compute_convert_gain () > 0) > + converted_insns += chain.convert (); > + else > + if (dump_file) > + fprintf (dump_file, "Chain #%d conversion is not profitable\n", > + chain.chain_id); > + } > + > + if (dump_file) > + fprintf (dump_file, "Total insns converted: %d\n", converted_insns); > + > + BITMAP_FREE (candidates); > + bitmap_obstack_release (NULL); > + df_process_deferred_rescans (); > + > + /* Conversion means we may have 128bit register spills/fills > + which require aligned stack. */ > + if (converted_insns) > + { > + if (crtl->stack_alignment_needed < 128) > + crtl->stack_alignment_needed = 128; > + if (crtl->stack_alignment_estimated < 128) > + crtl->stack_alignment_estimated = 128; > + } > + > + return 0; > +} > + > namespace { > > const pass_data pass_data_insert_vzeroupper = > @@ -2637,6 +3540,39 @@ public: > > }; // class pass_insert_vzeroupper > > +const pass_data pass_data_stv = > +{ > + RTL_PASS, /* type */ > + "stv", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_NONE, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + TODO_df_finish, /* todo_flags_finish */ > +}; > + > +class pass_stv : public rtl_opt_pass > +{ > +public: > + pass_stv (gcc::context *ctxt) > + : rtl_opt_pass (pass_data_stv, ctxt) > + {} > + > + /* opt_pass methods: */ > + virtual bool gate (function *) > + { > + return !TARGET_64BIT && TARGET_STV && TARGET_SSE2 && optimize > 1; > + } > + > + virtual unsigned int execute (function *) > + { > + return convert_scalars_to_vector (); > + } > + > +}; // class pass_stv > + > } // anon namespace > > rtl_opt_pass * > @@ -2645,6 +3581,12 @@ make_pass_insert_vzeroupper (gcc::context *ctxt) > return new pass_insert_vzeroupper (ctxt); > } > > +rtl_opt_pass * > +make_pass_stv (gcc::context *ctxt) > +{ > + 
return new pass_stv (ctxt); > +} > + > /* Return true if a red-zone is in use. */ > > static inline bool > @@ -2754,6 +3696,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const > char *arch, > { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, > { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, > { "-mvzeroupper", MASK_VZEROUPPER }, > + { "-mstv", MASK_STV}, > { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD}, > { "-mavx256-split-unaligned-store", > MASK_AVX256_SPLIT_UNALIGNED_STORE}, > { "-mprefer-avx128", MASK_PREFER_AVX128}, > @@ -4366,6 +5309,8 @@ ix86_option_override_internal (bool main_args_p, > > if (!(opts_set->x_target_flags & MASK_VZEROUPPER)) > opts->x_target_flags |= MASK_VZEROUPPER; > + if (!(opts_set->x_target_flags & MASK_STV)) > + opts->x_target_flags |= MASK_STV; > if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] > && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) > opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; > @@ -4479,12 +5424,18 @@ ix86_option_override (void) > = { pass_insert_vzeroupper, "reload", > 1, PASS_POS_INSERT_AFTER > }; > + opt_pass *pass_stv = make_pass_stv (g); > + struct register_pass_info stv_info > + = { pass_stv, "combine", > + 1, PASS_POS_INSERT_AFTER > + }; > > ix86_option_override_internal (true, &global_options, &global_options_set); > > > /* This needs to be done at start up. It's convenient to do it here. */ > register_pass (&insert_vzeroupper_info); > + register_pass (&stv_info); > } > > /* Implement the TARGET_OFFLOAD_OPTIONS hook. */ > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 7808705..89b74c9 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -978,6 +978,11 @@ > (HI "TARGET_HIMODE_MATH") > SI]) > > +;; Math-dependant integer modes with DImode. 
> +(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH") > + (HI "TARGET_HIMODE_MATH") > + SI (DI "(TARGET_STV && TARGET_SSE2) || > TARGET_64BIT")]) > + > ;; Math-dependant single word integer modes without QImode. > (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH") > SI (DI "TARGET_64BIT")]) > @@ -2094,9 +2099,9 @@ > > (define_insn "*movdi_internal" > [(set (match_operand:DI 0 "nonimmediate_operand" > - "=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,?r > ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m") > + "=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,m,?r > ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m") > (match_operand:DI 1 "general_operand" > - "riFo,riF,Z,rem,i,re,C ,*y,m ,*y,*Yn,r ,C ,*v,m ,*v,*Yj,*v,r ,*Yj > ,*Yn ,*r ,*km,*k,*k"))] > + "riFo,riF,Z,rem,i,re,C ,*y,m ,*y,*Yn,r ,C ,*v,m ,*v,v,*Yj,*v,r ,*Yj > ,*Yn ,*r ,*km,*k,*k"))] > "!(MEM_P (operands[0]) && MEM_P (operands[1]))" > { > switch (get_attr_type (insn)) > @@ -2174,9 +2179,9 @@ > [(set (attr "isa") > (cond [(eq_attr "alternative" "0,1") > (const_string "nox64") > - (eq_attr "alternative" "2,3,4,5,10,11,16,18,21,23") > + (eq_attr "alternative" "2,3,4,5,10,11,17,19,22,24") > (const_string "x64") > - (eq_attr "alternative" "17") > + (eq_attr "alternative" "18") > (const_string "x64_sse4") > ] > (const_string "*"))) > @@ -2187,13 +2192,13 @@ > (const_string "mmx") > (eq_attr "alternative" "7,8,9,10,11") > (const_string "mmxmov") > - (eq_attr "alternative" "12,17") > + (eq_attr "alternative" "12,18") > (const_string "sselog1") > - (eq_attr "alternative" "13,14,15,16,18") > + (eq_attr "alternative" "13,14,15,16,17,19") > (const_string "ssemov") > - (eq_attr "alternative" "19,20") > + (eq_attr "alternative" "20,21") > (const_string "ssecvt") > - (eq_attr "alternative" "21,22,23,24") > + (eq_attr "alternative" "22,23,24,25") > (const_string "mskmov") > (and (match_operand 0 "register_operand") > (match_operand 1 "pic_32bit_operand")) > @@ -2208,16 +2213,16 @@ > (set (attr "length_immediate") > (cond [(and 
(eq_attr "alternative" "4") (eq_attr "type" "imov")) > (const_string "8") > - (eq_attr "alternative" "17") > + (eq_attr "alternative" "18") > (const_string "1") > ] > (const_string "*"))) > (set (attr "prefix_rex") > - (if_then_else (eq_attr "alternative" "10,11,16,17,18") > + (if_then_else (eq_attr "alternative" "10,11,17,18,19") > (const_string "1") > (const_string "*"))) > (set (attr "prefix_extra") > - (if_then_else (eq_attr "alternative" "17") > + (if_then_else (eq_attr "alternative" "18") > (const_string "1") > (const_string "*"))) > (set (attr "prefix") > @@ -2245,13 +2250,26 @@ > ] > (const_string "TI")) > > - (and (eq_attr "alternative" "14,15") > + (and (eq_attr "alternative" "14,15,16") > (not (match_test "TARGET_SSE2"))) > (const_string "V2SF") > - (eq_attr "alternative" "17") > + (eq_attr "alternative" "18") > (const_string "TI") > ] > - (const_string "DI")))]) > + (const_string "DI"))) > + (set (attr "enabled") > + (cond [(eq_attr "alternative" "15") > + (if_then_else > + (match_test "TARGET_STV && TARGET_SSE2") > + (symbol_ref "false") > + (const_string "*")) > + (eq_attr "alternative" "16") > + (if_then_else > + (match_test "TARGET_STV && TARGET_SSE2") > + (symbol_ref "true") > + (symbol_ref "false")) > + ] > + (const_string "*")))]) > > (define_split > [(set (match_operand:DI 0 "nonimmediate_operand") > @@ -3811,6 +3829,26 @@ > "movz{bl|x}\t{%1, %k0|%k0, %1}" > [(set_attr "type" "imovx") > (set_attr "mode" "SI")]) > + > +(define_insn_and_split "*zext<mode>_doubleword" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (zero_extend:DI (match_operand:SWI12 1 "nonimmediate_operand" > "<r>m")))] > + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" > + "#" > + "&& reload_completed && GENERAL_REG_P (operands[0])" > + [(set (match_dup 0) (zero_extend:SI (match_dup 1))) > + (set (match_dup 2) (const_int 0))] > + "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);") > + > +(define_insn_and_split "*zextsi_doubleword" > + [(set 
(match_operand:DI 0 "register_operand" "=r") > + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "rm")))] > + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" > + "#" > + "&& reload_completed && GENERAL_REG_P (operands[0])" > + [(set (match_dup 0) (match_dup 1)) > + (set (match_dup 2) (const_int 0))] > + "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);") > > ;; Sign extension instructions > > @@ -7860,9 +7898,9 @@ > ;; it should be done with splitters. > > (define_expand "and<mode>3" > - [(set (match_operand:SWIM 0 "nonimmediate_operand") > - (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand") > - (match_operand:SWIM 2 "<general_szext_operand>")))] > + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") > + (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") > + (match_operand:SWIM1248x 2 "<general_szext_operand>")))] > "" > { > machine_mode mode = <MODE>mode; > @@ -7940,6 +7978,23 @@ > (const_string "*"))) > (set_attr "mode" "SI,DI,DI,SI,DI")]) > > +(define_insn_and_split "*anddi3_doubleword" > + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r") > + (and:DI > + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0") > + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm"))) > + (clobber (reg:CC FLAGS_REG))] > + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok > (AND, DImode, operands)" > + "#" > + "&& reload_completed" > + [(parallel [(set (match_dup 0) > + (and:SI (match_dup 1) (match_dup 2))) > + (clobber (reg:CC FLAGS_REG))]) > + (parallel [(set (match_dup 3) > + (and:SI (match_dup 4) (match_dup 5))) > + (clobber (reg:CC FLAGS_REG))])] > + "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);") > + > (define_insn "*andsi_1" > [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k") > (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k") > @@ -8427,9 +8482,9 @@ > ;; If this is considered useful, it should be done with splitters. 
> > (define_expand "<code><mode>3" > - [(set (match_operand:SWIM 0 "nonimmediate_operand") > - (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand") > - (match_operand:SWIM 2 "<general_operand>")))] > + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") > + (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand") > + (match_operand:SWIM1248x 2 > "<general_operand>")))] > "" > "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;") > > @@ -8447,6 +8502,23 @@ > [(set_attr "type" "alu,alu,msklog") > (set_attr "mode" "<MODE>")]) > > +(define_insn_and_split "*<code>di3_doubleword" > + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r") > + (any_or:DI > + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0") > + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm"))) > + (clobber (reg:CC FLAGS_REG))] > + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok > (<CODE>, DImode, operands)" > + "#" > + "&& reload_completed" > + [(parallel [(set (match_dup 0) > + (any_or:SI (match_dup 1) (match_dup 2))) > + (clobber (reg:CC FLAGS_REG))]) > + (parallel [(set (match_dup 3) > + (any_or:SI (match_dup 4) (match_dup 5))) > + (clobber (reg:CC FLAGS_REG))])] > + "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);") > + > (define_insn "*<code>hi_1" > [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k") > (any_or:HI > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt > index 042f3c1..dae5c5d 100644 > --- a/gcc/config/i386/i386.opt > +++ b/gcc/config/i386/i386.opt > @@ -567,6 +567,11 @@ Target Report Mask(VZEROUPPER) Save > Generate vzeroupper instruction before a transfer of control flow out of > the function. > > +mstv > +Target Report Mask(STV) Save > +Disable Scalar to Vector optimization pass transforming 64-bit integer > +computations into a vector ones. 
> + > mdispatch-scheduler > Target RejectNegative Var(flag_dispatch_scheduler) > Do dispatch scheduling if processor is bdver1 or bdver2 or bdver3 or bdver4 > and Haifa scheduling > diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def > index 95f6b06..583b16b 100644 > --- a/gcc/dbgcnt.def > +++ b/gcc/dbgcnt.def > @@ -186,6 +186,7 @@ DEBUG_COUNTER (sel_sched_region_cnt) > DEBUG_COUNTER (sms_sched_loop) > DEBUG_COUNTER (split_for_sched2) > DEBUG_COUNTER (store_motion) > +DEBUG_COUNTER (stv_conversion) > DEBUG_COUNTER (tail_call) > DEBUG_COUNTER (treepre_insert) > DEBUG_COUNTER (tree_sra) > diff --git a/gcc/testsuite/gcc.dg/lower-subreg-1.c > b/gcc/testsuite/gcc.dg/lower-subreg-1.c > index 6362d37..47057fe 100644 > --- a/gcc/testsuite/gcc.dg/lower-subreg-1.c > +++ b/gcc/testsuite/gcc.dg/lower-subreg-1.c > @@ -1,5 +1,6 @@ > /* { dg-do compile { target { ! { mips64 || { aarch64*-*-* arm*-*-* ia64-*-* > sparc*-*-* spu-*-* tilegx-*-* } } } } } */ > /* { dg-options "-O -fdump-rtl-subreg1" } */ > +/* { dg-additional-options "-mno-stv" { target ia32 } } */ > /* { dg-skip-if "" { { i?86-*-* x86_64-*-* } && x32 } { "*" } { "" } } */ > /* { dg-require-effective-target ilp32 } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr65105-1.c > b/gcc/testsuite/gcc.target/i386/pr65105-1.c > new file mode 100644 > index 0000000..bac6c07 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr65105-1.c > @@ -0,0 +1,50 @@ > +/* PR target/pr65105 */ > +/* { dg-do run { target { ia32 } } } */ > +/* { dg-options "-O2 -march=slm" } */ > +/* { dg-final { scan-assembler "por" } } */ > +/* { dg-final { scan-assembler "pand" } } */ > + > +#include "stdlib.h" > + > +static int count = 0; > + > +void __attribute__((noinline)) > +counter (long long l) > +{ > + count++; > + if (!l || count > 5) > + exit (1); > +} > + > +void __attribute__((noinline)) > +test (long long *arr) > +{ > + register unsigned long long tmp; > + > + tmp = arr[0] | arr[1] & arr[2]; > + while (tmp) > + { > + counter (tmp); > + tmp = 
*(arr++) & tmp; > + } > +} > + > +void __attribute__((noinline)) > +fill_data (long long *arr) > +{ > + arr[0] = 0x00ffffffL; > + arr[1] = 0xffffff00L; > + arr[2] = 0x00ffffffL; > + arr[3] = 0x0000ff00L; > + arr[4] = 0x00ff0000L; > + arr[5] = 0xff000000L; > +} > + > +int > +main (int argc, const char **argv) > +{ > + long long arr[6]; > + fill_data (arr); > + test (arr); > + return count - 5; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr65105-2.c > b/gcc/testsuite/gcc.target/i386/pr65105-2.c > new file mode 100644 > index 0000000..9216894 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr65105-2.c > @@ -0,0 +1,12 @@ > +/* PR target/pr65105 */ > +/* { dg-do compile { target { ia32 } } } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { scan-assembler "por" } } */ > + > +long long i1, i2, res; > + > +void > +test () > +{ > + res = i1 | i2; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr65105-3.c > b/gcc/testsuite/gcc.target/i386/pr65105-3.c > new file mode 100644 > index 0000000..b83989f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr65105-3.c > @@ -0,0 +1,16 @@ > +/* PR target/pr65105 */ > +/* { dg-do compile { target { ia32 } } } */ > +/* { dg-options "-O2 -march=slm -msse4.2" } */ > +/* { dg-final { scan-assembler "pand" } } */ > +/* { dg-final { scan-assembler "por" } } */ > +/* { dg-final { scan-assembler "ptest" } } */ > + > +long long i1, i2, i3, res; > + > +void > +test () > +{ > + res = i1 | i2; > + if (res) > + res &= i3; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr65105-4.C > b/gcc/testsuite/gcc.target/i386/pr65105-4.C > new file mode 100644 > index 0000000..9acf368 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr65105-4.C > @@ -0,0 +1,19 @@ > +/* PR target/pr65105 */ > +/* { dg-do run { target { ia32 } } } */ > +/* { dg-options "-O2 -march=slm" } */ > + > +struct s { > + long long l1, l2, l3, l4, l5; > +} *a; > +long long b; > +long long fn1() > +{ > + try > + { > + b = (a->l1 | a->l2 | a->l3 | a->l4 | a->l5); 
> + return a->l1; > + } > + catch (int) > + { > + } > +}