Hi! This patch adds pattern recognition for the i?86/x86_64 lock; bt[src] operations (see the attached testcases for examples of what it can handle). It is too late to do this during or after RTL expansion, so it is done late in GIMPLE: the fold-builtins pass recognizes these sequences and turns them into internal calls that represent atomically setting, complementing or resetting a bit while remembering the previous value of that bit.
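For reference, here is a minimal standalone illustration of the source-level patterns the recognition handles, distilled from the attached gcc.target/i386/pr49244-1.c testcase (the function names below are purely illustrative):

/* Distilled from the attached testcases; names are illustrative only.
   With the patch, each function below is expected to compile on
   x86_64 to a single lock bts/btc/btr followed by setc, instead of a
   lock or/xor/and plus a separate shift and test of the fetched value.  */

int
set_bit_and_test (unsigned int *word, int bit)
{
  unsigned int mask = 1u << bit;
  /* Recognized as ATOMIC_BIT_TEST_AND_SET, i.e. lock bts.  */
  return (__atomic_fetch_or (word, mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

int
flip_bit_and_test (unsigned int *word, int bit)
{
  unsigned int mask = 1u << bit;
  /* Recognized as ATOMIC_BIT_TEST_AND_COMPLEMENT, i.e. lock btc.  */
  return (__atomic_fetch_xor (word, mask, __ATOMIC_RELAXED) & mask) != 0;
}

int
clear_bit_and_test (unsigned int *word, int bit)
{
  unsigned int mask = 1u << bit;
  /* Recognized as ATOMIC_BIT_TEST_AND_RESET, i.e. lock btr.  */
  return (__atomic_fetch_and (word, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

The same recognition applies to the corresponding __sync_* builtins and to constant power-of-two masks.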
The patch doesn't handle (yet) the weirdo handling of memory operands where the counter can be actually not just in between 0 and bitsize - 1 of the particular mode, but can be much larger and the CPU locates the right memory word first, but could be extended to handle that later. I'd like to find out if there are other targets that have similar instructions in their ISAs, or if x86_64/i686 is the only one. Bootstrapped/regtested on x86_64-linux and i686-linux (relies on the gimple.c patch I've just posted, otherwise the expected number of scan-assembler-times would need to be tweaked for the short int cases). Ok for trunk? 2016-05-02 Jakub Jelinek <ja...@redhat.com> PR target/49244 * tree-ssa-ccp.c: Include stor-layout.h and optabs-query.h. (optimize_atomic_bit_test_and): New function. (pass_fold_builtins::execute): Use it. * optabs.def (atomic_bit_test_and_set_optab, atomic_bit_test_and_complement_optab, atomic_bit_test_and_reset_optab): New optabs. * internal-fn.def (ATOMIC_BIT_TEST_AND_SET, ATOMIC_BIT_TEST_AND_COMPLEMENT, ATOMIC_BIT_TEST_AND_RESET): New ifns. * builtins.h (expand_ifn_atomic_bit_test_and): New prototype. * builtins.c (expand_ifn_atomic_bit_test_and): New function. * internal-fn.c (expand_ATOMIC_BIT_TEST_AND_SET, expand_ATOMIC_BIT_TEST_AND_COMPLEMENT, expand_ATOMIC_BIT_TEST_AND_RESET): New functions. * doc/md.texi (atomic_bit_test_and_set@var{mode}, atomic_bit_test_and_complement@var{mode}, atomic_bit_test_and_reset@var{mode}): Document. * config/i386/sync.md (atomic_bit_test_and_set<mode>, atomic_bit_test_and_complement<mode>, atomic_bit_test_and_reset<mode>): New expanders. (atomic_bit_test_and_set<mode>_1, atomic_bit_test_and_complement<mode>_1, atomic_bit_test_and_reset<mode>_1): New insns. * gcc.target/i386/pr49244-1.c: New test. * gcc.target/i386/pr49244-2.c: New test. --- gcc/tree-ssa-ccp.c.jj 2016-05-01 12:21:05.063587549 +0200 +++ gcc/tree-ssa-ccp.c 2016-05-02 13:01:36.367044729 +0200 @@ -140,6 +140,8 @@ along with GCC; see the file COPYING3. #include "builtins.h" #include "tree-chkp.h" #include "cfgloop.h" +#include "stor-layout.h" +#include "optabs-query.h" /* Possible lattice values. */ @@ -2697,6 +2699,224 @@ optimize_unreachable (gimple_stmt_iterat return ret; } +/* Optimize + mask_2 = 1 << cnt_1; + _4 = __atomic_fetch_or_* (ptr_6, mask_2, _3); + _5 = _4 & mask_2; + to + _4 = ATOMIC_BIT_TEST_AND_SET (ptr_6, cnt_1, 0, _3); + _5 = _4; + If _5 is only used in _5 != 0 or _5 == 0 comparisons, 1 + is passed instead of 0, and the builtin just returns a zero + or 1 value instead of the actual bit. + Similarly for __sync_fetch_and_or_* (without the ", _3" part + in there), and/or if mask_2 is a power of 2 constant. + Similarly for xor instead of or, use ATOMIC_BIT_TEST_AND_COMPLEMENT + in that case. And similarly for and instead of or, except that + the second argument to the builtin needs to be one's complement + of the mask instead of mask. 
*/ + +static void +optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip, + enum internal_fn fn, bool has_model_arg, + bool after) +{ + gimple *call = gsi_stmt (*gsip); + tree lhs = gimple_call_lhs (call); + use_operand_p use_p; + gimple *use_stmt; + tree mask, bit; + optab optab; + + if (!flag_inline_atomics + || optimize_debug + || !gimple_call_builtin_p (call, BUILT_IN_NORMAL) + || !lhs + || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs) + || !single_imm_use (lhs, &use_p, &use_stmt) + || !is_gimple_assign (use_stmt) + || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR + || !gimple_vdef (call)) + return; + + switch (fn) + { + case IFN_ATOMIC_BIT_TEST_AND_SET: + optab = atomic_bit_test_and_set_optab; + break; + case IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT: + optab = atomic_bit_test_and_complement_optab; + break; + case IFN_ATOMIC_BIT_TEST_AND_RESET: + optab = atomic_bit_test_and_reset_optab; + break; + default: + return; + } + + if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing) + return; + + mask = gimple_call_arg (call, 1); + tree use_lhs = gimple_assign_lhs (use_stmt); + if (!use_lhs) + return; + + if (TREE_CODE (mask) == INTEGER_CST) + { + if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET) + mask = const_unop (BIT_NOT_EXPR, TREE_TYPE (mask), mask); + mask = fold_convert (TREE_TYPE (lhs), mask); + int ibit = tree_log2 (mask); + if (ibit < 0) + return; + bit = build_int_cst (TREE_TYPE (lhs), ibit); + } + else if (TREE_CODE (mask) == SSA_NAME) + { + gimple *g = SSA_NAME_DEF_STMT (mask); + if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET) + { + if (!is_gimple_assign (g) + || gimple_assign_rhs_code (g) != BIT_NOT_EXPR) + return; + mask = gimple_assign_rhs1 (g); + if (TREE_CODE (mask) != SSA_NAME) + return; + g = SSA_NAME_DEF_STMT (mask); + } + if (!is_gimple_assign (g) + || gimple_assign_rhs_code (g) != LSHIFT_EXPR + || !integer_onep (gimple_assign_rhs1 (g))) + return; + bit = gimple_assign_rhs2 (g); + } + else + return; + + if (gimple_assign_rhs1 (use_stmt) == lhs) + { + if (!operand_equal_p (gimple_assign_rhs2 (use_stmt), mask, 0)) + return; + } + else if (gimple_assign_rhs2 (use_stmt) != lhs + || !operand_equal_p (gimple_assign_rhs1 (use_stmt), mask, 0)) + return; + + bool use_bool = true; + bool has_debug_uses = false; + imm_use_iterator iter; + gimple *g; + + if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use_lhs)) + use_bool = false; + FOR_EACH_IMM_USE_STMT (g, iter, use_lhs) + { + enum tree_code code = ERROR_MARK; + tree op0, op1; + if (is_gimple_debug (g)) + { + has_debug_uses = true; + continue; + } + else if (is_gimple_assign (g)) + switch (gimple_assign_rhs_code (g)) + { + case COND_EXPR: + op1 = gimple_assign_rhs1 (g); + code = TREE_CODE (op1); + op0 = TREE_OPERAND (op1, 0); + op1 = TREE_OPERAND (op1, 1); + break; + case EQ_EXPR: + case NE_EXPR: + code = gimple_assign_rhs_code (g); + op0 = gimple_assign_rhs1 (g); + op1 = gimple_assign_rhs2 (g); + break; + default: + break; + } + else if (gimple_code (g) == GIMPLE_COND) + { + code = gimple_cond_code (g); + op0 = gimple_cond_lhs (g); + op1 = gimple_cond_rhs (g); + } + + if ((code == EQ_EXPR || code == NE_EXPR) + && op0 == use_lhs + && integer_zerop (op1)) + { + use_operand_p use_p; + int n = 0; + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + n++; + if (n == 1) + continue; + } + + use_bool = false; + BREAK_FROM_IMM_USE_STMT (iter); + } + + tree new_lhs = make_ssa_name (TREE_TYPE (lhs)); + tree flag = build_int_cst (TREE_TYPE (lhs), use_bool); + if (has_model_arg) + g = gimple_build_call_internal (fn, 4, gimple_call_arg (call, 0), + bit, flag, 
gimple_call_arg (call, 2)); + else + g = gimple_build_call_internal (fn, 3, gimple_call_arg (call, 0), + bit, flag); + gimple_call_set_lhs (g, new_lhs); + gimple_set_location (g, gimple_location (call)); + gimple_set_vuse (g, gimple_vuse (call)); + gimple_set_vdef (g, gimple_vdef (call)); + SSA_NAME_DEF_STMT (gimple_vdef (call)) = g; + gimple_stmt_iterator gsi = *gsip; + gsi_insert_after (&gsi, g, GSI_NEW_STMT); + if (after) + { + /* The internal function returns the value of the specified bit + before the atomic operation. If we are interested in the value + of the specified bit after the atomic operation (makes only sense + for xor, otherwise the bit content is compile time known), + we need to invert the bit. */ + g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)), + BIT_XOR_EXPR, new_lhs, + use_bool ? build_int_cst (TREE_TYPE (lhs), 1) + : mask); + new_lhs = gimple_assign_lhs (g); + gsi_insert_after (&gsi, g, GSI_NEW_STMT); + } + if (use_bool && has_debug_uses) + { + tree temp = make_node (DEBUG_EXPR_DECL); + DECL_ARTIFICIAL (temp) = 1; + TREE_TYPE (temp) = TREE_TYPE (lhs); + DECL_MODE (temp) = TYPE_MODE (TREE_TYPE (lhs)); + tree t = build2 (LSHIFT_EXPR, TREE_TYPE (lhs), new_lhs, bit); + g = gimple_build_debug_bind (temp, t, g); + gsi_insert_after (&gsi, g, GSI_NEW_STMT); + FOR_EACH_IMM_USE_STMT (g, iter, use_lhs) + if (is_gimple_debug (g)) + { + use_operand_p use_p; + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, temp); + update_stmt (g); + } + } + SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_lhs) + = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use_lhs); + replace_uses_by (use_lhs, new_lhs); + gsi = gsi_for_stmt (use_stmt); + gsi_remove (&gsi, true); + release_defs (use_stmt); + gsi_remove (gsip, true); + release_ssa_name (lhs); +} + /* A simple pass that attempts to fold all builtin functions. This pass is run after we've propagated as many constants as we can. 
*/ @@ -2806,6 +3026,78 @@ pass_fold_builtins::execute (function *f cfg_changed = true; break; + case BUILT_IN_ATOMIC_FETCH_OR_1: + case BUILT_IN_ATOMIC_FETCH_OR_2: + case BUILT_IN_ATOMIC_FETCH_OR_4: + case BUILT_IN_ATOMIC_FETCH_OR_8: + case BUILT_IN_ATOMIC_FETCH_OR_16: + optimize_atomic_bit_test_and (&i, + IFN_ATOMIC_BIT_TEST_AND_SET, + true, false); + break; + case BUILT_IN_SYNC_FETCH_AND_OR_1: + case BUILT_IN_SYNC_FETCH_AND_OR_2: + case BUILT_IN_SYNC_FETCH_AND_OR_4: + case BUILT_IN_SYNC_FETCH_AND_OR_8: + case BUILT_IN_SYNC_FETCH_AND_OR_16: + optimize_atomic_bit_test_and (&i, + IFN_ATOMIC_BIT_TEST_AND_SET, + false, false); + break; + + case BUILT_IN_ATOMIC_FETCH_XOR_1: + case BUILT_IN_ATOMIC_FETCH_XOR_2: + case BUILT_IN_ATOMIC_FETCH_XOR_4: + case BUILT_IN_ATOMIC_FETCH_XOR_8: + case BUILT_IN_ATOMIC_FETCH_XOR_16: + optimize_atomic_bit_test_and + (&i, IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT, true, false); + break; + case BUILT_IN_SYNC_FETCH_AND_XOR_1: + case BUILT_IN_SYNC_FETCH_AND_XOR_2: + case BUILT_IN_SYNC_FETCH_AND_XOR_4: + case BUILT_IN_SYNC_FETCH_AND_XOR_8: + case BUILT_IN_SYNC_FETCH_AND_XOR_16: + optimize_atomic_bit_test_and + (&i, IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT, false, false); + break; + + case BUILT_IN_ATOMIC_XOR_FETCH_1: + case BUILT_IN_ATOMIC_XOR_FETCH_2: + case BUILT_IN_ATOMIC_XOR_FETCH_4: + case BUILT_IN_ATOMIC_XOR_FETCH_8: + case BUILT_IN_ATOMIC_XOR_FETCH_16: + optimize_atomic_bit_test_and + (&i, IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT, true, true); + break; + case BUILT_IN_SYNC_XOR_AND_FETCH_1: + case BUILT_IN_SYNC_XOR_AND_FETCH_2: + case BUILT_IN_SYNC_XOR_AND_FETCH_4: + case BUILT_IN_SYNC_XOR_AND_FETCH_8: + case BUILT_IN_SYNC_XOR_AND_FETCH_16: + optimize_atomic_bit_test_and + (&i, IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT, false, true); + break; + + case BUILT_IN_ATOMIC_FETCH_AND_1: + case BUILT_IN_ATOMIC_FETCH_AND_2: + case BUILT_IN_ATOMIC_FETCH_AND_4: + case BUILT_IN_ATOMIC_FETCH_AND_8: + case BUILT_IN_ATOMIC_FETCH_AND_16: + optimize_atomic_bit_test_and (&i, + IFN_ATOMIC_BIT_TEST_AND_RESET, + true, false); + break; + case BUILT_IN_SYNC_FETCH_AND_AND_1: + case BUILT_IN_SYNC_FETCH_AND_AND_2: + case BUILT_IN_SYNC_FETCH_AND_AND_4: + case BUILT_IN_SYNC_FETCH_AND_AND_8: + case BUILT_IN_SYNC_FETCH_AND_AND_16: + optimize_atomic_bit_test_and (&i, + IFN_ATOMIC_BIT_TEST_AND_RESET, + false, false); + break; + case BUILT_IN_VA_START: case BUILT_IN_VA_END: case BUILT_IN_VA_COPY: --- gcc/optabs.def.jj 2016-05-01 12:21:04.600593737 +0200 +++ gcc/optabs.def 2016-05-02 09:22:47.814226751 +0200 @@ -337,6 +337,9 @@ OPTAB_D (atomic_add_fetch_optab, "atomic OPTAB_D (atomic_add_optab, "atomic_add$I$a") OPTAB_D (atomic_and_fetch_optab, "atomic_and_fetch$I$a") OPTAB_D (atomic_and_optab, "atomic_and$I$a") +OPTAB_D (atomic_bit_test_and_set_optab, "atomic_bit_test_and_set$I$a") +OPTAB_D (atomic_bit_test_and_complement_optab, "atomic_bit_test_and_complement$I$a") +OPTAB_D (atomic_bit_test_and_reset_optab, "atomic_bit_test_and_reset$I$a") OPTAB_D (atomic_compare_and_swap_optab, "atomic_compare_and_swap$I$a") OPTAB_D (atomic_exchange_optab, "atomic_exchange$I$a") OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$I$a") --- gcc/internal-fn.def.jj 2016-05-01 12:21:04.574594084 +0200 +++ gcc/internal-fn.def 2016-05-02 09:22:47.815226737 +0200 @@ -189,6 +189,11 @@ DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NO current target. */ DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL) +/* Atomic functions. 
*/ +DEF_INTERNAL_FN (ATOMIC_BIT_TEST_AND_SET, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (ATOMIC_BIT_TEST_AND_COMPLEMENT, ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (ATOMIC_BIT_TEST_AND_RESET, ECF_LEAF | ECF_NOTHROW, NULL) + #undef DEF_INTERNAL_INT_FN #undef DEF_INTERNAL_FLT_FN #undef DEF_INTERNAL_OPTAB_FN --- gcc/builtins.h.jj 2016-05-01 12:21:04.915589527 +0200 +++ gcc/builtins.h 2016-05-02 09:22:47.816226723 +0200 @@ -71,6 +71,7 @@ extern tree std_fn_abi_va_list (tree); extern tree std_canonical_va_list_type (tree); extern void std_expand_builtin_va_start (tree, rtx); extern void expand_builtin_trap (void); +extern void expand_ifn_atomic_bit_test_and (gcall *); extern rtx expand_builtin (tree, rtx, rtx, machine_mode, int); extern rtx expand_builtin_with_bounds (tree, rtx, rtx, machine_mode, int); extern enum built_in_function builtin_mathfn_code (const_tree); --- gcc/builtins.c.jj 2016-05-01 12:21:04.856590316 +0200 +++ gcc/builtins.c 2016-05-02 09:22:47.818226695 +0200 @@ -5310,6 +5310,90 @@ expand_builtin_atomic_fetch_op (machine_ return ret; } +/* Expand IFN_ATOMIC_BIT_TEST_AND_* internal function. */ + +void +expand_ifn_atomic_bit_test_and (gcall *call) +{ + tree ptr = gimple_call_arg (call, 0); + tree bit = gimple_call_arg (call, 1); + tree flag = gimple_call_arg (call, 2); + tree lhs = gimple_call_lhs (call); + enum memmodel model = MEMMODEL_SYNC_SEQ_CST; + machine_mode mode = TYPE_MODE (TREE_TYPE (flag)); + enum rtx_code code; + optab optab; + struct expand_operand ops[5]; + + gcc_assert (flag_inline_atomics); + + if (gimple_call_num_args (call) == 4) + model = get_memmodel (gimple_call_arg (call, 3)); + + rtx mem = get_builtin_sync_mem (ptr, mode); + rtx val = expand_expr_force_mode (bit, mode); + + switch (gimple_call_internal_fn (call)) + { + case IFN_ATOMIC_BIT_TEST_AND_SET: + code = IOR; + optab = atomic_bit_test_and_set_optab; + break; + case IFN_ATOMIC_BIT_TEST_AND_COMPLEMENT: + code = XOR; + optab = atomic_bit_test_and_complement_optab; + break; + case IFN_ATOMIC_BIT_TEST_AND_RESET: + code = AND; + optab = atomic_bit_test_and_reset_optab; + break; + default: + gcc_unreachable (); + } + + if (lhs == NULL_TREE) + { + val = expand_simple_binop (mode, ASHIFT, const1_rtx, + val, NULL_RTX, true, OPTAB_DIRECT); + if (code == AND) + val = expand_simple_unop (mode, NOT, val, NULL_RTX, true); + expand_atomic_fetch_op (const0_rtx, mem, val, code, model, false); + return; + } + + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + enum insn_code icode = direct_optab_handler (optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + create_output_operand (&ops[0], target, mode); + create_fixed_operand (&ops[1], mem); + create_convert_operand_to (&ops[2], val, mode, true); + create_integer_operand (&ops[3], model); + create_integer_operand (&ops[4], integer_onep (flag)); + if (maybe_expand_insn (icode, 5, ops)) + return; + + rtx bitval = val; + val = expand_simple_binop (mode, ASHIFT, const1_rtx, + val, NULL_RTX, true, OPTAB_DIRECT); + rtx maskval = val; + if (code == AND) + val = expand_simple_unop (mode, NOT, val, NULL_RTX, true); + rtx result = expand_atomic_fetch_op (gen_reg_rtx (mode), mem, val, + code, model, false); + if (integer_onep (flag)) + { + result = expand_simple_binop (mode, ASHIFTRT, result, bitval, + NULL_RTX, true, OPTAB_DIRECT); + result = expand_simple_binop (mode, AND, result, const1_rtx, target, + true, OPTAB_DIRECT); + } + else + result = expand_simple_binop (mode, AND, result, maskval, target, true, + OPTAB_DIRECT); + if (result != 
target) + emit_move_insn (target, result); +} + /* Expand an atomic clear operation. void _atomic_clear (BOOL *obj, enum memmodel) EXP is the call expression. */ --- gcc/internal-fn.c.jj 2016-05-01 12:21:04.952589033 +0200 +++ gcc/internal-fn.c 2016-05-02 09:22:47.815226737 +0200 @@ -39,6 +39,7 @@ along with GCC; see the file COPYING3. #include "expr.h" #include "ubsan.h" #include "recog.h" +#include "builtins.h" /* The names of each internal function, indexed by function number. */ const char *const internal_fn_name_array[] = { @@ -2118,6 +2119,30 @@ expand_SET_EDOM (internal_fn, gcall *) #endif } +/* Expand atomic bit test and set. */ + +static void +expand_ATOMIC_BIT_TEST_AND_SET (internal_fn, gcall *call) +{ + expand_ifn_atomic_bit_test_and (call); +} + +/* Expand atomic bit test and complement. */ + +static void +expand_ATOMIC_BIT_TEST_AND_COMPLEMENT (internal_fn, gcall *call) +{ + expand_ifn_atomic_bit_test_and (call); +} + +/* Expand atomic bit test and reset. */ + +static void +expand_ATOMIC_BIT_TEST_AND_RESET (internal_fn, gcall *call) +{ + expand_ifn_atomic_bit_test_and (call); +} + /* Expand a call to FN using the operands in STMT. FN has a single output operand and NARGS input operands. */ --- gcc/doc/md.texi.jj 2016-02-22 22:26:40.000000000 +0100 +++ gcc/doc/md.texi 2016-05-02 09:37:55.018690799 +0200 @@ -6909,6 +6909,33 @@ The specific value that defines "set" is is normally based on what is performed by the native atomic test and set instruction. +@cindex @code{atomic_bit_test_and_set@var{mode}} instruction pattern +@cindex @code{atomic_bit_test_and_complement@var{mode}} instruction pattern +@cindex @code{atomic_bit_test_and_reset@var{mode}} instruction pattern +@item @samp{atomic_bit_test_and_set@var{mode}} +@itemx @samp{atomic_bit_test_and_complement@var{mode}} +@itemx @samp{atomic_bit_test_and_reset@var{mode}} +These patterns emit code for an atomic bitwise operation on memory with memory +model semantics, and return the original value of the specified bit. +Operand 0 is an output operand which contains the value of the specified bit +from the memory location before the operation was performed. Operand 1 is the +memory on which the atomic operation is performed. Operand 2 is the bit within +the operand, starting with least significant bit. Operand 3 is the memory model +to be used by the operation. Operand 4 is a flag - it is @code{const1_rtx} +if operand 0 should contain the original value of the specified bit in the +least significant bit of the operand, and @code{const0_rtx} if the bit should +be in its original position in the operand. +@code{atomic_bit_test_and_set@var{mode}} atomically sets the specified bit after +remembering its original value, @code{atomic_bit_test_and_complement@var{mode}} +inverts the specified bit and @code{atomic_bit_test_and_reset@var{mode}} clears +the specified bit. + +If these patterns are not defined, attempts will be made to use +@code{atomic_fetch_or@var{mode}}, @code{atomic_fetch_xor@var{mode}} or +@code{atomic_fetch_and@var{mode}} instruction patterns, or their @code{sync} +counterparts. If none of these are available a compare-and-swap +loop will be used. 
+ @cindex @code{mem_thread_fence@var{mode}} instruction pattern @item @samp{mem_thread_fence@var{mode}} This pattern emits code required to implement a thread fence with --- gcc/config/i386/sync.md.jj 2016-05-01 12:21:05.013588217 +0200 +++ gcc/config/i386/sync.md 2016-05-02 09:22:47.819226682 +0200 @@ -605,3 +605,120 @@ (define_insn "atomic_<logic><mode>" (clobber (reg:CC FLAGS_REG))] "" "lock{%;} %K2<logic>{<imodesuffix>}\t{%1, %0|%0, %1}") + +(define_expand "atomic_bit_test_and_set<mode>" + [(match_operand:SWI248 0 "register_operand") + (match_operand:SWI248 1 "memory_operand") + (match_operand:SWI248 2 "nonmemory_operand") + (match_operand:SI 3 "const_int_operand") ;; model + (match_operand:SI 4 "const_int_operand")] + "" +{ + emit_insn (gen_atomic_bit_test_and_set<mode>_1 (operands[1], operands[2], + operands[3])); + operands[5] = gen_reg_rtx (QImode); + ix86_expand_setcc (operands[5], EQ, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + rtx result = convert_modes (<MODE>mode, QImode, operands[5], 1); + if (operands[4] == const0_rtx) + result = expand_simple_binop (<MODE>mode, ASHIFT, result, + operands[2], operands[0], 0, OPTAB_DIRECT); + if (result != operands[0]) + emit_move_insn (operands[0], result); + DONE; +}) + +(define_insn "atomic_bit_test_and_set<mode>_1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (unspec_volatile:SWI248 + [(match_operand:SWI248 0 "memory_operand" "+m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_XCHG) + (const_int 0))) + (set (zero_extract:SWI248 (match_dup 0) + (const_int 1) + (match_operand:SWI248 1 + "nonmemory_operand" "<r>N")) + (const_int 1))] + "" + "lock{%;} %K2bts{<imodesuffix>}\t{%1, %0|%0, %1}") + +(define_expand "atomic_bit_test_and_complement<mode>" + [(match_operand:SWI248 0 "register_operand") + (match_operand:SWI248 1 "memory_operand") + (match_operand:SWI248 2 "nonmemory_operand") + (match_operand:SI 3 "const_int_operand") ;; model + (match_operand:SI 4 "const_int_operand")] + "" +{ + emit_insn (gen_atomic_bit_test_and_complement<mode>_1 (operands[1], + operands[2], + operands[3])); + operands[5] = gen_reg_rtx (QImode); + ix86_expand_setcc (operands[5], EQ, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + rtx result = convert_modes (<MODE>mode, QImode, operands[5], 1); + if (operands[4] == const0_rtx) + result = expand_simple_binop (<MODE>mode, ASHIFT, result, + operands[2], operands[0], 0, OPTAB_DIRECT); + if (result != operands[0]) + emit_move_insn (operands[0], result); + DONE; +}) + +(define_insn "atomic_bit_test_and_complement<mode>_1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (unspec_volatile:SWI248 + [(match_operand:SWI248 0 "memory_operand" "+m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_XCHG) + (const_int 0))) + (set (zero_extract:SWI248 (match_dup 0) + (const_int 1) + (match_operand:SWI248 1 + "nonmemory_operand" "<r>N")) + (not:SWI248 (zero_extract:SWI248 (match_dup 0) + (const_int 1) + (match_dup 1))))] + "" + "lock{%;} %K2btc{<imodesuffix>}\t{%1, %0|%0, %1}") + +(define_expand "atomic_bit_test_and_reset<mode>" + [(match_operand:SWI248 0 "register_operand") + (match_operand:SWI248 1 "memory_operand") + (match_operand:SWI248 2 "nonmemory_operand") + (match_operand:SI 3 "const_int_operand") ;; model + (match_operand:SI 4 "const_int_operand")] + "" +{ + emit_insn (gen_atomic_bit_test_and_reset<mode>_1 (operands[1], operands[2], + operands[3])); + operands[5] = gen_reg_rtx (QImode); + ix86_expand_setcc (operands[5], EQ, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + rtx result = 
convert_modes (<MODE>mode, QImode, operands[5], 1); + if (operands[4] == const0_rtx) + result = expand_simple_binop (<MODE>mode, ASHIFT, result, + operands[2], operands[0], 0, OPTAB_DIRECT); + if (result != operands[0]) + emit_move_insn (operands[0], result); + DONE; +}) + +(define_insn "atomic_bit_test_and_reset<mode>_1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (unspec_volatile:SWI248 + [(match_operand:SWI248 0 "memory_operand" "+m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_XCHG) + (const_int 0))) + (set (zero_extract:SWI248 (match_dup 0) + (const_int 1) + (match_operand:SWI248 1 + "nonmemory_operand" "<r>N")) + (const_int 0))] + "" + "lock{%;} %K2btr{<imodesuffix>}\t{%1, %0|%0, %1}") --- gcc/testsuite/gcc.target/i386/pr49244-1.c.jj 2016-05-02 14:52:56.776814774 +0200 +++ gcc/testsuite/gcc.target/i386/pr49244-1.c 2016-05-02 12:39:52.126750700 +0200 @@ -0,0 +1,188 @@ +/* PR target/49244 */ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +void bar (void); + +__attribute__((noinline, noclone)) int +f1 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__sync_fetch_and_or (a, mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f2 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + unsigned int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED); + unsigned int t2 = t1 & mask; + return t2 != 0; +} + +__attribute__((noinline, noclone)) long int +f3 (long int *a, int bit) +{ + unsigned long int mask = (1ul << bit); + return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0; +} + +__attribute__((noinline, noclone)) int +f4 (int *a) +{ + unsigned int mask = (1u << 7); + return (__sync_fetch_and_or (a, mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f5 (int *a) +{ + unsigned int mask = (1u << 13); + return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f6 (int *a) +{ + unsigned int mask = (1u << 0); + return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) void +f7 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + if ((__sync_fetch_and_xor (a, mask) & mask) != 0) + bar (); +} + +__attribute__((noinline, noclone)) void +f8 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0) + bar (); +} + +__attribute__((noinline, noclone)) int +f9 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f10 (int *a) +{ + unsigned int mask = (1u << 7); + return (__sync_fetch_and_xor (a, mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f11 (int *a) +{ + unsigned int mask = (1u << 13); + return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f12 (int *a) +{ + unsigned int mask = (1u << 0); + return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f13 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__sync_fetch_and_and (a, ~mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f14 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f15 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__atomic_fetch_and (a, ~mask, 
__ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f16 (int *a) +{ + unsigned int mask = (1u << 7); + return (__sync_fetch_and_and (a, ~mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f17 (int *a) +{ + unsigned int mask = (1u << 13); + return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0; +} + +__attribute__((noinline, noclone)) int +f18 (int *a) +{ + unsigned int mask = (1u << 0); + return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) unsigned long int +f19 (unsigned long int *a, int bit) +{ + unsigned long int mask = (1ul << bit); + return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +__attribute__((noinline, noclone)) unsigned long int +f20 (unsigned long int *a) +{ + unsigned long int mask = (1ul << 7); + return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0; +} + +__attribute__((noinline, noclone)) int +f21 (int *a, int bit) +{ + unsigned int mask = (1u << bit); + return (__sync_fetch_and_or (a, mask) & mask); +} + +__attribute__((noinline, noclone)) unsigned long int +f22 (unsigned long int *a) +{ + unsigned long int mask = (1ul << 7); + return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask); +} + +__attribute__((noinline, noclone)) unsigned long int +f23 (unsigned long int *a) +{ + unsigned long int mask = (1ul << 7); + return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask); +} + +__attribute__((noinline, noclone)) unsigned short int +f24 (unsigned short int *a) +{ + unsigned short int mask = (1u << 7); + return (__sync_fetch_and_or (a, mask) & mask) != 0; +} + +__attribute__((noinline, noclone)) unsigned short int +f25 (unsigned short int *a) +{ + unsigned short int mask = (1u << 7); + return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0; +} + +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */ +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */ --- gcc/testsuite/gcc.target/i386/pr49244-2.c.jj 2016-05-02 12:51:51.501983254 +0200 +++ gcc/testsuite/gcc.target/i386/pr49244-2.c 2016-05-02 14:47:30.240202019 +0200 @@ -0,0 +1,109 @@ +/* PR target/49244 */ +/* { dg-do run } */ +/* { dg-options "-O2 -g" } */ +/* { dg-additional-options "-march=i486" { target ia32 } } */ + +int cnt; + +__attribute__((noinline, noclone)) void +bar (void) +{ + cnt++; +} + +#include "pr49244-1.c" + +int a; +long int b; +unsigned long int c; +unsigned short int d; + +int +main () +{ + __atomic_store_n (&a, 15, __ATOMIC_RELAXED); + if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15 + || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31) + __builtin_abort (); + if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31 + || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63) + __builtin_abort (); + __atomic_store_n (&b, 24, __ATOMIC_RELAXED); + if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28 + || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28) + __builtin_abort (); + __atomic_store_n (&a, 0, __ATOMIC_RELAXED); + if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128 + || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128) + __builtin_abort (); + if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320 + || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320) + __builtin_abort (); + if (f6 (&a) != 0 || 
__atomic_load_n (&a, __ATOMIC_RELAXED) != 8321 + || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (cnt != 0 + || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193 + || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193 + || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129 + || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193 + || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129 + || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320 + || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321) + __builtin_abort (); + if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193 + || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193) + __builtin_abort (); + if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1 + || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1) + __builtin_abort (); + if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0 + || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0) + __builtin_abort (); + __atomic_store_n (&a, 8321, __ATOMIC_RELAXED); + if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193 + || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193) + __builtin_abort (); + if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1 + || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1) + __builtin_abort (); + if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0 + || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0) + __builtin_abort (); + if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128 + || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0) + __builtin_abort (); + if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128 + || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0) + __builtin_abort (); + __atomic_store_n (&a, 128, __ATOMIC_RELAXED); + if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144 + || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144) + __builtin_abort (); + __atomic_store_n (&c, 1, __ATOMIC_RELAXED); + if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129 + || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1) + __builtin_abort (); + if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129 + || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1) + __builtin_abort (); + if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128 + || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128) + __builtin_abort (); + __atomic_store_n (&d, 1, __ATOMIC_RELAXED); + if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129 + || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129 + || cnt != 2) + __builtin_abort (); + return 0; +} Jakub