Dominik Vogt wrote: > > v3: > > > > * Remove sne* patterns. > > * Move alignment check from s390_expand_cs to s390.md. > > * Use s_operand instead of memory_nosymref_operand. > > * Remove memory_nosymref_operand. > > * Allow any CC-mode in cstorecc4 for TARGET_Z196. > > * Fix EQ with TARGET_Z196 in cstorecc4. > > * Duplicate CS patterns for CCZmode. > > > > Bootstrapped and regression tested on a zEC12 with s390 and s390x > > biarch.
> s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem, > - rtx cmp, rtx new_rtx) > + rtx cmp, rtx new_rtx, machine_mode ccmode) > { > - emit_insn (gen_atomic_compare_and_swapsi_internal (old, mem, cmp, > new_rtx)); > - return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM), > - const0_rtx); > + switch (GET_MODE (mem)) > + { > + case SImode: > + if (ccmode == CCZ1mode) > + emit_insn (gen_atomic_compare_and_swapsiccz1_internal (old, mem, cmp, > + new_rtx)); > + else > + emit_insn (gen_atomic_compare_and_swapsiccz_internal (old, mem, cmp, > + new_rtx)); > + break; > + case DImode: > + if (ccmode == CCZ1mode) > + emit_insn (gen_atomic_compare_and_swapdiccz1_internal (old, mem, cmp, > + new_rtx)); > + else > + emit_insn (gen_atomic_compare_and_swapdiccz_internal (old, mem, cmp, > + new_rtx)); > + break; > + case TImode: > + if (ccmode == CCZ1mode) > + emit_insn (gen_atomic_compare_and_swapticcz1_internal (old, mem, cmp, > + new_rtx)); > + else > + emit_insn (gen_atomic_compare_and_swapticcz_internal (old, mem, cmp, > + new_rtx)); > + break; These expanders don't really do anything different depending on the mode of the accessed word (SI/DI/TImode), so this seems like a bit of unncessary duplication. The original code was correct in always calling the SImode variant, even if this looks a bit odd. Maybe a better fix is to just remove the mode from this expander. > + if (TARGET_Z196 > + && (mode == SImode || mode == DImode)) > + do_const_opt = (is_weak && CONST_INT_P (cmp)); > + > + if (do_const_opt) > + { > + const int very_unlikely = REG_BR_PROB_BASE / 100 - 1; > + rtx cc = gen_rtx_REG (CCZmode, CC_REGNUM); > + > + skip_cs_label = gen_label_rtx (); > + emit_move_insn (output, mem); > + emit_move_insn (btarget, const0_rtx); > + emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (CCZmode, output, cmp))); > + s390_emit_jump (skip_cs_label, gen_rtx_NE (VOIDmode, cc, const0_rtx)); > + add_int_reg_note (get_last_insn (), REG_BR_PROB, very_unlikely); > + /* If the jump is not taken, OUTPUT is the expected value. */ > + cmp = output; > + /* Reload newval to a register manually, *after* the compare and jump > + above. Otherwise Reload might place it before the jump. */ > + } > + else > + cmp = force_reg (mode, cmp); > + new_rtx = force_reg (mode, new_rtx); > + s390_emit_compare_and_swap (EQ, output, mem, cmp, new_rtx, > + (do_const_opt) ? CCZmode : CCZ1mode); > + > + /* We deliberately accept non-register operands in the predicate > + to ensure the write back to the output operand happens *before* > + the store-flags code below. This makes it easier for combine > + to merge the store-flags code with a potential test-and-branch > + pattern following (immediately!) afterwards. */ > + if (output != vtarget) > + emit_move_insn (vtarget, output); > + > + if (skip_cs_label != NULL) > + emit_label (skip_cs_label); So if do_const_opt is true, but output != vtarget, the code above will write to output, but this is then never moved to vtarget. This looks incorrect. > + if (TARGET_Z196 && do_const_opt) do_const_opt seems to always imply TARGET_Z196. > +; Peephole to combine a load-and-test from volatile memory which combine does > +; not do. > +(define_peephole2 > + [(set (match_operand:GPR 0 "register_operand") > + (match_operand:GPR 2 "memory_operand")) > + (set (reg CC_REGNUM) > + (compare (match_dup 0) (match_operand:GPR 1 "const0_operand")))] > + "s390_match_ccmode(insn, CCSmode) && TARGET_EXTIMM > + && GENERAL_REG_P (operands[0]) > + && satisfies_constraint_T (operands[2])" > + [(parallel > + [(set (reg:CCS CC_REGNUM) > + (compare:CCS (match_dup 2) (match_dup 1))) > + (set (match_dup 0) (match_dup 2))])]) We should really try to understand why this isn't done earlier and fix the problem there ... > [(parallel > [(set (match_operand:SI 0 "register_operand" "") > (match_operator:SI 1 "s390_eqne_operator" > - [(match_operand:CCZ1 2 "register_operand") > + [(match_operand 2 "cc_reg_operand") > (match_operand 3 "const0_operand")])) > (clobber (reg:CC CC_REGNUM))])] > "" > - "emit_insn (gen_sne (operands[0], operands[2])); > - if (GET_CODE (operands[1]) == EQ) > - emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx)); > + "machine_mode mode = GET_MODE (operands[2]); > + if (TARGET_Z196) > + { > + rtx cond, ite; > + > + if (GET_CODE (operands[1]) == NE) > + cond = gen_rtx_NE (VOIDmode, operands[2], const0_rtx); > + else > + cond = gen_rtx_EQ (VOIDmode, operands[2], const0_rtx); I guess as a result cond is now always the same as operands[1] and could be just taken from there? > + ite = gen_rtx_IF_THEN_ELSE (SImode, cond, const1_rtx, const0_rtx); > + emit_insn (gen_rtx_SET (operands[0], ite)); Also, since you're just emitting RTL directly, maybe you could simply use the expander pattern above to do so (and not use emit_insn followed by DONE in this case?) > @@ -6535,9 +6570,7 @@ > "" > "#" > "reload_completed" > - [(parallel > - [(set (match_dup 0) (ashiftrt:SI (match_dup 0) (const_int 28))) > - (clobber (reg:CC CC_REGNUM))])]) > + [(set (match_dup 0) (lshiftrt:SI (match_dup 0) (const_int 28)))]) Why is this necessary? > -(define_expand "atomic_compare_and_swap<mode>_internal" > +(define_expand "atomic_compare_and_swap<DGPR:mode><CCZZ1:mode>_internal" > [(parallel > [(set (match_operand:DGPR 0 "register_operand") > - (match_operand:DGPR 1 "memory_operand")) > + (match_operand:DGPR 1 "s_operand")) > (set (match_dup 1) > (unspec_volatile:DGPR > [(match_dup 1) > (match_operand:DGPR 2 "register_operand") > (match_operand:DGPR 3 "register_operand")] > UNSPECV_CAS)) > - (set (reg:CCZ1 CC_REGNUM) > - (compare:CCZ1 (match_dup 1) (match_dup 2)))])] > + (set (reg:CCZZ1 CC_REGNUM) > + (compare:CCZZ1 (match_dup 1) (match_dup 2)))])] > "") See above ... > ; cdsg, csg > -(define_insn "*atomic_compare_and_swap<mode>_1" > +(define_insn "*atomic_compare_and_swap<TDI:mode><CCZZ1:mode>_1" > [(set (match_operand:TDI 0 "register_operand" "=r") > - (match_operand:TDI 1 "memory_operand" "+S")) > + (match_operand:TDI 1 "s_operand" "+S")) > (set (match_dup 1) > (unspec_volatile:TDI > [(match_dup 1) > (match_operand:TDI 2 "register_operand" "0") > (match_operand:TDI 3 "register_operand" "r")] > UNSPECV_CAS)) > - (set (reg:CCZ1 CC_REGNUM) > - (compare:CCZ1 (match_dup 1) (match_dup 2)))] > + (set (reg:CCZZ1 CC_REGNUM) > + (compare:CCZZ1 (match_dup 1) (match_dup 2)))] > "TARGET_ZARCH" > "c<td>sg\t%0,%3,%S1" > [(set_attr "op_type" "RSY") > (set_attr "type" "sem")]) So for all these *insn* patterns, I think they don't need to be duplicated at all, instead they should simply use the s390_match_ccmode mechanism to allow multiple CCmodes. It seems that CCZ1mode will have to be added to s390_match_ccmode_set to make this work: if set_mode is CCZ1mode or CCZmode, a req_mode of CCZ1mode should be accepted. Then this pattern can simply use && s390_match_ccmode(insn, CCZ1mode) Bye, Ulrich