Here is my attempt at adding a couple of new instruction patterns to generate more bfi instructions on aarch64. I haven't finished testing this, but it already helps with gcc.target/aarch64/combine_bfi_1.c.
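To make the intent concrete, here is a sketch of the kind of source this is aimed at (a made-up function in the spirit of that test, not copied from it):

/* Sketch only: copy the low 8 bits of y into x.  */
unsigned int
copy_low_byte (unsigned int x, unsigned int y)
{
  /* With the *aarch64_bfi<GPI:mode>4_noshift pattern below, this should
     collapse to a single "bfi w0, w1, 0, 8" instead of and/and/orr.  */
  return (x & 0xffffff00) | (y & 0xff);
}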
Before I went any further with it I wanted to see if anyone else was working on something like this and if this seems like a reasonable approach.

Steve Ellcey
sell...@marvell.com


2018-01-24  Steve Ellcey  <sell...@marvell.com>

	PR rtl-optimization/87763
	* config/aarch64/aarch64-protos.h
	(aarch64_masks_and_shift_for_aarch64_bfi_p): New prototype.
	* config/aarch64/aarch64.c
	(aarch64_masks_and_shift_for_aarch64_bfi_p): New function.
	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>4_shift):
	New instruction.
	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b035e35..ec90053 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -429,6 +429,7 @@ bool aarch64_label_mentioned_p (rtx);
 void aarch64_declare_function_name (FILE *, const char*, tree);
 bool aarch64_legitimate_pic_operand_p (rtx);
 bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
+bool aarch64_masks_and_shift_for_aarch64_bfi_p (scalar_int_mode, rtx, rtx, rtx);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5df5a8b..69cc69f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9294,6 +9294,44 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
 	  & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
 }
 
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *aarch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_aarch64_bfi_p (scalar_int_mode mode, rtx mask1,
+					   rtx shft_amnt, rtx mask2)
+{
+  unsigned HOST_WIDE_INT m1, m2, s, t;
+
+  if (!CONST_INT_P (mask1) || !CONST_INT_P (mask2) || !CONST_INT_P (shft_amnt))
+    return false;
+
+  m1 = UINTVAL (mask1);
+  m2 = UINTVAL (mask2);
+  s = UINTVAL (shft_amnt);
+
+  /* Verify that there is no overlap in what bits are set in the two masks.  */
+  if ((m1 + m2 + 1) != 0)
+    return false;
+
+  /* Verify that the shift amount is less than the mode size.  */
+  if (s >= GET_MODE_BITSIZE (mode))
+    return false;
+
+  /* Verify that the mask being shifted is contiguous and would be in the
+     least significant bits after shifting by creating a mask 't' based on
+     the number of bits set in mask2 and the shift amount for mask2 and
+     comparing that to the actual mask2.  */
+  t = popcount_hwi (m2);
+  t = (HOST_WIDE_INT_1U << t) - 1;
+  t = t << s;
+  if (t != m2)
+    return false;
+
+  return true;
+}
+
 /* Calculate the cost of calculating X, storing it in *COST.  Result
    is true if the total cost of the operation has now been calculated.  */
 static bool
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b7f6fe0..e1f526b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5476,6 +5476,41 @@
   [(set_attr "type" "bfm")]
 )
 
+;; Match a bfi instruction where the shift of OP3 means that we are
+;; actually copying the least significant bits of OP3 into OP0 by way
+;; of the AND masks and the IOR instruction.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_shift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+			  (match_operand:GPI 2 "const_int_operand" "n"))
+		 (and:GPI (ashift:GPI
+			    (match_operand:GPI 3 "register_operand" "r")
+			    (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
+			  (match_operand:GPI 5 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], operands[4], operands[5])"
+{
+  return "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5";
+}
+  [(set_attr "type" "bfm")]
+)
+
+;; Like the above instruction but with no shifting; we are just copying
+;; the least significant bits of OP3 to OP0.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+			  (match_operand:GPI 2 "const_int_operand" "n"))
+		 (and:GPI (match_operand:GPI 3 "register_operand" "r")
+			  (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], const0_rtx, operands[4])"
+{
+  return "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4";
+}
+  [(set_attr "type" "bfm")]
+)
+
 (define_insn "*extr_insv_lower_reg<mode>"
   [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
			   (match_operand 1 "const_int_operand" "n")
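For reference, here is a sketch of the shifted form the *aarch64_bfi<GPI:mode>4_shift pattern is meant to catch (again a made-up example, not an existing testcase):

/* Sketch only: insert the low 8 bits of y into bits 8..15 of x.  The
   and/ashift/and/ior RTL for this is what the new shift pattern matches,
   so it should emit "bfi w0, w1, 8, 8" rather than lsl plus and/and/orr.  */
unsigned int
insert_byte_at_8 (unsigned int x, unsigned int y)
{
  return (x & ~0xff00u) | ((y << 8) & 0xff00);
}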