+;; -------------------------------------------------------------------------
+;; - vfwmaccbf16
+;; -------------------------------------------------------------------------
+;; Combine extend + fma to widen_fma (vfwmacc): op0 = ext (op2) * ext (op3) + op1
+(define_insn_and_split "*widen_bf16_fma<mode>"
+  [(set (match_operand:VWEXTF_ZVFBF 0 "register_operand")
+    (plus:VWEXTF_ZVFBF
+      (mult:VWEXTF_ZVFBF
+        (float_extend:VWEXTF_ZVFBF
+          (match_operand:<V_FPWIDETOBF16_TRUNC> 2 "register_operand"))
+        (float_extend:VWEXTF_ZVFBF
+          (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand")))
+      (match_operand:VWEXTF_ZVFBF 1 "register_operand")))]
+  "TARGET_ZVFBFWMA && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    rtx ops[] = {operands[0], operands[1], operands[2], operands[3]}; /* dest, addend, then the two BF16 multiplicands.  */
+    riscv_vector::emit_vlmax_insn (code_for_pred_widen_bf16_mul (<MODE>mode),
+                                   riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops);
+    DONE;
+  }
+  [(set_attr "type" "vfwmaccbf16")
+   (set_attr "mode" "<MODE>")])
This pattern should be in autovec-opt.md. juzhe.zh...@rivai.ai From: Feng Wang Date: 2024-10-16 22:10 To: gcc-patches CC: kito.cheng; juzhe.zhong; Feng Wang Subject: [PATCH] RISC-V:Auto vect for vector bf16 This patch adds auto-vectorization patterns for the vector-bfloat16 extension. As with the other vector extensions, these patterns let the auto-vectorizer use vector BF16 instructions when vectorizing for loops. gcc/ChangeLog: * config/riscv/vector-bfloat16.md (extend<v_fpwidetobf16_trunc><mode>2): Add auto-vect pattern for vector-bfloat16. (trunc<mode><v_fpwidetobf16_trunc>2): Ditto. (*widen_bf16_fma<mode>): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c: New test. * gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c: New test. * gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c: New test. Signed-off-by: Feng Wang <wangf...@eswincomputing.com> --- gcc/config/riscv/vector-bfloat16.md | 144 ++++++++++++++++-- .../riscv/rvv/autovec/vfncvt-auto-vect.c | 19 +++ .../riscv/rvv/autovec/vfwcvt-auto-vect.c | 19 +++ .../riscv/rvv/autovec/vfwmacc-auto-vect.c | 14 ++ 4 files changed, 182 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c diff --git a/gcc/config/riscv/vector-bfloat16.md b/gcc/config/riscv/vector-bfloat16.md index 562aa8ee5ed..e6482a83356 100644 --- a/gcc/config/riscv/vector-bfloat16.md +++ b/gcc/config/riscv/vector-bfloat16.md @@ -25,8 +25,24 @@ (RVVMF2SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN > 32") ]) -(define_mode_attr V_FP32TOBF16_TRUNC [ +(define_mode_iterator VSF [ + (RVVM8SF "TARGET_VECTOR_ELEN_FP_32") (RVVM4SF "TARGET_VECTOR_ELEN_FP_32") (RVVM2SF "TARGET_VECTOR_ELEN_FP_32") + (RVVM1SF "TARGET_VECTOR_ELEN_FP_32") (RVVMF2SF "TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN > 32") +]) +
+(define_mode_iterator VDF [ + (RVVM8DF "TARGET_VECTOR_ELEN_FP_64") (RVVM4DF "TARGET_VECTOR_ELEN_FP_64") + (RVVM2DF "TARGET_VECTOR_ELEN_FP_64") (RVVM1DF "TARGET_VECTOR_ELEN_FP_64") +]) + +(define_mode_attr V_FPWIDETOBF16_TRUNC [ (RVVM8SF "RVVM4BF") (RVVM4SF "RVVM2BF") (RVVM2SF "RVVM1BF") (RVVM1SF "RVVMF2BF") (RVVMF2SF "RVVMF4BF") + (RVVM8DF "RVVM2BF") (RVVM4DF "RVVM1BF") (RVVM2DF "RVVMF2BF") (RVVM1DF "RVVMF4BF") +]) + +(define_mode_attr v_fpwidetobf16_trunc [ + (RVVM8SF "rvvm4bf") (RVVM4SF "rvvm2bf") (RVVM2SF "rvvm1bf") (RVVM1SF "rvvmf2bf") (RVVMF2SF "rvvmf4bf") + (RVVM8DF "rvvm2bf") (RVVM4DF "rvvm1bf") (RVVM2DF "rvvmf2bf") (RVVM1DF "rvvmf4bf") ]) (define_mode_attr VF32_SUBEL [ @@ -35,8 +51,8 @@ ;; Zvfbfmin extension (define_insn "@pred_trunc<mode>_to_bf16" - [(set (match_operand:<V_FP32TOBF16_TRUNC> 0 "register_operand" "=vd, vd, vr, vr, &vr, &vr") - (if_then_else:<V_FP32TOBF16_TRUNC> + [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand" "=vd, vd, vr, vr, &vr, &vr") + (if_then_else:<V_FPWIDETOBF16_TRUNC> (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1,vmWc1,vmWc1") (match_operand 4 "vector_length_operand" " rK, rK, rK, rK, rK, rK") @@ -47,13 +63,13 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE) - (float_truncate:<V_FP32TOBF16_TRUNC> + (float_truncate:<V_FPWIDETOBF16_TRUNC> (match_operand:VWEXTF_ZVFBF 3 "register_operand" " 0, 0, 0, 0, vr, vr")) - (match_operand:<V_FP32TOBF16_TRUNC> 2 "vector_merge_operand" " vu, 0, vu, 0, vu, 0")))] + (match_operand:<V_FPWIDETOBF16_TRUNC> 2 "vector_merge_operand" " vu, 0, vu, 0, vu, 0")))] "TARGET_ZVFBFMIN" "vfncvtbf16.f.f.w\t%0,%3%p1" [(set_attr "type" "vfncvtbf16") - (set_attr "mode" "<V_FP32TOBF16_TRUNC>") + (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>") (set (attr "frm_mode") (symbol_ref "riscv_vector::get_frm_mode (operands[8])"))]) @@ -69,12 +85,12 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (float_extend:VWEXTF_ZVFBF - 
(match_operand:<V_FP32TOBF16_TRUNC> 3 "register_operand" " vr, vr")) + (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand" " vr, vr")) (match_operand:VWEXTF_ZVFBF 2 "vector_merge_operand" " vu, 0")))] "TARGET_ZVFBFMIN" "vfwcvtbf16.f.f.v\t%0,%3%p1" [(set_attr "type" "vfwcvtbf16") - (set_attr "mode" "<V_FP32TOBF16_TRUNC>")]) + (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>")]) (define_insn "@pred_widen_bf16_mul_<mode>" @@ -93,15 +109,15 @@ (plus:VWEXTF_ZVFBF (mult:VWEXTF_ZVFBF (float_extend:VWEXTF_ZVFBF - (match_operand:<V_FP32TOBF16_TRUNC> 3 "register_operand" " vr")) + (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand" " vr")) (float_extend:VWEXTF_ZVFBF - (match_operand:<V_FP32TOBF16_TRUNC> 4 "register_operand" " vr"))) + (match_operand:<V_FPWIDETOBF16_TRUNC> 4 "register_operand" " vr"))) (match_operand:VWEXTF_ZVFBF 2 "register_operand" " 0")) (match_dup 2)))] "TARGET_ZVFBFWMA" "vfwmaccbf16.vv\t%0,%3,%4%p1" [(set_attr "type" "vfwmaccbf16") - (set_attr "mode" "<V_FP32TOBF16_TRUNC>") + (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>") (set (attr "frm_mode") (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) @@ -121,15 +137,115 @@ (plus:VWEXTF_ZVFBF (mult:VWEXTF_ZVFBF (float_extend:VWEXTF_ZVFBF - (vec_duplicate:<V_FP32TOBF16_TRUNC> + (vec_duplicate:<V_FPWIDETOBF16_TRUNC> (match_operand:<VF32_SUBEL> 3 "register_operand" " f"))) (float_extend:VWEXTF_ZVFBF - (match_operand:<V_FP32TOBF16_TRUNC> 4 "register_operand" " vr"))) + (match_operand:<V_FPWIDETOBF16_TRUNC> 4 "register_operand" " vr"))) (match_operand:VWEXTF_ZVFBF 2 "register_operand" " 0")) (match_dup 2)))] "TARGET_ZVFBFWMA" "vfwmaccbf16.vf\t%0,%3,%4%p1" [(set_attr "type" "vfwmaccbf16") - (set_attr "mode" "<V_FP32TOBF16_TRUNC>") + (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>") (set (attr "frm_mode") (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) + +;; Auto vect pattern + +;; ------------------------------------------------------------------------- +;; ---- [BF16] Widening. 
+;; -------------------------------------------------------------------------
+;; - vfwcvtbf16.f.f.v (BF16 -> VWEXTF_ZVFBF directly, BF16 -> VDF in two steps)
+;; -------------------------------------------------------------------------
+(define_insn_and_split "extend<v_fpwidetobf16_trunc><mode>2"
+  [(set (match_operand:VWEXTF_ZVFBF 0 "register_operand" "=&vr")
+    (float_extend:VWEXTF_ZVFBF
+     (match_operand:<V_FPWIDETOBF16_TRUNC> 1 "register_operand" " vr")))]
+  "TARGET_ZVFBFMIN && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  insn_code icode = code_for_pred_extend_bf16_to (<MODE>mode); /* icode of the pred_extend_bf16_to insn for this mode.  */
+  riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP, operands);
+  DONE;
+}
+  [(set_attr "type" "vfwcvtbf16")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "extend<v_fpwidetobf16_trunc><mode>2"
+  [(set (match_operand:VDF 0 "register_operand")
+    (float_extend:VDF
+     (match_operand:<V_FPWIDETOBF16_TRUNC> 1 "register_operand")))]
+  "TARGET_ZVFBFMIN"
+{
+  rtx dblw = gen_reg_rtx (<V_DOUBLE_TRUNC>mode); /* Temporary in the V_DOUBLE_TRUNC mode.  */
+  emit_insn (gen_extend<v_fpwidetobf16_trunc><v_double_trunc>2 (dblw, operands[1])); /* Step 1: BF16 -> temporary.  */
+  emit_insn (gen_extend<v_double_trunc><mode>2 (operands[0], dblw)); /* Step 2: temporary -> result.  */
+  DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [BF16] Narrowing.
+;; ------------------------------------------------------------------------- +;; - vfncvtbf16.f.f.w +;; ------------------------------------------------------------------------- +(define_insn_and_split "trunc<mode><v_fpwidetobf16_trunc>2" + [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand" "=vr") + (float_truncate:<V_FPWIDETOBF16_TRUNC> + (match_operand:VSF 1 "register_operand" " vr")))] + "TARGET_ZVFBFMIN && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + insn_code icode = code_for_pred_trunc_to_bf16 (<MODE>mode); + riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP_FRM_DYN, operands); + DONE; +} + [(set_attr "type" "vfncvtbf16") + (set_attr "mode" "<MODE>")]) + +(define_expand "trunc<mode><v_fpwidetobf16_trunc>2" + [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand") + (float_truncate:<V_FPWIDETOBF16_TRUNC> + (match_operand:VDF 1 "register_operand")))] + "TARGET_ZVFBFMIN" +{ + rtx half = gen_reg_rtx (<V_DOUBLE_TRUNC>mode); + rtx opshalf[] = {half, operands[1]}; + + /* According to the RISC-V V Spec 13.19. we need to use + vfncvt.rod.f.f.w for all steps but the last. 
*/ + insn_code icode = code_for_pred_rod_trunc (<MODE>mode); + riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP, opshalf); + + emit_insn (gen_trunc<v_double_trunc><v_fpwidetobf16_trunc>2 (operands[0], half)); + DONE; +}) + +;; ------------------------------------------------------------------------- +;; - vfwmaccbf16 +;; ------------------------------------------------------------------------- +;; Combine extend + fma to widen_fma (vfwmacc) +(define_insn_and_split "*widen_bf16_fma<mode>" + [(set (match_operand:VWEXTF_ZVFBF 0 "register_operand") + (plus:VWEXTF_ZVFBF + (mult:VWEXTF_ZVFBF + (float_extend:VWEXTF_ZVFBF + (match_operand:<V_FPWIDETOBF16_TRUNC> 2 "register_operand")) + (float_extend:VWEXTF_ZVFBF + (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand"))) + (match_operand:VWEXTF_ZVFBF 1 "register_operand")))] + "TARGET_ZVFBFWMA && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + rtx ops[] = {operands[0], operands[1], operands[2], operands[3]}; + riscv_vector::emit_vlmax_insn (code_for_pred_widen_bf16_mul (<MODE>mode), + riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops); + DONE; + } + [(set_attr "type" "vfwmaccbf16") + (set_attr "mode" "<MODE>")]) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c new file mode 100644 index 00000000000..7ba3615ccf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv32gcv_zvfbfmin -mabi=ilp32d" } */ + +__attribute__((noipa)) +void vfncvt_float_BFloat16 (__bf16 *dst, float *a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = (__bf16)a[i]; +} + +__attribute__((noipa)) +void vfncvt_double_BFloat16 (__bf16 *dst, double *a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = (__bf16)a[i]; +} + +/* { dg-final { scan-assembler-times {\tvfncvtbf16\.f\.f\.w} 2 } } */ +/* { dg-final { scan-assembler-times 
{\tvfncvt\.rod\.f\.f\.w} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c new file mode 100644 index 00000000000..6629dd909a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv32gcv_zvfbfmin -mabi=ilp32d" } */ + +__attribute__((noipa)) +void vfwcvt__BFloat16float (float *dst, __bf16 *a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = (float)a[i]; +} + +__attribute__((noipa)) +void vfwcvt__BFloat16double (double *dst, __bf16 *a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = (double)a[i]; +} + +/* { dg-final { scan-assembler-times {\tvfwcvtbf16\.f\.f\.v} 2 } } */ +/* { dg-final { scan-assembler-times {\tvfwcvt\.f\.f\.v} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c new file mode 100644 index 00000000000..a767f2c8ef8 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv32gcv_zvfbfwma -mabi=ilp32d -ffast-math" } */ + +__attribute__ ((noipa)) +void vwmacc_float_bf16 (float *__restrict dst, + __bf16 *__restrict a, + __bf16 *__restrict b, + int n) +{ + for (int i = 0; i < n; i++) + dst[i] += (float) (a[i] * b[i]); +} + +/* { dg-final { scan-assembler-times {\tvfwmaccbf16\.vv} 1 } } */ -- 2.17.1