On 14/09/12 19:03, Ulrich Weigand wrote: > Hello, > > following up on the prior patch, this patch exploits more opportunities to > generate the vld1 / vst1 family of instructions, this time to implement the > vec_set and vec_extract patterns with memory scalar operands. > > Without the patch, vec_set/vec_extract only support register operands for the > scalar, possibly requiring extra moves. In some cases we'd still get a vst1 > instruction as a result, since combine would match a neon_vst1_lane pattern. > > However, that pattern seems to be actually incorrect for big-endian systems > (due to a line number ordering mismatch). The patch therefore also changes > neon_vst1_lane to use an UNSPEC instead of vec_select to model its operation, > just like all the other NEON intrinsic patterns depending on line numbers > already do. > > Benchmarking showed only marginal improvements, but no regression. It seems > useful to support this anyway ... > > Tested on arm-linux-gnueabi. > OK for mainline? >
You confused me for a bit with the reference to "line numbers", but I think you must mean "lane numbers". FTR, I see no reason why GCC would have problems with 64-bit vectors in big-endian mode, it's only 128-bit (and larger) vectors that pose a problem (ok, the HW numbers the lanes from the other end, but the transformation is trivial). This is OK. R. > Bye, > Ulrich > > > 2012-09-14 Ulrich Weigand <ulrich.weig...@linaro.org> > > * config/arm/arm.c (arm_rtx_costs_1): Handle vec_extract and vec_set > patterns. > * config/arm/arm.md ("vec_set<mode>_internal"): Support memory source > operands, implemented via vld1 instruction. > ("vec_extract<mode>"): Support memory destination operands, > implemented > via vst1 instruction. > ("neon_vst1_lane<mode>"): Use UNSPEC_VST1_LANE instead of vec_select. > * config/arm/predicates.md ("neon_lane_number"): Remove. > > Index: gcc-head/gcc/config/arm/arm.c > =================================================================== > --- gcc-head.orig/gcc/config/arm/arm.c 2012-09-14 19:40:51.000000000 +0200 > +++ gcc-head/gcc/config/arm/arm.c 2012-09-14 19:41:14.000000000 +0200 > @@ -7666,6 +7666,28 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou > return true; > > case SET: > + /* The vec_extract patterns accept memory operands that require an > + address reload. Account for the cost of that reload to give the > + auto-inc-dec pass an incentive to try to replace them. */ > + if (TARGET_NEON && MEM_P (SET_DEST (x)) > + && GET_CODE (SET_SRC (x)) == VEC_SELECT) > + { > + *total = rtx_cost (SET_DEST (x), code, 0, speed); > + if (!neon_vector_mem_operand (SET_DEST (x), 2)) > + *total += COSTS_N_INSNS (1); > + return true; > + } > + /* Likewise for the vec_set patterns. */ > + if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE > + && GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE > + && MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0))) > + { > + rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0); > + *total = rtx_cost (mem, code, 0, speed); > + if (!neon_vector_mem_operand (mem, 2)) > + *total += COSTS_N_INSNS (1); > + return true; > + } > return false; > > case UNSPEC: > Index: gcc-head/gcc/config/arm/neon.md > =================================================================== > --- gcc-head.orig/gcc/config/arm/neon.md 2012-09-14 19:40:51.000000000 > +0200 > +++ gcc-head/gcc/config/arm/neon.md 2012-09-14 19:41:14.000000000 +0200 > @@ -416,30 +416,33 @@ > [(set_attr "neon_type" "neon_vld1_1_2_regs")]) > > (define_insn "vec_set<mode>_internal" > - [(set (match_operand:VD 0 "s_register_operand" "=w") > + [(set (match_operand:VD 0 "s_register_operand" "=w,w") > (vec_merge:VD > (vec_duplicate:VD > - (match_operand:<V_elem> 1 "s_register_operand" "r")) > - (match_operand:VD 3 "s_register_operand" "0") > - (match_operand:SI 2 "immediate_operand" "i")))] > + (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) > + (match_operand:VD 3 "s_register_operand" "0,0") > + (match_operand:SI 2 "immediate_operand" "i,i")))] > "TARGET_NEON" > { > int elt = ffs ((int) INTVAL (operands[2])) - 1; > if (BYTES_BIG_ENDIAN) > elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt; > operands[2] = GEN_INT (elt); > - > - return "vmov.<V_sz_elem>\t%P0[%c2], %1"; > + > + if (which_alternative == 0) > + return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1"; > + else > + return "vmov.<V_sz_elem>\t%P0[%c2], %1"; > } > - [(set_attr "neon_type" "neon_mcr")]) > + [(set_attr "neon_type" "neon_vld1_vld2_lane,neon_mcr")]) > > (define_insn "vec_set<mode>_internal" > - [(set (match_operand:VQ 0 "s_register_operand" "=w") > + [(set (match_operand:VQ 0 "s_register_operand" "=w,w") > (vec_merge:VQ > (vec_duplicate:VQ > - (match_operand:<V_elem> 1 "s_register_operand" "r")) > - (match_operand:VQ 3 "s_register_operand" "0") > - (match_operand:SI 2 "immediate_operand" "i")))] > + (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) > + (match_operand:VQ 3 "s_register_operand" "0,0") > + (match_operand:SI 2 "immediate_operand" "i,i")))] > "TARGET_NEON" > { > HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; > @@ -454,18 +457,21 @@ > operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi); > operands[2] = GEN_INT (elt); > > - return "vmov.<V_sz_elem>\t%P0[%c2], %1"; > + if (which_alternative == 0) > + return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1"; > + else > + return "vmov.<V_sz_elem>\t%P0[%c2], %1"; > } > - [(set_attr "neon_type" "neon_mcr")] > + [(set_attr "neon_type" "neon_vld1_vld2_lane,neon_mcr")] > ) > > (define_insn "vec_setv2di_internal" > - [(set (match_operand:V2DI 0 "s_register_operand" "=w") > + [(set (match_operand:V2DI 0 "s_register_operand" "=w,w") > (vec_merge:V2DI > (vec_duplicate:V2DI > - (match_operand:DI 1 "s_register_operand" "r")) > - (match_operand:V2DI 3 "s_register_operand" "0") > - (match_operand:SI 2 "immediate_operand" "i")))] > + (match_operand:DI 1 "nonimmediate_operand" "Um,r")) > + (match_operand:V2DI 3 "s_register_operand" "0,0") > + (match_operand:SI 2 "immediate_operand" "i,i")))] > "TARGET_NEON" > { > HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; > @@ -473,9 +479,12 @@ > > operands[0] = gen_rtx_REG (DImode, regno); > > - return "vmov\t%P0, %Q1, %R1"; > + if (which_alternative == 0) > + return "vld1.64\t%P0, %A1"; > + else > + return "vmov\t%P0, %Q1, %R1"; > } > - [(set_attr "neon_type" "neon_mcr_2_mcrr")] > + [(set_attr "neon_type" "neon_vld1_1_2_regs,neon_mcr_2_mcrr")] > ) > > (define_expand "vec_set<mode>" > @@ -491,10 +500,10 @@ > }) > > (define_insn "vec_extract<mode>" > - [(set (match_operand:<V_elem> 0 "s_register_operand" "=r") > + [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") > (vec_select:<V_elem> > - (match_operand:VD 1 "s_register_operand" "w") > - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] > + (match_operand:VD 1 "s_register_operand" "w,w") > + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] > "TARGET_NEON" > { > if (BYTES_BIG_ENDIAN) > @@ -503,16 +512,20 @@ > elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt; > operands[2] = GEN_INT (elt); > } > - return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; > + > + if (which_alternative == 0) > + return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0"; > + else > + return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; > } > - [(set_attr "neon_type" "neon_bp_simple")] > + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_bp_simple")] > ) > > (define_insn "vec_extract<mode>" > - [(set (match_operand:<V_elem> 0 "s_register_operand" "=r") > + [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") > (vec_select:<V_elem> > - (match_operand:VQ 1 "s_register_operand" "w") > - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] > + (match_operand:VQ 1 "s_register_operand" "w,w") > + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] > "TARGET_NEON" > { > int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2; > @@ -526,25 +539,31 @@ > operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi); > operands[2] = GEN_INT (elt); > > - return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; > + if (which_alternative == 0) > + return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0"; > + else > + return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; > } > - [(set_attr "neon_type" "neon_bp_simple")] > + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_bp_simple")] > ) > > (define_insn "vec_extractv2di" > - [(set (match_operand:DI 0 "s_register_operand" "=r") > + [(set (match_operand:DI 0 "nonimmediate_operand" "=Um,r") > (vec_select:DI > - (match_operand:V2DI 1 "s_register_operand" "w") > - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] > + (match_operand:V2DI 1 "s_register_operand" "w,w") > + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] > "TARGET_NEON" > { > int regno = REGNO (operands[1]) + 2 * INTVAL (operands[2]); > > operands[1] = gen_rtx_REG (DImode, regno); > > - return "vmov\t%Q0, %R0, %P1 @ v2di"; > + if (which_alternative == 0) > + return "vst1.64\t{%P1}, %A0 @ v2di"; > + else > + return "vmov\t%Q0, %R0, %P1 @ v2di"; > } > - [(set_attr "neon_type" "neon_int_1")] > + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_int_1")] > ) > > (define_expand "vec_init<mode>" > @@ -4449,9 +4468,10 @@ > > (define_insn "neon_vst1_lane<mode>" > [(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um") > - (vec_select:<V_elem> > - (match_operand:VDX 1 "s_register_operand" "w") > - (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] > + (unspec:<V_elem> > + [(match_operand:VDX 1 "s_register_operand" "w") > + (match_operand:SI 2 "immediate_operand" "i")] > + UNSPEC_VST1_LANE))] > "TARGET_NEON" > { > HOST_WIDE_INT lane = INTVAL (operands[2]); > @@ -4470,9 +4490,10 @@ > > (define_insn "neon_vst1_lane<mode>" > [(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um") > - (vec_select:<V_elem> > - (match_operand:VQX 1 "s_register_operand" "w") > - (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] > + (unspec:<V_elem> > + [(match_operand:VQX 1 "s_register_operand" "w") > + (match_operand:SI 2 "immediate_operand" "i")] > + UNSPEC_VST1_LANE))] > "TARGET_NEON" > { > HOST_WIDE_INT lane = INTVAL (operands[2]); > Index: gcc-head/gcc/config/arm/predicates.md > =================================================================== > --- gcc-head.orig/gcc/config/arm/predicates.md 2012-09-14 19:38:14.000000000 > +0200 > +++ gcc-head/gcc/config/arm/predicates.md 2012-09-14 19:41:14.000000000 > +0200 > @@ -524,10 +524,6 @@ > (ior (match_operand 0 "imm_for_neon_inv_logic_operand") > (match_operand 0 "s_register_operand"))) > > -;; TODO: We could check lane numbers more precisely based on the mode. > -(define_predicate "neon_lane_number" > - (and (match_code "const_int") > - (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 15"))) > ;; Predicates for named expanders that overlap multiple ISAs. > > (define_predicate "cmpdi_operand" > -- > Dr. Ulrich Weigand > GNU Toolchain for Linux on System z and Cell BE > ulrich.weig...@de.ibm.com > >