On Thu, Aug 20, 2015 at 9:58 PM, Ilia Mirkin <imir...@alum.mit.edu> wrote: > This greatly improves generated code, especially for the snorm variants, > since it is able to get rid of the lshift/rshift for sext, as well as > replacing each shift + mask with a single op. > > Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> > --- > > This actually slightly pessimizes nvc0 generated code for the unorm > case because it is no longer able to detect that it can instruct cvt > to read a byte directly, but that will be fixed in the backend later. > > src/glsl/ir_builder.cpp | 6 ++ > src/glsl/ir_builder.h | 1 + > src/glsl/ir_optimization.h | 1 + > src/glsl/lower_packing_builtins.cpp | 112 > ++++++++++++++++++++++++----- > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 3 +- > 5 files changed, 105 insertions(+), 18 deletions(-) > > diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp > index cd03859..c9cf124 100644 > --- a/src/glsl/ir_builder.cpp > +++ b/src/glsl/ir_builder.cpp > @@ -567,6 +567,12 @@ csel(operand a, operand b, operand c) > } > > ir_expression * > +bitfield_extract(operand a, operand b, operand c) > +{ > + return expr(ir_triop_bitfield_extract, a, b, c); > +} > + > +ir_expression * > bitfield_insert(operand a, operand b, operand c, operand d) > { > void *mem_ctx = ralloc_parent(a.val); > diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h > index f76453f..b483ebf 100644 > --- a/src/glsl/ir_builder.h > +++ b/src/glsl/ir_builder.h > @@ -200,6 +200,7 @@ ir_expression *interpolate_at_sample(operand a, operand > b); > ir_expression *fma(operand a, operand b, operand c); > ir_expression *lrp(operand x, operand y, operand a); > ir_expression *csel(operand a, operand b, operand c); > +ir_expression *bitfield_extract(operand a, operand b, operand c); > ir_expression *bitfield_insert(operand a, operand b, operand c, operand d); > > ir_swizzle *swizzle(operand a, int swizzle, int components); > diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h > index b955874..265b223 100644 > --- a/src/glsl/ir_optimization.h > +++ b/src/glsl/ir_optimization.h > @@ -69,6 +69,7 @@ enum lower_packing_builtins_op { > LOWER_UNPACK_UNORM_4x8 = 0x0800, > > LOWER_PACK_USE_BFI = 0x1000, > + LOWER_PACK_USE_BFE = 0x2000, > }; > > bool do_common_optimization(exec_list *ir, bool linked, > diff --git a/src/glsl/lower_packing_builtins.cpp > b/src/glsl/lower_packing_builtins.cpp > index be17e1d..e77e5d9 100644 > --- a/src/glsl/lower_packing_builtins.cpp > +++ b/src/glsl/lower_packing_builtins.cpp > @@ -118,6 +118,7 @@ public: > *rvalue = split_unpack_half_2x16(op0); > break; > case LOWER_PACK_UNPACK_NONE: > + default: > assert(!"not reached"); > break; > } > @@ -307,6 +308,39 @@ private: > } > > /** > + * \brief Unpack a uint32 into two int16's. > + * > + * Specifically each 16-bit value is sign-extended to the full width of an > + * int32 on return. > + */ > + ir_rvalue * > + unpack_uint_to_ivec2(ir_rvalue *uint_rval) > + { > + assert(uint_rval->type == glsl_type::uint_type); > + > + if (!(op_mask & LOWER_PACK_USE_BFE)) { > + return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), > + constant(16u)), > + constant(16u)); > + } > + > + ir_variable *i = factory.make_temp(glsl_type::int_type, > + "tmp_unpack_uint_to_ivec2_i"); > + factory.emit(assign(i, u2i(uint_rval))); > + > + /* ivec2 i2; */ > + ir_variable *i2 = factory.make_temp(glsl_type::ivec4_type, > + "tmp_unpack_uint_to_ivec4_i2");
This should, of course, be an ivec2, as the comment indicates. Oops! > + > + factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)), > + WRITEMASK_X)); > + factory.emit(assign(i2, bitfield_extract(i, constant(16), > constant(16)), > + WRITEMASK_Y)); > + > + return deref(i2).val; > + } > + > + /** > * \brief Unpack a uint32 into four uint8's. > * > * Interpret the given uint32 as a uint8 4-tuple where the uint32's least > @@ -327,21 +361,69 @@ private: > ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type, > "tmp_unpack_uint_to_uvec4_u4"); > > - /* u4.x = u & 0xffu; */ > - factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X)); > + if (op_mask & LOWER_PACK_USE_BFE) { > + factory.emit(assign(u4, bitfield_extract(u, constant(0), > constant(8)), > + WRITEMASK_X)); > + factory.emit(assign(u4, bitfield_extract(u, constant(8), > constant(8)), > + WRITEMASK_Y)); > + factory.emit(assign(u4, bitfield_extract(u, constant(16), > constant(8)), > + WRITEMASK_Z)); > + factory.emit(assign(u4, bitfield_extract(u, constant(24), > constant(8)), > + WRITEMASK_W)); Not sure if this is desirable. Perhaps only do it for the middle 2 ops which are actually 2 ops and leave the first and last single-op ones alone? > + } else { > + /* u4.x = u & 0xffu; */ > + factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X)); > + > + /* u4.y = (u >> 8u) & 0xffu; */ > + factory.emit(assign(u4, bit_and(rshift(u, constant(8u)), > + constant(0xffu)), WRITEMASK_Y)); > + > + /* u4.z = (u >> 16u) & 0xffu; */ > + factory.emit(assign(u4, bit_and(rshift(u, constant(16u)), > + constant(0xffu)), WRITEMASK_Z)); > + > + /* u4.w = (u >> 24u) */ > + factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W)); > + } > > - /* u4.y = (u >> 8u) & 0xffu; */ > - factory.emit(assign(u4, bit_and(rshift(u, constant(8u)), > - constant(0xffu)), WRITEMASK_Y)); > + return deref(u4).val; > + } > > - /* u4.z = (u >> 16u) & 0xffu; */ > - factory.emit(assign(u4, bit_and(rshift(u, constant(16u)), > - constant(0xffu)), WRITEMASK_Z)); > + /** > + * \brief Unpack a uint32 into four int8's. > + * > + * Specifically each 8-bit value is sign-extended to the full width of an > + * int32 on return. > + */ > + ir_rvalue * > + unpack_uint_to_ivec4(ir_rvalue *uint_rval) > + { > + assert(uint_rval->type == glsl_type::uint_type); > > - /* u4.w = (u >> 24u) */ > - factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W)); > + if (!(op_mask & LOWER_PACK_USE_BFE)) { > + return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)), > + constant(24u)), > + constant(24u)); > + } > > - return deref(u4).val; > + ir_variable *i = factory.make_temp(glsl_type::int_type, > + "tmp_unpack_uint_to_ivec4_i"); > + factory.emit(assign(i, u2i(uint_rval))); > + > + /* ivec4 i4; */ > + ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type, > + "tmp_unpack_uint_to_ivec4_i4"); > + > + factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)), > + WRITEMASK_X)); > + factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)), > + WRITEMASK_Y)); > + factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)), > + WRITEMASK_Z)); > + factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)), > + WRITEMASK_W)); > + > + return deref(i4).val; > } > > /** > @@ -490,9 +572,7 @@ private: > assert(uint_rval->type == glsl_type::uint_type); > > ir_rvalue *result = > - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), > - constant(16)), > - constant(16u))), > + clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)), > constant(32767.0f)), > constant(-1.0f), > constant(1.0f)); > @@ -549,9 +629,7 @@ private: > assert(uint_rval->type == glsl_type::uint_type); > > ir_rvalue *result = > - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)), > - constant(24u)), > - constant(24u))), > + clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)), > constant(127.0f)), > constant(-1.0f), > constant(1.0f)); > diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > index d3c7238..123d1b7 100644 > --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > @@ -5996,7 +5996,8 @@ st_link_shader(struct gl_context *ctx, struct > gl_shader_program *prog) > LOWER_UNPACK_HALF_2x16; > > if (ctx->Extensions.ARB_gpu_shader5) > - lower_inst |= LOWER_PACK_USE_BFI; > + lower_inst |= LOWER_PACK_USE_BFI | > + LOWER_PACK_USE_BFE; > > lower_packing_builtins(ir, lower_inst); > } > -- > 2.4.6 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev