This greatly improves generated code, especially for the snorm variants, since it is able to get rid of the lshift/rshift for sext, as well as replacing each shift + mask with a single op.
Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> --- v1 -> v2: Only use BFE for .yz of a -> uvec4 conversion. src/glsl/ir_builder.cpp | 6 ++ src/glsl/ir_builder.h | 1 + src/glsl/ir_optimization.h | 1 + src/glsl/lower_packing_builtins.cpp | 103 +++++++++++++++++++++++++---- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 3 +- 5 files changed, 100 insertions(+), 14 deletions(-) diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp index cd03859..c9cf124 100644 --- a/src/glsl/ir_builder.cpp +++ b/src/glsl/ir_builder.cpp @@ -567,6 +567,12 @@ csel(operand a, operand b, operand c) } ir_expression * +bitfield_extract(operand a, operand b, operand c) +{ + return expr(ir_triop_bitfield_extract, a, b, c); +} + +ir_expression * bitfield_insert(operand a, operand b, operand c, operand d) { void *mem_ctx = ralloc_parent(a.val); diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h index f76453f..b483ebf 100644 --- a/src/glsl/ir_builder.h +++ b/src/glsl/ir_builder.h @@ -200,6 +200,7 @@ ir_expression *interpolate_at_sample(operand a, operand b); ir_expression *fma(operand a, operand b, operand c); ir_expression *lrp(operand x, operand y, operand a); ir_expression *csel(operand a, operand b, operand c); +ir_expression *bitfield_extract(operand a, operand b, operand c); ir_expression *bitfield_insert(operand a, operand b, operand c, operand d); ir_swizzle *swizzle(operand a, int swizzle, int components); diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index b955874..265b223 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -69,6 +69,7 @@ enum lower_packing_builtins_op { LOWER_UNPACK_UNORM_4x8 = 0x0800, LOWER_PACK_USE_BFI = 0x1000, + LOWER_PACK_USE_BFE = 0x2000, }; bool do_common_optimization(exec_list *ir, bool linked, diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp index be17e1d..1bd635e 100644 --- a/src/glsl/lower_packing_builtins.cpp +++ b/src/glsl/lower_packing_builtins.cpp @@ -118,6 +118,7 @@ public: *rvalue = split_unpack_half_2x16(op0); break; case LOWER_PACK_UNPACK_NONE: + default: assert(!"not reached"); break; } @@ -307,6 +308,39 @@ private: } /** + * \brief Unpack a uint32 into two int16's. + * + * Specifically each 16-bit value is sign-extended to the full width of an + * int32 on return. + */ + ir_rvalue * + unpack_uint_to_ivec2(ir_rvalue *uint_rval) + { + assert(uint_rval->type == glsl_type::uint_type); + + if (!(op_mask & LOWER_PACK_USE_BFE)) { + return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), + constant(16u)), + constant(16u)); + } + + ir_variable *i = factory.make_temp(glsl_type::int_type, + "tmp_unpack_uint_to_ivec2_i"); + factory.emit(assign(i, u2i(uint_rval))); + + /* ivec2 i2; */ + ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type, + "tmp_unpack_uint_to_ivec2_i2"); + + factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)), + WRITEMASK_X)); + factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)), + WRITEMASK_Y)); + + return deref(i2).val; + } + + /** * \brief Unpack a uint32 into four uint8's. * * Interpret the given uint32 as a uint8 4-tuple where the uint32's least @@ -330,13 +364,23 @@ private: /* u4.x = u & 0xffu; */ factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X)); - /* u4.y = (u >> 8u) & 0xffu; */ - factory.emit(assign(u4, bit_and(rshift(u, constant(8u)), - constant(0xffu)), WRITEMASK_Y)); - - /* u4.z = (u >> 16u) & 0xffu; */ - factory.emit(assign(u4, bit_and(rshift(u, constant(16u)), - constant(0xffu)), WRITEMASK_Z)); + if (op_mask & LOWER_PACK_USE_BFE) { + /* u4.y = bitfield_extract(u, 8, 8); */ + factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)), + WRITEMASK_Y)); + + /* u4.z = bitfield_extract(u, 16, 8); */ + factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)), + WRITEMASK_Z)); + } else { + /* u4.y = (u >> 8u) & 0xffu; */ + factory.emit(assign(u4, bit_and(rshift(u, constant(8u)), + constant(0xffu)), WRITEMASK_Y)); + + /* u4.z = (u >> 16u) & 0xffu; */ + factory.emit(assign(u4, bit_and(rshift(u, constant(16u)), + constant(0xffu)), WRITEMASK_Z)); + } /* u4.w = (u >> 24u) */ factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W)); @@ -345,6 +389,43 @@ private: } /** + * \brief Unpack a uint32 into four int8's. + * + * Specifically each 8-bit value is sign-extended to the full width of an + * int32 on return. + */ + ir_rvalue * + unpack_uint_to_ivec4(ir_rvalue *uint_rval) + { + assert(uint_rval->type == glsl_type::uint_type); + + if (!(op_mask & LOWER_PACK_USE_BFE)) { + return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)), + constant(24u)), + constant(24u)); + } + + ir_variable *i = factory.make_temp(glsl_type::int_type, + "tmp_unpack_uint_to_ivec4_i"); + factory.emit(assign(i, u2i(uint_rval))); + + /* ivec4 i4; */ + ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type, + "tmp_unpack_uint_to_ivec4_i4"); + + factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)), + WRITEMASK_X)); + factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)), + WRITEMASK_Y)); + factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)), + WRITEMASK_Z)); + factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)), + WRITEMASK_W)); + + return deref(i4).val; + } + + /** * \brief Lower a packSnorm2x16 expression. * * \param vec2_rval is packSnorm2x16's input @@ -490,9 +571,7 @@ private: assert(uint_rval->type == glsl_type::uint_type); ir_rvalue *result = - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), - constant(16)), - constant(16u))), + clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)), constant(32767.0f)), constant(-1.0f), constant(1.0f)); @@ -549,9 +628,7 @@ private: assert(uint_rval->type == glsl_type::uint_type); ir_rvalue *result = - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)), - constant(24u)), - constant(24u))), + clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)), constant(127.0f)), constant(-1.0f), constant(1.0f)); diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 26adf0e..0ab5ce2 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -5994,7 +5994,8 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) LOWER_UNPACK_HALF_2x16; if (ctx->Extensions.ARB_gpu_shader5) - lower_inst |= LOWER_PACK_USE_BFI; + lower_inst |= LOWER_PACK_USE_BFI | + LOWER_PACK_USE_BFE; lower_packing_builtins(ir, lower_inst); } -- 2.4.6 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev