It is fairly tricky to detect the proper conditions for using bitfield insert, but easy to just use it up front. This removes a lot of instructions on nvc0 when invoking the packing builtins.
Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> --- Not sure if all backends will benefit from this, but nvc0 definitely does, since it has a "insert bitfield" instruction which handles everything all-in-one (the combined offset/bits argument takes the position of the third source, which requires no extra logic when they're both immediate). It would be just as easy to use the 3-arg bfi op (no masking necessary since we know that we start with a clean value), but gallium doesn't support it. (Not for the least of reasons is that nvc0 doesn't support it and I'm the one that added BFI to TGSI.) Unpacking could use BFE, really the snorm logic would greatly benefit from using an IBFE (thus avoiding the shifts back and forth), but... later. src/glsl/ir_optimization.h | 4 +++- src/glsl/lower_packing_builtins.cpp | 24 +++++++++++++++++++++++- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 3 +++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index eef107e..b955874 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -66,7 +66,9 @@ enum lower_packing_builtins_op { LOWER_UNPACK_SNORM_4x8 = 0x0200, LOWER_PACK_UNORM_4x8 = 0x0400, - LOWER_UNPACK_UNORM_4x8 = 0x0800 + LOWER_UNPACK_UNORM_4x8 = 0x0800, + + LOWER_PACK_USE_BFI = 0x1000, }; bool do_common_optimization(exec_list *ir, bool linked, diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp index a6fb8a8..be17e1d 100644 --- a/src/glsl/lower_packing_builtins.cpp +++ b/src/glsl/lower_packing_builtins.cpp @@ -225,6 +225,14 @@ private: "tmp_pack_uvec2_to_uint"); factory.emit(assign(u, uvec2_rval)); + if (op_mask & LOWER_PACK_USE_BFI) { + return bitfield_insert( + bit_and(swizzle_x(u), constant(0xffffu)), + swizzle_y(u), + constant(16), + constant(16)); + } + /* return (u.y << 16) | (u.x & 0xffff); */ return bit_or(lshift(swizzle_y(u), constant(16u)), bit_and(swizzle_x(u), constant(0xffffu))); @@ -242,9 +250,23 @@ private: { assert(uvec4_rval->type == glsl_type::uvec4_type); - /* uvec4 u = UVEC4_RVAL; */ ir_variable *u = factory.make_temp(glsl_type::uvec4_type, "tmp_pack_uvec4_to_uint"); + + if (op_mask & LOWER_PACK_USE_BFI) { + /* uvec4 u = UVEC4_RVAL; */ + factory.emit(assign(u, uvec4_rval)); + + return bitfield_insert( + bitfield_insert( + bitfield_insert( + bit_and(swizzle_x(u), constant(0xffu)), + swizzle_y(u), constant(8), constant(8)), + swizzle_z(u), constant(16), constant(8)), + swizzle_w(u), constant(24), constant(8)); + } + + /* uvec4 u = UVEC4_RVAL & 0xff */ factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu)))); /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */ diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index eb47685..d3c7238 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -5995,6 +5995,9 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16; + if (ctx->Extensions.ARB_gpu_shader5) + lower_inst |= LOWER_PACK_USE_BFI; + lower_packing_builtins(ir, lower_inst); } -- 2.4.6 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev