On Thu, Aug 20, 2015 at 6:00 PM, Ilia Mirkin <imir...@alum.mit.edu> wrote: > It is fairly tricky to detect the proper conditions for using bitfield > insert, but easy to just use it up front. This removes a lot of > instructions on nvc0 when invoking the packing builtins. > > Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> > --- > > Not sure if all backends will benefit from this, but nvc0 definitely > does, since it has a "insert bitfield" instruction which handles > everything all-in-one (the combined offset/bits argument takes the > position of the third source, which requires no extra logic when > they're both immediate). > > It would be just as easy to use the 3-arg bfi op (no masking necessary > since we know that we start with a clean value), but gallium doesn't > support it. (Not for the least of reasons is that nvc0 doesn't support > it and I'm the one that added BFI to TGSI.) > > Unpacking could use BFE, really the snorm logic would greatly benefit > from using an IBFE (thus avoiding the shifts back and forth), > but... later. > > src/glsl/ir_optimization.h | 4 +++- > src/glsl/lower_packing_builtins.cpp | 24 +++++++++++++++++++++++- > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 3 +++ > 3 files changed, 29 insertions(+), 2 deletions(-) > > diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h > index eef107e..b955874 100644 > --- a/src/glsl/ir_optimization.h > +++ b/src/glsl/ir_optimization.h > @@ -66,7 +66,9 @@ enum lower_packing_builtins_op { > LOWER_UNPACK_SNORM_4x8 = 0x0200, > > LOWER_PACK_UNORM_4x8 = 0x0400, > - LOWER_UNPACK_UNORM_4x8 = 0x0800 > + LOWER_UNPACK_UNORM_4x8 = 0x0800, > + > + LOWER_PACK_USE_BFI = 0x1000, > }; > > bool do_common_optimization(exec_list *ir, bool linked, > diff --git a/src/glsl/lower_packing_builtins.cpp > b/src/glsl/lower_packing_builtins.cpp > index a6fb8a8..be17e1d 100644 > --- a/src/glsl/lower_packing_builtins.cpp > +++ b/src/glsl/lower_packing_builtins.cpp > @@ -225,6 +225,14 @@ private: > "tmp_pack_uvec2_to_uint"); > factory.emit(assign(u, uvec2_rval)); > > + if (op_mask & LOWER_PACK_USE_BFI) { > + return bitfield_insert( > + bit_and(swizzle_x(u), constant(0xffffu)), > + swizzle_y(u), > + constant(16), > + constant(16));
Indent bitfield_insert's arguments to align with its "(" (and probably put bit_and on the same line) > + } > + > /* return (u.y << 16) | (u.x & 0xffff); */ > return bit_or(lshift(swizzle_y(u), constant(16u)), > bit_and(swizzle_x(u), constant(0xffffu))); > @@ -242,9 +250,23 @@ private: > { > assert(uvec4_rval->type == glsl_type::uvec4_type); > > - /* uvec4 u = UVEC4_RVAL; */ > ir_variable *u = factory.make_temp(glsl_type::uvec4_type, > "tmp_pack_uvec4_to_uint"); > + > + if (op_mask & LOWER_PACK_USE_BFI) { > + /* uvec4 u = UVEC4_RVAL; */ > + factory.emit(assign(u, uvec4_rval)); > + > + return bitfield_insert( > + bitfield_insert( > + bitfield_insert( > + bit_and(swizzle_x(u), constant(0xffu)), > + swizzle_y(u), constant(8), constant(8)), > + swizzle_z(u), constant(16), constant(8)), > + swizzle_w(u), constant(24), constant(8)); I don't know if there's a nice way to indent such a long expression, but I'd indent the lines after the return three spaces more than the first bitfield_insert. > + } > + > + /* uvec4 u = UVEC4_RVAL & 0xff */ > factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu)))); > > /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */ > diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > index eb47685..d3c7238 100644 > --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > @@ -5995,6 +5995,9 @@ st_link_shader(struct gl_context *ctx, struct > gl_shader_program *prog) > LOWER_PACK_HALF_2x16 | > LOWER_UNPACK_HALF_2x16; > > + if (ctx->Extensions.ARB_gpu_shader5) > + lower_inst |= LOWER_PACK_USE_BFI; > + > lower_packing_builtins(ir, lower_inst); > } _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev