Consider the following code:

    dp4(8) g8<1>.xF g1<4,4,1>F g5<4,4,1>F { align16 WE_normal 1Q };
    mov(8) m3<1>.xF g8<4,4,1>.xF { align16 WE_normal 1Q };

Thanks to our existing compute-to-MRF code, this becomes: dp4(8) m3<1>.xF g1<4,4,1>F g5<4,4,1>F { align16 WE_normal 1Q }; However: dp4(8) g8<1>.xF g1<4,4,1>F g5<4,4,1>F { align16 WE_normal 1Q }; mov(8) m3<1>.yF g8<4,4,1>.xF { align16 WE_normal 1Q }; does not get optimized since the MRF and temporary GRF use different components, and the code does not yet support rewriting swizzles in the general case. Scalars are an easy special case: since there's only one component, you can simply change the writemask to store it in the proper component for the MRF. Reduces a simple shader in Unigine Tropics from 12 instructions to 9 by eliminating superfluous MOVs for 3 of the 4 vector components. Cc: Eric Anholt <e...@anholt.net> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 48 ++++++++++++++++++++++---------- 1 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 5238ff5..c8daff2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -566,6 +566,9 @@ vec4_visitor::opt_compute_to_mrf() if (this->virtual_grf_use[inst->src[0].reg] > ip) continue; + bool scalar_source = inst->src[0].swizzle < 4; /* SWIZZLE_X, Y, Z, or W */ + bool scalar_destination = is_power_of_two(inst->dst.writemask); + /* We need to check interference with the MRF between this * instruction and the earliest instruction involved in writing * the GRF we're eliminating. To do that, keep track of which @@ -573,26 +576,36 @@ vec4_visitor::opt_compute_to_mrf() */ bool chans_needed[4] = {false, false, false, false}; int chans_remaining = 0; - for (int i = 0; i < 4; i++) { - int chan = BRW_GET_SWZ(inst->src[0].swizzle, i); + if (scalar_source && scalar_destination) { + /* If we're just moving a scalar, we don't need to worry about + * complicated swizzling. 
Anything that writes to the single channel + * of the temporary GRF could easily write to the single channel of + * our MRF. + */ + chans_needed[inst->src[0].swizzle] = true; + chans_remaining = 1; + } else { + for (int i = 0; i < 4; i++) { + int chan = BRW_GET_SWZ(inst->src[0].swizzle, i); - if (!(inst->dst.writemask & (1 << i))) - continue; + if (!(inst->dst.writemask & (1 << i))) + continue; - /* We don't handle compute-to-MRF across a swizzle. We would - * need to be able to rewrite instructions above to output - * results to different channels. - */ - if (chan != i) - chans_remaining = 5; + /* We don't handle compute-to-MRF across a swizzle. We would + * need to be able to rewrite instructions above to output + * results to different channels. + */ + if (chan != i) + chans_remaining = 5; - if (!chans_needed[chan]) { - chans_needed[chan] = true; - chans_remaining++; + if (!chans_needed[chan]) { + chans_needed[chan] = true; + chans_remaining++; + } } + if (chans_remaining > 4) + continue; } - if (chans_remaining > 4) - continue; /* Now walk up the instruction stream trying to see if we can * rewrite everything writing to the GRF into the MRF instead. @@ -695,6 +708,11 @@ vec4_visitor::opt_compute_to_mrf() scan_inst->dst.reg = mrf; scan_inst->dst.reg_offset = 0; scan_inst->saturate |= inst->saturate; + /* For scalars, we may have selected another channel (i.e. we + * may be replacing g8.x with m3.y). Update the writemask. + */ + if (scalar_source) + scan_inst->dst.writemask = inst->dst.writemask; } scan_inst = (vec4_instruction *)scan_inst->next; } -- 1.7.7.6 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev