A few comments below. Mostly just asking for explanation. 1-3 are
Reviewed-by: Jason Ekstrand <jason.ekstr...@intel.com> Obviously, don't merge 4/4 until it actually has users. --Jason On Thu, Jul 16, 2015 at 8:35 AM, Francisco Jerez <curroje...@riseup.net> wrote: > This lowering pass implements an algorithm to expand SIMDN > instructions into a sequence of SIMDM instructions in cases where the > hardware doesn't support the original execution size natively for some > particular instruction. The most important use-cases are: > > - Lowering send message instructions that don't support SIMD16 > natively into SIMD8 (several texturing, framebuffer write and typed > surface operations). > > - Lowering messages that don't support SIMD8 natively into SIMD16 > (*cough*gen4*cough*). > > - 64-bit precision operations (e.g. FP64 and 64-bit integer > multiplication). > > - SIMD32. > > The algorithm works by splitting the sources of the original > instruction into chunks of width appropriate for the lowered > instructions, and then interleaving the results component-wise into > the destination of the original instruction. The pass is controlled > by the get_lowered_simd_width() function that currently just returns > the original execution size making the whole pass a no-op for the > moment until some user is introduced. > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 142 > +++++++++++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_fs.h | 1 + > 2 files changed, 143 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index d031352..eeb6938 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -3204,6 +3204,147 @@ fs_visitor::lower_logical_sends() > return progress; > } > > +/** > + * Get the closest native SIMD width supported by the hardware for > instruction > + * \p inst. The instruction will be left untouched by > + * fs_visitor::lower_simd_width() if the returned value is equal to the > + * original execution size. > + */ > +static unsigned > +get_lowered_simd_width(const struct brw_device_info *devinfo, > + const fs_inst *inst) > +{ > + switch (inst->opcode) { > + default: > + return inst->exec_size; > + } > +} > + > +/** > + * The \p rows array of registers represents a \p num_rows by \p num_columns > + * matrix in row-major order, write it in column-major order into the > register > + * passed as destination. \p stride gives the separation between matrix > + * elements in the input in fs_builder::dispatch_width() units. > + */ > +static void > +emit_transpose(const fs_builder &bld, > + const fs_reg &dst, const fs_reg *rows, > + unsigned num_rows, unsigned num_columns, unsigned stride) I'm not sure what I think about calling this "emit_transpose". I guess it is kind of a transpose, but maybe it's more of a "gather". I'm not going to quibble about it though. > +{ > + fs_reg *const components = new fs_reg[num_rows * num_columns]; > + > + for (unsigned i = 0; i < num_columns; ++i) { > + for (unsigned j = 0; j < num_rows; ++j) > + components[num_rows * i + j] = offset(rows[j], bld, stride * i); > + } > + > + bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0); > + > + delete[] components; > +} > + > +bool > +fs_visitor::lower_simd_width() > +{ > + bool progress = false; > + > + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > + const unsigned lower_width = get_lowered_simd_width(devinfo, inst); > + > + if (lower_width != inst->exec_size) { > + /* Builder matching the original instruction. */ > + const fs_builder ibld = bld.at(block, inst) > + .exec_all(inst->force_writemask_all) > + .group(inst->exec_size, > inst->force_sechalf); > + > + /* Split the copies in chunks of the execution width of either the > + * original or the lowered instruction, whichever is lower. > + */ > + const unsigned copy_width = MIN2(lower_width, inst->exec_size); > + const unsigned n = inst->exec_size / copy_width; > + const unsigned dst_size = inst->regs_written * REG_SIZE / > + inst->dst.component_size(inst->exec_size); > + fs_reg dsts[4]; > + > + assert(n > 0 && n <= ARRAY_SIZE(dsts) && > + !inst->writes_accumulator && !inst->mlen); > + > + for (unsigned i = 0; i < n; i++) { > + /* Emit a copy of the original instruction with the lowered > width. > + * If the EOT flag was set throw it away except for the last > + * instruction to avoid killing the thread prematurely. > + */ > + fs_inst tmp_inst = *inst; > + tmp_inst.exec_size = lower_width; > + tmp_inst.eot = inst->eot && i == n - 1; > + > + /* Set exec_all if the lowered width is higher than the original > + * to avoid breaking the compiler invariant that no control > + * flow-masked instruction is wider than the shader's > + * dispatch_width. Then emit the lowered instruction. > + */ > + const fs_builder lbld = ibld.exec_all(lower_width > > inst->exec_size) > + .group(lower_width, i); > + fs_inst *split_inst = lbld.emit(tmp_inst); Why are you emitting the split instruction before the source transposes? Don't they need to go first? Maybe I'm missing something. > + > + for (unsigned j = 0; j < inst->sources; j++) { > + if (inst->src[j].file != BAD_FILE && > + !is_uniform(inst->src[j])) { > + /* Get the i-th copy_width-wide chunk of the source. */ > + const fs_reg src = horiz_offset(inst->src[j], copy_width * > i); > + const unsigned src_size = inst->regs_read(j) * REG_SIZE / > + inst->src[j].component_size(inst->exec_size); > + > + /* Use a trivial transposition to copy one every n > + * copy_width-wide components of the register into a > + * temporary passed as source to the lowered instruction. > + */ > + split_inst->src[j] = lbld.vgrf(inst->src[j].type, > src_size); > + emit_transpose(lbld.group(copy_width, 0).at(block, > split_inst), > + split_inst->src[j], &src, 1, src_size, n); Shouldn't this be group(copy_width, i)? > + } > + } > + > + if (inst->regs_written) { > + /* Allocate enough space to hold the result of the lowered > + * instruction and fix up the number of registers written. > + */ > + split_inst->dst = dsts[i] = > + lbld.vgrf(inst->dst.type, dst_size); > + split_inst->regs_written = > + DIV_ROUND_UP(inst->regs_written * lower_width, > + inst->exec_size); > + } > + } > + > + if (inst->regs_written) { > + /* Distance between useful channels in the temporaries, skipping > + * garbage if the lowered instruction is wider than the original. > + */ > + const unsigned m = lower_width / copy_width; > + > + /* Interleave the components of the result from the lowered > + * instructions. We need to set exec_all() when copying more > than > + * one half per component, because LOAD_PAYLOAD (in terms of > which > + * emit_transpose is implemented) can only use the same channel > + * enable signals for all of its non-header sources. > + */ > + emit_transpose(ibld.exec_all(inst->exec_size > copy_width) > + .group(copy_width, 0), > + inst->dst, dsts, n, dst_size, m); > + } > + > + inst->remove(block); > + progress = true; > + } > + } > + > + if (progress) > + invalidate_live_intervals(); > + > + return progress; > +} > + > void > fs_visitor::dump_instructions() > { > @@ -3655,6 +3796,7 @@ fs_visitor::optimize() > int iteration = 0; > int pass_num = 0; > > + OPT(lower_simd_width); > OPT(lower_logical_sends); > > do { > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index f3850d1..9582648 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -184,6 +184,7 @@ public: > bool lower_load_payload(); > bool lower_logical_sends(); > bool lower_integer_multiplication(); > + bool lower_simd_width(); > bool opt_combine_constants(); > > void emit_dummy_fs(); > -- > 2.4.3 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev