On Fri, Oct 31, 2014 at 7:35 PM, Connor Abbott <cwabbo...@gmail.com> wrote: > On Fri, Oct 31, 2014 at 9:27 PM, Matt Turner <matts...@gmail.com> wrote: >> --- >> src/mesa/drivers/dri/i965/Makefile.sources | 1 + >> src/mesa/drivers/dri/i965/brw_fs.cpp | 2 + >> src/mesa/drivers/dri/i965/brw_fs.h | 1 + >> .../drivers/dri/i965/brw_fs_combine_constants.cpp | 222 >> +++++++++++++++++++++ >> 4 files changed, 226 insertions(+) >> create mode 100644 src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp >> >> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources >> b/src/mesa/drivers/dri/i965/Makefile.sources >> index 26bf458..8dc89f4 100644 >> --- a/src/mesa/drivers/dri/i965/Makefile.sources >> +++ b/src/mesa/drivers/dri/i965/Makefile.sources >> @@ -54,6 +54,7 @@ i965_FILES = \ >> brw_ff_gs_emit.c \ >> brw_fs.cpp \ >> brw_fs_channel_expressions.cpp \ >> + brw_fs_combine_constants.cpp \ >> brw_fs_copy_propagation.cpp \ >> brw_fs_cse.cpp \ >> brw_fs_dead_code_eliminate.cpp \ >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index b594331..d763edf 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -3606,6 +3606,8 @@ fs_visitor::run() >> dead_code_eliminate(); >> } >> >> + opt_combine_constants(); >> + >> lower_uniform_pull_constant_loads(); >> >> assign_curb_setup(); >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h >> b/src/mesa/drivers/dri/i965/brw_fs.h >> index 2fa0949..f9cb99f 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.h >> +++ b/src/mesa/drivers/dri/i965/brw_fs.h >> @@ -454,6 +454,7 @@ public: >> void no16(const char *msg, ...); >> void lower_uniform_pull_constant_loads(); >> bool lower_load_payload(); >> + void opt_combine_constants(); >> >> void push_force_uncompressed(); >> void pop_force_uncompressed(); >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp >> b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp >> new file mode 100644 >> index 0000000..eef6bb8 >> --- /dev/null >> +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp >> @@ -0,0 +1,222 @@ >> +/* >> + * Copyright © 2014 Intel Corporation >> + * >> + * Permission is hereby granted, free of charge, to any person obtaining a >> + * copy of this software and associated documentation files (the >> "Software"), >> + * to deal in the Software without restriction, including without limitation >> + * the rights to use, copy, modify, merge, publish, distribute, sublicense, >> + * and/or sell copies of the Software, and to permit persons to whom the >> + * Software is furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice (including the next >> + * paragraph) shall be included in all copies or substantial portions of the >> + * Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS >> OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR >> OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER >> DEALINGS >> + * IN THE SOFTWARE. >> + */ >> + >> +/** @file brw_fs_combine_constants.cpp >> + * >> + * This file contains the opt_combine_constants() pass that runs after the >> + * regular optimization loop. It passes over the instruction list and >> + * selectively promotes immediate values to registers by emitting a mov(1) >> + * instruction. >> + * >> + * This is useful on Gen 7 particularly, because a few instructions can be >> + * coissued (i.e., issued in the same cycle as another thread on the same EU >> + * issues an instruction) under some circumstances, one of which is that >> they >> + * cannot use immediate values. >> + */ >> + >> +#include "brw_fs.h" >> +#include "brw_fs_live_variables.h" >> +#include "brw_cfg.h" >> + >> +static bool >> +could_coissue(const fs_inst *inst) >> +{ >> + /* MAD can coissue, but while the PRM lists various restrictions for >> Align1 >> + * instructions related to data alignment and regioning, it doesn't list >> + * similar restrictions for Align16 instructions. I don't expect that >> there >> + * are any restrictions, since Align16 doesn't allow the kinds of >> operations >> + * that are restricted in Align1 mode. >> + * >> + * Since MAD is an Align16 instruction we assume it can always coissue. >> + */ >> + switch (inst->opcode) { >> + case BRW_OPCODE_MOV: >> + case BRW_OPCODE_CMP: >> + case BRW_OPCODE_ADD: >> + case BRW_OPCODE_MUL: >> + return true; >> + default: >> + return false; >> + } >> +} >> + >> +struct imm { >> + bblock_t *block; >> + float val; >> + >> + uint8_t subreg_offset; >> + uint16_t reg; >> + >> + uint16_t uses_by_coissue; >> +}; >> + >> +struct table { >> + struct imm *imm; >> + int size; >> + int len; >> +}; >> + >> +static struct imm * >> +find_imm(struct table *table, float val) >> +{ >> + assert(signbit(val) == 0); >> + >> + for (int i = 0; i < table->len; i++) { >> + if (table->imm[i].val == val) { >> + return &table->imm[i]; >> + } >> + } >> + return NULL; >> +} >> + >> +static struct imm * >> +new_imm(struct table *table, void *mem_ctx) >> +{ >> + if (table->len == table->size) { >> + table->size *= 2; >> + table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size); >> + } >> + return &table->imm[table->len++]; >> +} >> + >> +static int >> +compare(const void *a, const void *b) >> +{ >> + return ((struct imm *)a)->block->num - ((struct imm *)b)->block->num; >> +} >> + >> +void >> +fs_visitor::opt_combine_constants() >> +{ >> + void *const_ctx = ralloc_context(NULL); >> + >> + struct table table; >> + table.size = 8; >> + table.len = 0; >> + table.imm = ralloc_array(const_ctx, struct imm, table.size); >> + >> + cfg->calculate_idom(); >> + >> + /* Make a pass through all instructions and count the number of times >> each >> + * constant is used by coissueable instructions. >> + */ >> + foreach_block_and_inst(block, fs_inst, inst, cfg) { >> + if (!could_coissue(inst)) >> + continue; >> + >> + for (int i = 0; i < inst->sources; i++) { >> + if (inst->src[i].file != IMM) >> + continue; >> + >> + float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f); >> + struct imm *imm = find_imm(&table, val); >> + >> + if (imm) { >> + imm->block = cfg_t::intersect(block, imm->block); > > Like I mentioned in the other reply, we could be smarter about keeping > track of where to insert the MOV. We can keep track of what > instruction to insert the MOV before as well as which basic block to > put it in. If you see another instruction within the current basic > block, don't update anything. If you see another instruction within a > different basic block, then if the instruction's basic block is > dominated by the current basic block (i.e. the intersection returns > the current basic block), don't do anything, otherwise put the MOV at > the end of the basic block returned by the intersection.
Yeah, I think this is a good idea. I'll give this a try. >> + imm->uses_by_coissue += could_coissue(inst); >> + } else { >> + imm = new_imm(&table, const_ctx); >> + imm->block = block; >> + imm->val = val; >> + imm->uses_by_coissue = could_coissue(inst); >> + } >> + } >> + } >> + >> + /* Remove constants from the table that don't have enough uses to make >> them >> + * profitable to store in a register. >> + */ >> + for (int i = 0; i < table.len;) { >> + struct imm *imm = &table.imm[i]; >> + >> + if (imm->uses_by_coissue < 4) { >> + table.imm[i] = table.imm[table.len - 1]; >> + table.len--; >> + continue; >> + } >> + i++; >> + } >> + if (table.len == 0) { >> + ralloc_free(const_ctx); >> + return; >> + } >> + if (cfg->num_blocks != 1) >> + qsort(table.imm, table.len, sizeof(struct imm), compare); >> + >> + fs_reg reg(this, glsl_type::float_type); >> + reg.stride = 0; >> + int i = 0; >> + >> + foreach_block(block, cfg) { >> + backend_instruction *inst = block->first_non_control_flow_inst(); >> + >> + for (; i < table.len; i++) { >> + struct imm *imm = &table.imm[i]; >> + >> + if (imm->block->num != block->num) >> + break; >> + >> + fs_inst *mov = MOV(reg, fs_reg(imm->val)); >> + mov->exec_size = 1; >> + inst->insert_before(block, mov); >> + imm->reg = reg.reg; >> + imm->subreg_offset = reg.subreg_offset; >> + >> + reg.subreg_offset += sizeof(float); >> + if (reg.subreg_offset == 32) { > > Instead of a magic "32," can we put a 8 * sizeof(float) here to make > the intent more obvious? And shouldn't this be exec_size * > sizeof(float) since we can hold 16 floats in 16-wide mode? Sure, sounds good. With the ability to allocate single SIMD8 registers in SIMD16 programs these days, I think this code is okay but I'll give exec_size * sizeof(float) a try. >> + reg.reg = virtual_grf_alloc(1); >> + reg.subreg_offset = 0; >> + } >> + } >> + >> + foreach_inst_in_block(fs_inst, inst, block) { >> + if (inst->opcode == BRW_OPCODE_MOV && >> + inst->src[0].file == IMM && >> + inst->dst.stride == 0) >> + continue; >> + >> + /* Tell Matt privately if you actually read this far. */ > > Don't worry Matt, there are good things left in the world still :) > >> + for (int i = 0; i < inst->sources; i++) { >> + if (inst->src[i].file != IMM || >> + inst->src[i].type != BRW_REGISTER_TYPE_F) >> + continue; >> + >> + float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f); >> + struct imm *imm = find_imm(&table, val); >> + if (!imm) >> + continue; >> + >> + inst->src[i].file = GRF; >> + inst->src[i].reg = imm->reg; >> + inst->src[i].subreg_offset = imm->subreg_offset; >> + inst->src[i].stride = 0; >> + inst->src[i].negate = signbit(inst->src[i].fixed_hw_reg.dw1.f) >> != >> + signbit(imm->val); >> + assert(fabsf(inst->src[i].fixed_hw_reg.dw1.f) == imm->val); >> + } >> + } >> + } > > Instead of sorting the immediates and then walking over all the > instructions and looking up the immediates again, wouldn't it be more > efficient (and perhaps less code) to store all the instructions where > each immediate is used in a vector in the table entry, and then walk > over that here? I suspect so. I'll make that change. Thanks Connor! _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev