On Fri, Dec 8, 2017 at 10:04 AM, Pohjolainen, Topi < topi.pohjolai...@gmail.com> wrote:
> On Fri, Dec 08, 2017 at 08:55:56AM -0800, Jason Ekstrand wrote: > > On Fri, Dec 8, 2017 at 12:40 AM, Pohjolainen, Topi < > > topi.pohjolai...@gmail.com> wrote: > > > > > On Wed, Dec 06, 2017 at 08:34:19PM -0800, Jason Ekstrand wrote: > > > > This rewires the logic for assigning uniform locations to work in > terms > > > > of "complex alignments". The basic idea is that, as we walk the > list of > > > > instructions, we keep track of the alignment and continuity > requirements > > > > of each slot and assert that the alignments all match up. We then > use > > > > those alignments in the compaction stage to ensure that everything > gets > > > > placed at a properly aligned register. The old mechanism handled > > > > alignments by special-casing each of the bit sizes and placing 64-bit > > > > values first followed by 32-bit values. > > > > > > > > The old scheme had the advantage of never leaving a hole since all > the > > > > 64-bit values could be tightly packed and so could the 32-bit values. > > > > However, the new scheme has no type size special cases so it handles > not > > > > only 32 and 64-bit types but should gracefully extend to 16 and 8-bit > > > > types as the need arises. > > > > > > > > Tested-by: Jose Maria Casanova Crespo <jmcasan...@igalia.com> > > > > --- > > > > src/intel/compiler/brw_fs.cpp | 307 ++++++++++++++++++++++++------ > > > ------------ > > > > 1 file changed, 174 insertions(+), 133 deletions(-) > > > > > > > > diff --git a/src/intel/compiler/brw_fs.cpp > b/src/intel/compiler/brw_fs. > > > cpp > > > > index 93bb6b4..41260b4 100644 > > > > --- a/src/intel/compiler/brw_fs.cpp > > > > +++ b/src/intel/compiler/brw_fs.cpp > > > > @@ -1906,62 +1906,6 @@ fs_visitor::compact_virtual_grfs() > > > > return progress; > > > > } > > > > > > > > -static void > > > > -set_push_pull_constant_loc(unsigned uniform, int *chunk_start, > > > > - unsigned *max_chunk_bitsize, > > > > - bool contiguous, unsigned bitsize, > > > > - const unsigned target_bitsize, > > > > - int *push_constant_loc, int > > > *pull_constant_loc, > > > > - unsigned *num_push_constants, > > > > - unsigned *num_pull_constants, > > > > - const unsigned max_push_components, > > > > - const unsigned max_chunk_size, > > > > - bool allow_pull_constants, > > > > - struct brw_stage_prog_data > *stage_prog_data) > > > > -{ > > > > - /* This is the first live uniform in the chunk */ > > > > - if (*chunk_start < 0) > > > > - *chunk_start = uniform; > > > > - > > > > - /* Keep track of the maximum bit size access in contiguous > uniforms > > > */ > > > > - *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); > > > > - > > > > - /* If this element does not need to be contiguous with the next, > we > > > > - * split at this point and everything between chunk_start and u > > > forms a > > > > - * single chunk. > > > > - */ > > > > - if (!contiguous) { > > > > - /* If bitsize doesn't match the target one, skip it */ > > > > - if (*max_chunk_bitsize != target_bitsize) { > > > > - /* FIXME: right now we only support 32 and 64-bit accesses > */ > > > > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > > > > - *max_chunk_bitsize = 0; > > > > - *chunk_start = -1; > > > > - return; > > > > - } > > > > - > > > > - unsigned chunk_size = uniform - *chunk_start + 1; > > > > - > > > > - /* Decide whether we should push or pull this parameter. In > the > > > > - * Vulkan driver, push constants are explicitly exposed via > the > > > API > > > > - * so we push everything. In GL, we only push small arrays. > > > > - */ > > > > - if (!allow_pull_constants || > > > > - (*num_push_constants + chunk_size <= max_push_components > && > > > > - chunk_size <= max_chunk_size)) { > > > > - assert(*num_push_constants + chunk_size <= > > > max_push_components); > > > > - for (unsigned j = *chunk_start; j <= uniform; j++) > > > > - push_constant_loc[j] = (*num_push_constants)++; > > > > - } else { > > > > - for (unsigned j = *chunk_start; j <= uniform; j++) > > > > - pull_constant_loc[j] = (*num_pull_constants)++; > > > > - } > > > > - > > > > - *max_chunk_bitsize = 0; > > > > - *chunk_start = -1; > > > > - } > > > > -} > > > > - > > > > static int > > > > get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > > > > { > > > > @@ -1977,6 +1921,98 @@ get_subgroup_id_param_index(const > > > brw_stage_prog_data *prog_data) > > > > } > > > > > > > > /** > > > > + * Struct for handling complex alignments. > > > > + * > > > > + * A complex alignment is stored as multiplier and an offset. A > value > > > is > > > > + * considered to be aligned if it is congruent to the offset modulo > the > > > > + * multiplier. > > > > + */ > > > > +struct cplx_align { > > > > + unsigned mul:4; > > > > + unsigned offset:4; > > > > +}; > > > > + > > > > +#define CPLX_ALIGN_MAX_MUL 8 > > > > + > > > > +static void > > > > +cplx_align_assert_sane(struct cplx_align a) > > > > +{ > > > > + assert(a.mul > 0 && util_is_power_of_two(a.mul)); > > > > + assert(a.offset < a.mul); > > > > +} > > > > + > > > > +/** > > > > + * Combines two alignments to produce a least multiple of sorts. > > > > + * > > > > + * The returned alignment is the smallest (in terms of multiplier) > such > > > that > > > > + * anything aligned to both a and b will be aligned to the new > > > alignment. > > > > + * This function will assert-fail if a and b are not compatible, > i.e. > > > if the > > > > + * offset parameters are such that no common alignment is possible. > > > > + */ > > > > +static struct cplx_align > > > > +cplx_align_combine(struct cplx_align a, struct cplx_align b) > > > > +{ > > > > + cplx_align_assert_sane(a); > > > > + cplx_align_assert_sane(b); > > > > + > > > > + /* Assert that the alignments agree. */ > > > > + assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); > > > > + > > > > + return a.mul > b.mul ? a : b; > > > > +} > > > > + > > > > +/** > > > > + * Apply a complex alignment > > > > + * > > > > + * This function will return the smallest number greater than or > equal > > > to > > > > + * offset that is aligned to align. > > > > + */ > > > > +static unsigned > > > > +cplx_align_apply(struct cplx_align align, unsigned offset) > > > > +{ > > > > + return ALIGN(offset - align.offset, align.mul) + align.offset; > > > > +} > > > > + > > > > +#define UNIFORM_SLOT_SIZE 4 > > > > + > > > > +struct uniform_slot_info { > > > > + /** True if the given uniform slot is live */ > > > > + unsigned is_live:1; > > > > + > > > > + /** True if this slot and the slot must remain contiguous */ > > > > > > Missing a word? Maybe "...this slot and the slot after must..."? > > > > > > > Yes, "slot after" and "next slot" both work. I'll fix that up. > > > > > > > > + unsigned contiguous:1; > > > > + > > > > + struct cplx_align align; > > > > +}; > > > > + > > > > +static void > > > > +mark_uniform_slots_read(struct uniform_slot_info *slots, > > > > + unsigned num_slots, unsigned alignment) > > > > +{ > > > > + assert(alignment > 0 && util_is_power_of_two(alignment)); > > > > + assert(alignment <= CPLX_ALIGN_MAX_MUL); > > > > + > > > > + /* We can't align a slot to anything less than the slot size */ > > > > + alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); > > > > + > > > > + struct cplx_align align = {alignment, 0}; > > > > + cplx_align_assert_sane(align); > > > > + > > > > + for (unsigned i = 0; i < num_slots; i++) { > > > > + slots[i].is_live = true; > > > > + if (i < num_slots - 1) > > > > + slots[i].contiguous = true; > > > > + > > > > + align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); > > > > + if (slots[i].align.mul == 0) { > > > > + slots[i].align = align; > > > > + } else { > > > > + slots[i].align = cplx_align_combine(slots[i].align, > align); > > > > + } > > > > + } > > > > +} > > > > + > > > > +/** > > > > * Assign UNIFORM file registers to either push constants or pull > > > constants. > > > > * > > > > * We allow a fragment shader to have more than the specified > minimum > > > > @@ -1994,60 +2030,44 @@ fs_visitor::assign_constant_locations() > > > > return; > > > > } > > > > > > > > - bool is_live[uniforms]; > > > > - memset(is_live, 0, sizeof(is_live)); > > > > - unsigned bitsize_access[uniforms]; > > > > - memset(bitsize_access, 0, sizeof(bitsize_access)); > > > > - > > > > - /* For each uniform slot, a value of true indicates that the > given > > > slot and > > > > - * the next slot must remain contiguous. This is used to keep us > > > from > > > > - * splitting arrays and 64-bit values apart. > > > > - */ > > > > - bool contiguous[uniforms]; > > > > - memset(contiguous, 0, sizeof(contiguous)); > > > > + struct uniform_slot_info slots[uniforms]; > > > > + memset(slots, 0, sizeof(slots)); > > > > > > > > - /* First, we walk through the instructions and do two things: > > > > - * > > > > - * 1) Figure out which uniforms are live. > > > > - * > > > > - * 2) Mark any indirectly used ranges of registers as > contiguous. > > > > - * > > > > - * Note that we don't move constant-indexed accesses to arrays. > No > > > > - * testing has been done of the performance impact of this > choice. > > > > - */ > > > > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > > > > for (int i = 0 ; i < inst->sources; i++) { > > > > if (inst->src[i].file != UNIFORM) > > > > continue; > > > > > > > > - int constant_nr = inst->src[i].nr + inst->src[i].offset / > 4; > > > > + /* NIR tightly packs things so the uniform number might > not be > > > > + * aligned (if we have a double right after a float, for > > > instance). > > > > + * This is fine because the process of re-arranging them > will > > > ensure > > > > + * that things are properly aligned. The offset into that > > > uniform, > > > > + * however, must be aligned. > > > > > > What about in case of types having size less than 32-bits (i.e., > smaller > > > than > > > uniform slot size? Take, for example, invididual components of 16-bits > > > vectors. Their offsets are not necessarily aligned on 32-bit slot > > > boundaries. > > > > > > > I did think about that. :-) My plan was that they would have their > uniform > > nr set to floor(offset/4) with an offset of offset % 4. Then their slot > > would get aligned to MAX(2, UNIFORM_SLOT_SIZE) = 4 (see > > mark_uniform_slots_read above). Assuming things are reasonably aligned > > (i.e., no one puts a float at an offset of 2 bytes), this should work out > > ok. > > Right, I did notice the MAX2() in mark_uniform_slots_read(), and like said > below, things here seem to work also. Mostly I was thinking the division > inst->src[i].offset / UNIFORM_SLOT_SIZE. A little note saying that the > offset > may not align to slots but that it doesn't need to. One is only interested > if > the slot itself is used and that it is insignificant which part of the slot > the instruction sources. > > Took me a while to crasp things but this is actually clearer than what we > had > before: > > Reviewed-by: Topi Pohjolainen <topi.pohjolai...@intel.com> > > > > > Unfortunately, I see that we still have asserts that byte_offset % 4 == 0 > > in our handling of nir_intrinsic_load_uniform and we're passing the > tests. > > They must be pretty bad. > > Yeah, I've been wondering how many things get exercised. I seemed to hit > number of things in liveness analysis and optimization passes with SIMD8 > and 16-bit regs with glsl. Then again, I don't have clear picture how much > the 16-bit storage support requires from the 16-bit compiler > infrastructure. > I just realized why this is Ok. Vulkan has a single variable for all push constants and it's at offset 0. That said, we also have an assert that the offset is a multiple of 4 and that definitely doesn't hold. --Jason > > > > --Jason > > > > > > > See also related question a few lines further. > > > > > > > + * > > > > + * In Vulkan, we have explicit offsets but everything is > > > crammed > > > > + * into a single "variable" so inst->src[i].nr will always > be > > > 0. > > > > + * Everything will be properly aligned relative to that one > > > base. > > > > + */ > > > > + assert(inst->src[i].offset % type_sz(inst->src[i].type) == > 0); > > > > + > > > > + unsigned u = inst->src[i].nr + > > > > + inst->src[i].offset / UNIFORM_SLOT_SIZE; > > > > > > This effectively gives the same uniform slot no matter which individual > > > byte(s) within are addressed. The logic here is about slots so this > seems > > > fine. Just wanted to check that this is your intention. And if so, > should > > > we > > > comment here a little? > > > > > > > > > > > + if (u >= uniforms) > > > > + continue; > > > > + > > > > + unsigned slots_read; > > > > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > > > > - assert(inst->src[2].ud % 4 == 0); > > > > - unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > > > > - assert(last < uniforms); > > > > - > > > > - for (unsigned j = constant_nr; j < last; j++) { > > > > - is_live[j] = true; > > > > - contiguous[j] = true; > > > > - bitsize_access[j] = MAX2(bitsize_access[j], > > > type_sz(inst->src[i].type)); > > > > - } > > > > - is_live[last] = true; > > > > - bitsize_access[last] = MAX2(bitsize_access[last], > > > type_sz(inst->src[i].type)); > > > > + slots_read = DIV_ROUND_UP(inst->src[2].ud, > > > UNIFORM_SLOT_SIZE); > > > > } else { > > > > - if (constant_nr >= 0 && constant_nr < (int) uniforms) { > > > > - int regs_read = inst->components_read(i) * > > > > - type_sz(inst->src[i].type) / 4; > > > > - assert(regs_read <= 2); > > > > - if (regs_read == 2) > > > > - contiguous[constant_nr] = true; > > > > - for (int j = 0; j < regs_read; j++) { > > > > - is_live[constant_nr + j] = true; > > > > - bitsize_access[constant_nr + j] = > > > > - MAX2(bitsize_access[constant_nr + j], > > > type_sz(inst->src[i].type)); > > > > - } > > > > - } > > > > + unsigned bytes_read = inst->components_read(i) * > > > > + type_sz(inst->src[i].type); > > > > + slots_read = DIV_ROUND_UP(bytes_read, > UNIFORM_SLOT_SIZE); > > > > } > > > > + > > > > + assert(u + slots_read <= uniforms); > > > > + mark_uniform_slots_read(&slots[u], slots_read, > > > > + type_sz(inst->src[i].type)); > > > > } > > > > } > > > > > > > > @@ -2082,43 +2102,64 @@ fs_visitor::assign_constant_locations() > > > > memset(pull_constant_loc, -1, uniforms * > sizeof(*pull_constant_loc)); > > > > > > > > int chunk_start = -1; > > > > - unsigned max_chunk_bitsize = 0; > > > > - > > > > - /* First push 64-bit uniforms to ensure they are properly > aligned */ > > > > - const unsigned uniform_64_bit_size = > type_sz(BRW_REGISTER_TYPE_DF); > > > > + struct cplx_align align; > > > > for (unsigned u = 0; u < uniforms; u++) { > > > > - if (!is_live[u]) > > > > + if (!slots[u].is_live) { > > > > + assert(chunk_start == -1); > > > > continue; > > > > + } > > > > > > > > - set_push_pull_constant_loc(u, &chunk_start, > &max_chunk_bitsize, > > > > - contiguous[u], bitsize_access[u], > > > > - uniform_64_bit_size, > > > > - push_constant_loc, > pull_constant_loc, > > > > - &num_push_constants, > > > &num_pull_constants, > > > > - max_push_components, > max_chunk_size, > > > > - compiler->supports_pull_constants, > > > > - stage_prog_data); > > > > + /* Skip subgroup_id_index to put it in the last push > register. */ > > > > + if (subgroup_id_index == (int)u) > > > > + continue; > > > > > > > > - } > > > > + if (chunk_start == -1) { > > > > + chunk_start = u; > > > > + align = slots[u].align; > > > > + } else { > > > > + /* Offset into the chunk */ > > > > + unsigned chunk_offset = (u - chunk_start) * > UNIFORM_SLOT_SIZE; > > > > > > > > - /* Then push the rest of uniforms */ > > > > - const unsigned uniform_32_bit_size = > type_sz(BRW_REGISTER_TYPE_F); > > > > - for (unsigned u = 0; u < uniforms; u++) { > > > > - if (!is_live[u]) > > > > - continue; > > > > + /* Shift the slot alignment down by the chunk offset so it > is > > > > + * comparable with the base chunk alignment. > > > > + */ > > > > + struct cplx_align slot_align = slots[u].align; > > > > + slot_align.offset = > > > > + (slot_align.offset - chunk_offset) & (align.mul - 1); > > > > > > > > - /* Skip subgroup_id_index to put it in the last push > register. */ > > > > - if (subgroup_id_index == (int)u) > > > > + align = cplx_align_combine(align, slot_align); > > > > + } > > > > + > > > > + /* Sanity check the alignment */ > > > > + cplx_align_assert_sane(align); > > > > + > > > > + if (slots[u].contiguous) > > > > continue; > > > > > > > > - set_push_pull_constant_loc(u, &chunk_start, > &max_chunk_bitsize, > > > > - contiguous[u], bitsize_access[u], > > > > - uniform_32_bit_size, > > > > - push_constant_loc, > pull_constant_loc, > > > > - &num_push_constants, > > > &num_pull_constants, > > > > - max_push_components, > max_chunk_size, > > > > - compiler->supports_pull_constants, > > > > - stage_prog_data); > > > > + /* Adjust the alignment to be in terms of slots, not bytes */ > > > > + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); > > > > + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); > > > > + align.mul /= UNIFORM_SLOT_SIZE; > > > > + align.offset /= UNIFORM_SLOT_SIZE; > > > > + > > > > + unsigned push_start_align = cplx_align_apply(align, > > > num_push_constants); > > > > + unsigned chunk_size = u - chunk_start + 1; > > > > + if (!compiler->supports_pull_constants || > > > > + (chunk_size < max_chunk_size && > > > > + push_start_align + chunk_size <= max_push_components)) { > > > > + /* Align up the number of push constants */ > > > > + num_push_constants = push_start_align; > > > > + for (unsigned i = 0; i < chunk_size; i++) > > > > + push_constant_loc[chunk_start + i] = > num_push_constants++; > > > > + } else { > > > > + /* We need to pull this one */ > > > > + num_pull_constants = cplx_align_apply(align, > > > num_pull_constants); > > > > + for (unsigned i = 0; i < chunk_size; i++) > > > > + pull_constant_loc[chunk_start + i] = > num_pull_constants++; > > > > + } > > > > + > > > > + /* Reset the chunk and start again */ > > > > + chunk_start = -1; > > > > } > > > > > > > > /* Add the CS local thread ID uniform at the end of the push > > > constants */ > > > > @@ -2131,8 +2172,8 @@ fs_visitor::assign_constant_locations() > > > > uint32_t *param = stage_prog_data->param; > > > > stage_prog_data->nr_params = num_push_constants; > > > > if (num_push_constants) { > > > > - stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, > > > > - num_push_constants); > > > > + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, > > > > + num_push_constants); > > > > } else { > > > > stage_prog_data->param = NULL; > > > > } > > > > @@ -2140,8 +2181,8 @@ fs_visitor::assign_constant_locations() > > > > assert(stage_prog_data->pull_param == NULL); > > > > if (num_pull_constants > 0) { > > > > stage_prog_data->nr_pull_params = num_pull_constants; > > > > - stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t, > > > > - > num_pull_constants); > > > > + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, > > > > + > num_pull_constants); > > > > } > > > > > > > > /* Now that we know how many regular uniforms we'll push, reduce > the > > > > -- > > > > 2.5.0.400.gff86faf > > > > > > > > _______________________________________________ > > > > mesa-dev mailing list > > > > mesa-dev@lists.freedesktop.org > > > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev