On Thu, Jan 5, 2017 at 2:18 AM, Samuel Iglesias Gonsálvez < sigles...@igalia.com> wrote:
> From: "Juan A. Suarez Romero" <jasua...@igalia.com> > > So far, input_reads was a bitmap tracking which vertex input locations > were being used. > > In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4) > consumes just one location, any other small attribute. So we mark the > proper bit in inputs_read, and also the same bit in double_inputs_read > if the attribute is a dvec3/dvec4. > > But in Vulkan, this is slightly different: a dvec3/dvec4 attribute > consumes two locations, not just one. And hence two bits would be marked > in inputs_read for the same vertex input attribute. > > To avoid handling two different situations in NIR, we just choose the > latest one: in OpenGL, when creating NIR from GLSL/IR, any dvec3/dvec4 > vertex input attribute is marked with two bits in the inputs_read bitmap > (and also in the double_inputs_read), and following attributes are > adjusted accordingly. > > As example, if in our GLSL/IR shader we have three attributes: > > layout(location = 0) vec3 attr0; > layout(location = 1) dvec4 attr1; > layout(location = 2) dvec3 attr2; > > then in our NIR shader we put attr0 in location 0, attr1 in locations 1 > and 2, and attr2 in location 3 and 4. > > Checking carefully, basically we are using slots rather than locations > in NIR. > > When emitting the vertices, we do a inverse map to know the > corresponding location for each slot. > > v2 (Jason): > - use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR. > > v3 (Jason): > - Fix commit log error. > - Use ladder ifs and fix braces. > - elements_double is divisible by 2, don't need DIV_ROUND_UP(). > - Use if ladder instead of a switch. > > Reviewed-by: Jason Ekstrand <ja...@jlekstrand.net> > --- > src/compiler/glsl/glsl_to_nir.cpp | 28 ++++++++++++++++ > src/compiler/nir/nir_gather_info.c | 48 > +++++++++++++--------------- > src/intel/vulkan/genX_pipeline.c | 28 ++++++++++++---- > src/mesa/drivers/dri/i965/brw_draw_upload.c | 11 +++++-- > src/mesa/drivers/dri/i965/brw_fs.cpp | 13 -------- > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +- > src/mesa/drivers/dri/i965/brw_nir.c | 6 ++-- > src/mesa/drivers/dri/i965/brw_nir.h | 1 - > src/mesa/drivers/dri/i965/brw_vec4.cpp | 11 +++---- > 9 files changed, 89 insertions(+), 60 deletions(-) > > diff --git a/src/compiler/glsl/glsl_to_nir.cpp > b/src/compiler/glsl/glsl_to_nir.cpp > index 4e0a33a9e74..1e0a7f0255c 100644 > --- a/src/compiler/glsl/glsl_to_nir.cpp > +++ b/src/compiler/glsl/glsl_to_nir.cpp > @@ -129,6 +129,19 @@ private: > > } /* end of anonymous namespace */ > > +static void > +nir_remap_attributes(nir_shader *shader) > +{ > + nir_foreach_variable(var, &shader->inputs) { > + var->data.location += > _mesa_bitcount_64(shader->info->double_inputs_read > & > + BITFIELD64_MASK(var->data. > location)); > + } > + > + /* Once the remap is done, reset double_inputs_read, so later it will > have > + * which location/slots are doubles */ > + shader->info->double_inputs_read = 0; > +} > + > nir_shader * > glsl_to_nir(const struct gl_shader_program *shader_prog, > gl_shader_stage stage, > @@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program > *shader_prog, > > nir_lower_constant_initializers(shader, (nir_variable_mode)~0); > > + /* Remap the locations to slots so those requiring two slots will > occupy > + * two locations. For instance, if we have in the IR code a dvec3 > attr0 in > + * location 0 and vec4 attr1 in location 1, in NIR attr0 will use > + * locations/slots 0 and 1, and attr1 will use location/slot 2 */ > + if (shader->stage == MESA_SHADER_VERTEX) > + nir_remap_attributes(shader); > + > shader->info->name = ralloc_asprintf(shader, "GLSL%d", > shader_prog->Name); > if (shader_prog->Label) > shader->info->label = ralloc_strdup(shader, shader_prog->Label); > @@ -318,6 +338,14 @@ nir_visitor::visit(ir_variable *ir) > } else { > var->data.mode = nir_var_shader_in; > } > + > + /* Mark all the locations that require two slots */ > + if (glsl_type_is_dual_slot(glsl_without_array(var->type))) { > + for (uint i = 0; i < glsl_count_attribute_slots(var->type, > true); i++) { > + uint64_t bitfield = BITFIELD64_BIT(var->data.location + i); > + shader->info->double_inputs_read |= bitfield; > + } > + } > break; > > case ir_var_shader_out: > diff --git a/src/compiler/nir/nir_gather_info.c > b/src/compiler/nir/nir_gather_info.c > index 07c99497146..35a1ce4dec6 100644 > --- a/src/compiler/nir/nir_gather_info.c > +++ b/src/compiler/nir/nir_gather_info.c > @@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int > offset, int len) > else > shader->info->inputs_read |= bitfield; > > - /* double inputs read is only for vertex inputs */ > - if (shader->stage == MESA_SHADER_VERTEX && > - glsl_type_is_dual_slot(glsl_without_array(var->type))) > - shader->info->double_inputs_read |= bitfield; > - > if (shader->stage == MESA_SHADER_FRAGMENT) { > shader->info->fs.uses_sample_qualifier |= var->data.sample; > } > @@ -83,26 +78,21 @@ static void > mark_whole_variable(nir_shader *shader, nir_variable *var) > { > const struct glsl_type *type = var->type; > - bool is_vertex_input = false; > > if (nir_is_per_vertex_io(var, shader->stage)) { > assert(glsl_type_is_array(type)); > type = glsl_get_array_element(type); > } > > - if (shader->stage == MESA_SHADER_VERTEX && > - var->data.mode == nir_var_shader_in) > - is_vertex_input = true; > - > const unsigned slots = > var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4) > - : glsl_count_attribute_slots(type, > is_vertex_input); > + : glsl_count_attribute_slots(type, false); > > set_io_mask(shader, var, 0, slots); > } > > static unsigned > -get_io_offset(nir_deref_var *deref, bool is_vertex_input) > +get_io_offset(nir_deref_var *deref) > { > unsigned offset = 0; > > @@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool > is_vertex_input) > return -1; > } > > - offset += glsl_count_attribute_slots(tail->type, > is_vertex_input) * > + offset += glsl_count_attribute_slots(tail->type, false) * > deref_array->base_offset; > } > /* TODO: we can get the offset for structs here see nir_lower_io() > */ > @@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var > *deref) > return false; > } > > - bool is_vertex_input = false; > - if (shader->stage == MESA_SHADER_VERTEX && > - var->data.mode == nir_var_shader_in) > - is_vertex_input = true; > - > - unsigned offset = get_io_offset(deref, is_vertex_input); > + unsigned offset = get_io_offset(deref); > if (offset == -1) > return false; > > @@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var > *deref) > } > > /* double element width for double types that takes two slots */ > - if (!is_vertex_input && > - glsl_type_is_dual_slot(glsl_without_array(type))) { > + if (glsl_type_is_dual_slot(glsl_without_array(type))) { > elem_width *= 2; > } > > @@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, > nir_shader *shader) > case nir_intrinsic_interp_var_at_sample: > case nir_intrinsic_interp_var_at_offset: > case nir_intrinsic_load_var: > - case nir_intrinsic_store_var: > - if (instr->variables[0]->var->data.mode == nir_var_shader_in || > - instr->variables[0]->var->data.mode == nir_var_shader_out) { > + case nir_intrinsic_store_var: { > + nir_variable *var = instr->variables[0]->var; > + > + if (var->data.mode == nir_var_shader_in || > + var->data.mode == nir_var_shader_out) { > if (!try_mask_partial_io(shader, instr->variables[0])) > - mark_whole_variable(shader, instr->variables[0]->var); > + mark_whole_variable(shader, var); > + > + /* We need to track which input_reads bits correspond to a > + * dvec3/dvec4 input attribute */ > + if (shader->stage == MESA_SHADER_VERTEX && > + var->data.mode == nir_var_shader_in && > + glsl_type_is_dual_slot(glsl_without_array(var->type))) { > + for (uint i = 0; i < glsl_count_attribute_slots(var->type, > false); i++) { > + int idx = var->data.location + i; > + shader->info->double_inputs_read |= BITFIELD64_BIT(idx); > + } > + } > } > break; > + } > > case nir_intrinsic_load_draw_id: > case nir_intrinsic_load_front_face: > diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_ > pipeline.c > index 845d0204d8c..48a8fad5965 100644 > --- a/src/intel/vulkan/genX_pipeline.c > +++ b/src/intel/vulkan/genX_pipeline.c > @@ -44,7 +44,15 @@ vertex_element_comp_control(enum isl_format format, > unsigned comp) > > if (bits) { > return VFCOMP_STORE_SRC; > - } else if (comp < 3) { > + } else if (comp >= 2 && > + !isl_format_layouts[format].channels.b.bits && > + isl_format_layouts[format].channels.r.type == ISL_RAW) { > + /* When emitting 64-bit attributes, we need to write either 128 or > 256 > + * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and > + * VFCOMP_STORE_0 to pad the written chunk */ > + return VFCOMP_NOSTORE; > + } else if (comp < 3 || > + (comp == 3 && isl_format_layouts[format].channels.r.type > == ISL_RAW)) { > You don't need "comp == 3" here. Also, please add a PRM citation for the restriction on PASSTHROUGH formats. With that, R-B still applies. > return VFCOMP_STORE_0; > } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || > isl_format_layouts[format].channels.r.type == ISL_SINT) { > @@ -64,8 +72,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, > > /* Pull inputs_read out of the VS prog data */ > const uint64_t inputs_read = vs_prog_data->inputs_read; > + const uint64_t double_inputs_read = vs_prog_data->double_inputs_read; > assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); > const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; > + const uint32_t elements_double = double_inputs_read >> > VERT_ATTRIB_GENERIC0; > > #if GEN_GEN >= 8 > /* On BDW+, we only need to allocate space for base ids. Setting up > @@ -83,13 +93,16 @@ emit_vertex_input(struct anv_pipeline *pipeline, > vs_prog_data->uses_baseinstance; > #endif > > - uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem; > - if (elem_count == 0) > + uint32_t elem_count = __builtin_popcount(elements) - > + __builtin_popcount(elements_double) / 2; > + > + uint32_t total_elems = elem_count + needs_svgs_elem; > + if (total_elems == 0) > return; > > uint32_t *p; > > - const uint32_t num_dwords = 1 + elem_count * 2; > + const uint32_t num_dwords = 1 + total_elems * 2; > p = anv_batch_emitn(&pipeline->batch, num_dwords, > GENX(3DSTATE_VERTEX_ELEMENTS)); > memset(p + 1, 0, (num_dwords - 1) * 4); > @@ -107,7 +120,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, > if ((elements & (1 << desc->location)) == 0) > continue; /* Binding unused */ > > - uint32_t slot = __builtin_popcount(elements & ((1 << > desc->location) - 1)); > + uint32_t slot = > + __builtin_popcount(elements & ((1 << desc->location) - 1)) - > + DIV_ROUND_UP(__builtin_popcount(elements_double & > + ((1 << desc->location) -1)), 2); > > struct GENX(VERTEX_ELEMENT_STATE) element = { > .VertexBufferIndex = desc->binding, > @@ -137,7 +153,7 @@ emit_vertex_input(struct anv_pipeline *pipeline, > #endif > } > > - const uint32_t id_slot = __builtin_popcount(elements); > + const uint32_t id_slot = elem_count; > if (needs_svgs_elem) { > /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: > * "Within a VERTEX_ELEMENT_STATE structure, if a Component > diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c > b/src/mesa/drivers/dri/i965/brw_draw_upload.c > index 57815645924..b7527f2cd9b 100644 > --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c > +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c > @@ -481,11 +481,16 @@ brw_prepare_vertices(struct brw_context *brw) > /* Accumulate the list of enabled arrays. */ > brw->vb.nr_enabled = 0; > while (vs_inputs) { > - GLuint index = ffsll(vs_inputs) - 1; > + GLuint first = ffsll(vs_inputs) - 1; > + GLuint index = > + first - > DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data->double_inputs_read > & > + BITFIELD64_MASK(first)), > 2); > struct brw_vertex_element *input = &brw->vb.inputs[index]; > input->is_dual_slot = brw->gen >= 8 && > - (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0; > - vs_inputs &= ~BITFIELD64_BIT(index); > + (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0; > + vs_inputs &= ~BITFIELD64_BIT(first); > + if (input->is_dual_slot) > + vs_inputs &= ~BITFIELD64_BIT(first + 1); > brw->vb.enabled[brw->vb.nr_enabled++] = input; > } > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index c8a069386dd..03f9c24d151 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type) > return 0; > } > > -/* Attribute arrays are loaded as one vec4 per element (or matrix column), > - * except for double-precision types, which are loaded as one dvec4. > - */ > -extern "C" int > -type_size_vs_input(const struct glsl_type *type) > -{ > - if (type->is_double()) { > - return type_size_dvec4(type); > - } else { > - return type_size_vec4(type); > - } > -} > - > /** > * Create a MOV to read the timestamp register. > * > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index 14415bd5c7a..5c356d61ff4 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -36,8 +36,7 @@ fs_reg * > fs_visitor::emit_vs_system_value(int location) > { > fs_reg *reg = new(this->mem_ctx) > - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) + > - _mesa_bitcount_64(nir->info-> > double_inputs_read)), > + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read), > BRW_REGISTER_TYPE_D); > struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c > b/src/mesa/drivers/dri/i965/brw_nir.c > index 6f37e97a86f..e65a1786e2b 100644 > --- a/src/mesa/drivers/dri/i965/brw_nir.c > +++ b/src/mesa/drivers/dri/i965/brw_nir.c > @@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info) > int attr = intrin->const_index[0]; > int slot = _mesa_bitcount_64(nir_info->inputs_read & > BITFIELD64_MASK(attr)); > - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read & > - BITFIELD64_MASK(attr)); > - intrin->const_index[0] = 4 * (slot + dslot); > + intrin->const_index[0] = 4 * slot; > } > } > return true; > @@ -204,7 +202,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir, > * loaded as one vec4 or dvec4 per element (or matrix column), > depending on > * whether it is a double-precision type or not. > */ > - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0); > + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); > > /* This pass needs actual constants */ > nir_opt_constant_folding(nir); > diff --git a/src/mesa/drivers/dri/i965/brw_nir.h > b/src/mesa/drivers/dri/i965/brw_nir.h > index 8cfb6c1be68..012343b6b82 100644 > --- a/src/mesa/drivers/dri/i965/brw_nir.h > +++ b/src/mesa/drivers/dri/i965/brw_nir.h > @@ -34,7 +34,6 @@ extern "C" { > int type_size_scalar(const struct glsl_type *type); > int type_size_vec4(const struct glsl_type *type); > int type_size_dvec4(const struct glsl_type *type); > -int type_size_vs_input(const struct glsl_type *type); > > static inline int > type_size_scalar_bytes(const struct glsl_type *type) > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > index f096ce9e020..f84729dc636 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > @@ -2737,7 +2737,7 @@ brw_compile_vs(const struct brw_compiler *compiler, > void *log_data, > ((1 << shader->info->cull_distance_array_size) - 1) << > shader->info->clip_distance_array_size; > > - unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read); > + unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data-> > inputs_read); > > /* gl_VertexID and gl_InstanceID are system values, but arrive via an > * incoming vertex attribute. So, add an extra slot. > @@ -2747,18 +2747,17 @@ brw_compile_vs(const struct brw_compiler > *compiler, void *log_data, > BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | > BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | > BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { > - nr_attributes++; > + nr_attribute_slots++; > } > > /* gl_DrawID has its very own vec4 */ > if (shader->info->system_values_read & > BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { > - nr_attributes++; > + nr_attribute_slots++; > } > > - unsigned nr_attribute_slots = > - nr_attributes + > - _mesa_bitcount_64(shader->info->double_inputs_read); > + unsigned nr_attributes = nr_attribute_slots - > + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), > 2); > > /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB > Entry > * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, > in > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev