On Tue, 2017-01-03 at 14:41 -0800, Jason Ekstrand wrote: > I made a few pretty trivial comments. With those addressed, > > Reviewed-by: Jason Ekstrand <ja...@jlekstrand.net> > > On Dec 16, 2016 8:55 AM, "Juan A. Suarez Romero" <jasua...@igalia.com > > wrote: > So far, input_reads was a bitmap tracking which vertex input > locations > > were being used. > > > > In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4) > > consumes just one location, any other small attribute. So we mark the > > proper bit in inputs_read, and also the same bit in > double_inputs_read > > if the attribute is a dvec3/dvec4. > > > > But in Vulkan, this is slightly different: a dvec3/dvec4 attribute > > consumes two locations, not just one. And hence two bits would be > marked > > in inputs_read for the same vertex input attribute. > > > > To avoid handling two different situations in NIR, we just choose the > > latest one: in OpenGL, when creating NIR from GLSL/IR, any > dvec3/dvec4 > > vertex input attribute is marked with two bits in the inputs_read > bitmap > > (and also in the double_inputs_read), and following attributes are > > adjusted accordingly. > > > > As example, if in our GLSL/IR shader we have three attributes: > > > > layout(location = 0) vec3 attr0; > > layout(location = 1) dvec4 attr1; > > layout(location = 2) dvec3 attr2; > > > > then in our NIR shader we put attr0 in location 0, attr1 in locations > 1 > > and 2, and attr2 in location 3. > > > attr2 goes in locations 3 *and* 4, correct?
Yes, attr2 goes in locations 3 and 4. > Checking carefully, basically we are using slots rather than > locations > > in NIR. > > > > When emitting the vertices, we do a inverse map to know the > > corresponding location for each slot. > > > > v2 (Jason): > > - use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR. > > --- > > src/compiler/glsl/glsl_to_nir.cpp | 28 +++++++++++++ > > src/compiler/nir/nir_gather_info.c | 48 ++++++++++------ > ----- > > src/intel/vulkan/genX_pipeline.c | 63 +++++++++++++++++- > ---------- > > src/mesa/drivers/dri/i965/brw_draw_upload.c | 11 +++-- > > src/mesa/drivers/dri/i965/brw_fs.cpp | 13 ------ > > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +- > > src/mesa/drivers/dri/i965/brw_nir.c | 6 +-- > > src/mesa/drivers/dri/i965/brw_nir.h | 1 - > > src/mesa/drivers/dri/i965/brw_vec4.cpp | 11 +++-- > > 9 files changed, 106 insertions(+), 78 deletions(-) > > > > diff --git a/src/compiler/glsl/glsl_to_nir.cpp > b/src/compiler/glsl/glsl_to_nir.cpp > > index 4debc37..0814dad 100644 > > --- a/src/compiler/glsl/glsl_to_nir.cpp > > +++ b/src/compiler/glsl/glsl_to_nir.cpp > > @@ -129,6 +129,19 @@ private: > > > > } /* end of anonymous namespace */ > > > > +static void > > +nir_remap_attributes(nir_shader *shader) > > +{ > > + nir_foreach_variable(var, &shader->inputs) { > > + var->data.location += _mesa_bitcount_64(shader->info- > >double_inputs_read & > > + BITFIELD64_MASK(var- > >data.location)); > > + } > > + > > + /* Once the remap is done, reset double_inputs_read, so later it > will have > > + * which location/slots are doubles */ > > + shader->info->double_inputs_read = 0; > > +} > > + > > nir_shader * > > glsl_to_nir(const struct gl_shader_program *shader_prog, > > gl_shader_stage stage, > > @@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program > *shader_prog, > > > > nir_lower_constant_initializers(shader, (nir_variable_mode)~0); > > > > + /* Remap the locations to slots so those requiring two slots will > occupy > > + * two locations. For instance, if we have in the IR code a dvec3 > attr0 in > > + * location 0 and vec4 attr1 in location 1, in NIR attr0 will use > > + * locations/slots 0 and 1, and attr1 will use location/slot 2 */ > > + if (shader->stage == MESA_SHADER_VERTEX) > > + nir_remap_attributes(shader); > > + > > shader->info->name = ralloc_asprintf(shader, "GLSL%d", > shader_prog->Name); > > if (shader_prog->Label) > > shader->info->label = ralloc_strdup(shader, shader_prog- > >Label); > > @@ -315,6 +335,14 @@ nir_visitor::visit(ir_variable *ir) > > } else { > > var->data.mode = nir_var_shader_in; > > } > > + > > + /* Mark all the locations that require two slots */ > > + if (glsl_type_is_dual_slot(glsl_without_array(var->type))) { > > + for (uint i = 0; i < glsl_count_attribute_slots(var->type, > true); i++) { > > + uint64_t bitfield = BITFIELD64_BIT(var->data.location + > i); > > + shader->info->double_inputs_read |= bitfield; > > + } > > + } > > break; > > > > case ir_var_shader_out: > > diff --git a/src/compiler/nir/nir_gather_info.c > b/src/compiler/nir/nir_gather_info.c > > index 07c9949..35a1ce4 100644 > > --- a/src/compiler/nir/nir_gather_info.c > > +++ b/src/compiler/nir/nir_gather_info.c > > @@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, > int offset, int len) > > else > > shader->info->inputs_read |= bitfield; > > > > - /* double inputs read is only for vertex inputs */ > > - if (shader->stage == MESA_SHADER_VERTEX && > > - glsl_type_is_dual_slot(glsl_without_array(var->type))) > > - shader->info->double_inputs_read |= bitfield; > > - > > if (shader->stage == MESA_SHADER_FRAGMENT) { > > shader->info->fs.uses_sample_qualifier |= var- > >data.sample; > > } > > @@ -83,26 +78,21 @@ static void > > mark_whole_variable(nir_shader *shader, nir_variable *var) > > { > > const struct glsl_type *type = var->type; > > - bool is_vertex_input = false; > > > > if (nir_is_per_vertex_io(var, shader->stage)) { > > assert(glsl_type_is_array(type)); > > type = glsl_get_array_element(type); > > } > > > > - if (shader->stage == MESA_SHADER_VERTEX && > > - var->data.mode == nir_var_shader_in) > > - is_vertex_input = true; > > - > > const unsigned slots = > > var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4) > > - : glsl_count_attribute_slots(type, > is_vertex_input); > > + : glsl_count_attribute_slots(type, false); > > > > set_io_mask(shader, var, 0, slots); > > } > > > > static unsigned > > -get_io_offset(nir_deref_var *deref, bool is_vertex_input) > > +get_io_offset(nir_deref_var *deref) > > { > > unsigned offset = 0; > > > > @@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool > is_vertex_input) > > return -1; > > } > > > > - offset += glsl_count_attribute_slots(tail->type, > is_vertex_input) * > > + offset += glsl_count_attribute_slots(tail->type, false) * > > deref_array->base_offset; > > } > > /* TODO: we can get the offset for structs here see > nir_lower_io() */ > > @@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, > nir_deref_var *deref) > > return false; > > } > > > > - bool is_vertex_input = false; > > - if (shader->stage == MESA_SHADER_VERTEX && > > - var->data.mode == nir_var_shader_in) > > - is_vertex_input = true; > > - > > - unsigned offset = get_io_offset(deref, is_vertex_input); > > + unsigned offset = get_io_offset(deref); > > if (offset == -1) > > return false; > > > > @@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, > nir_deref_var *deref) > > } > > > > /* double element width for double types that takes two slots */ > > - if (!is_vertex_input && > > - glsl_type_is_dual_slot(glsl_without_array(type))) { > > + if (glsl_type_is_dual_slot(glsl_without_array(type))) { > > elem_width *= 2; > > } > > > > @@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr > *instr, nir_shader *shader) > > case nir_intrinsic_interp_var_at_sample: > > case nir_intrinsic_interp_var_at_offset: > > case nir_intrinsic_load_var: > > - case nir_intrinsic_store_var: > > - if (instr->variables[0]->var->data.mode == nir_var_shader_in > || > > - instr->variables[0]->var->data.mode == nir_var_shader_out) > { > > + case nir_intrinsic_store_var: { > > + nir_variable *var = instr->variables[0]->var; > > + > > + if (var->data.mode == nir_var_shader_in || > > + var->data.mode == nir_var_shader_out) { > > if (!try_mask_partial_io(shader, instr->variables[0])) > > - mark_whole_variable(shader, instr->variables[0]->var); > > + mark_whole_variable(shader, var); > > + > > + /* We need to track which input_reads bits correspond to a > > + * dvec3/dvec4 input attribute */ > > + if (shader->stage == MESA_SHADER_VERTEX && > > + var->data.mode == nir_var_shader_in && > > + glsl_type_is_dual_slot(glsl_without_array(var->type))) > { > > + for (uint i = 0; i < glsl_count_attribute_slots(var- > >type, false); i++) { > > + int idx = var->data.location + i; > > + shader->info->double_inputs_read |= > BITFIELD64_BIT(idx); > > + } > > + } > > } > > break; > > + } > > > > case nir_intrinsic_load_draw_id: > > case nir_intrinsic_load_front_face: > > diff --git a/src/intel/vulkan/genX_pipeline.c > b/src/intel/vulkan/genX_pipeline.c > > index 845d020..7b94959 100644 > > --- a/src/intel/vulkan/genX_pipeline.c > > +++ b/src/intel/vulkan/genX_pipeline.c > > @@ -33,26 +33,33 @@ > > static uint32_t > > vertex_element_comp_control(enum isl_format format, unsigned comp) > > { > > - uint8_t bits; > > switch (comp) { > > - case 0: bits = isl_format_layouts[format].channels.r.bits; break; > > - case 1: bits = isl_format_layouts[format].channels.g.bits; break; > > - case 2: bits = isl_format_layouts[format].channels.b.bits; break; > > - case 3: bits = isl_format_layouts[format].channels.a.bits; break; > > - default: unreachable("Invalid component"); > > - } > > - > > - if (bits) { > > - return VFCOMP_STORE_SRC; > > - } else if (comp < 3) { > > - return VFCOMP_STORE_0; > > - } else if (isl_format_layouts[format].channels.r.type == ISL_UINT > || > > - isl_format_layouts[format].channels.r.type == ISL_SINT) > { > > - assert(comp == 3); > > - return VFCOMP_STORE_1_INT; > > - } else { > > - assert(comp == 3); > > - return VFCOMP_STORE_1_FP; > > + case 0: > > + return isl_format_layouts[format].channels.r.bits ? > > + VFCOMP_STORE_SRC : VFCOMP_STORE_0; > > + case 1: > > + return isl_format_layouts[format].channels.g.bits ? > > + VFCOMP_STORE_SRC : VFCOMP_STORE_0; > > + case 2: > > + return isl_format_layouts[format].channels.b.bits ? > > + VFCOMP_STORE_SRC : > ((isl_format_layouts[format].channels.r.type == ISL_RAW) ? > > + VFCOMP_NOSTORE : VFCOMP_STORE_0); > > > Given all the line wrapping, I think it would be clearer to just have > an if ladder here > > > + case 3: > > + if (isl_format_layouts[format].channels.a.bits) > > + return VFCOMP_STORE_SRC; > > + else > > > Please use braces when one side of the if/else is multiple lines. > > > + switch (isl_format_layouts[format].channels.r.type) { > > + case ISL_RAW: > > + return isl_format_layouts[format].channels.b.bits ? > > + VFCOMP_STORE_0 : VFCOMP_NOSTORE; > > > This seems a bit odd. Mind explaining what's going on here? > > > + case ISL_UINT: > > + case ISL_SINT: > > + return VFCOMP_STORE_1_INT; > > + default: > > + return VFCOMP_STORE_1_FP; > > + } > > + default: > > + unreachable("Invalid component"); > > } > > } > > > > @@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, > > > > /* Pull inputs_read out of the VS prog data */ > > const uint64_t inputs_read = vs_prog_data->inputs_read; > > + const uint64_t double_inputs_read = vs_prog_data- > >double_inputs_read; > > assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); > > const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; > > + const uint32_t elements_double = double_inputs_read >> > VERT_ATTRIB_GENERIC0; > > > > #if GEN_GEN >= 8 > > /* On BDW+, we only need to allocate space for base ids. Setting > up > > @@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline, > > vs_prog_data->uses_baseinstance; > > #endif > > > > - uint32_t elem_count = __builtin_popcount(elements) + > needs_svgs_elem; > > - if (elem_count == 0) > > + uint32_t elem_count = __builtin_popcount(elements) - > > + DIV_ROUND_UP(__builtin_popcount(elements_double), 2); > > > This had better be divisible by 2... > > > + > > + uint32_t total_elems = elem_count + needs_svgs_elem; > > + if (total_elems == 0) > > return; > > > > uint32_t *p; > > > > - const uint32_t num_dwords = 1 + elem_count * 2; > > + const uint32_t num_dwords = 1 + total_elems * 2; > > p = anv_batch_emitn(&pipeline->batch, num_dwords, > > GENX(3DSTATE_VERTEX_ELEMENTS)); > > memset(p + 1, 0, (num_dwords - 1) * 4); > > @@ -107,7 +119,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, > > if ((elements & (1 << desc->location)) == 0) > > continue; /* Binding unused */ > > > > - uint32_t slot = __builtin_popcount(elements & ((1 << desc- > >location) - 1)); > > + uint32_t slot = > > + __builtin_popcount(elements & ((1 << desc->location) - 1)) > - > > + DIV_ROUND_UP(__builtin_popcount(elements_double & > > + ((1 << desc->location) -1)), > 2); > > > > struct GENX(VERTEX_ELEMENT_STATE) element = { > > .VertexBufferIndex = desc->binding, > > @@ -137,7 +152,7 @@ emit_vertex_input(struct anv_pipeline *pipeline, > > #endif > > } > > > > - const uint32_t id_slot = __builtin_popcount(elements); > > + const uint32_t id_slot = elem_count; > > if (needs_svgs_elem) { > > /* From the Broadwell PRM for the 3D_Vertex_Component_Control > enum: > > * "Within a VERTEX_ELEMENT_STATE structure, if a Component > > diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c > b/src/mesa/drivers/dri/i965/brw_draw_upload.c > > index b138cb7..c3655d8 100644 > > --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c > > +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c > > @@ -472,11 +472,16 @@ brw_prepare_vertices(struct brw_context *brw) > > /* Accumulate the list of enabled arrays. */ > > brw->vb.nr_enabled = 0; > > while (vs_inputs) { > > - GLuint index = ffsll(vs_inputs) - 1; > > + GLuint first = ffsll(vs_inputs) - 1; > > + GLuint index = > > + first - DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data- > >double_inputs_read & > > + > BITFIELD64_MASK(first)), 2); > > struct brw_vertex_element *input = &brw->vb.inputs[index]; > > input->is_dual_slot = brw->gen >= 8 && > > - (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) > != 0; > > - vs_inputs &= ~BITFIELD64_BIT(index); > > + (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) > != 0; > > + vs_inputs &= ~BITFIELD64_BIT(first); > > + if (input->is_dual_slot) > > + vs_inputs &= ~BITFIELD64_BIT(first + 1); > > brw->vb.enabled[brw->vb.nr_enabled++] = input; > > } > > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > > index c218f56..b383d98 100644 > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > > @@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type) > > return 0; > > } > > > > -/* Attribute arrays are loaded as one vec4 per element (or matrix > column), > > - * except for double-precision types, which are loaded as one dvec4. > > - */ > > -extern "C" int > > -type_size_vs_input(const struct glsl_type *type) > > -{ > > - if (type->is_double()) { > > - return type_size_dvec4(type); > > - } else { > > - return type_size_vec4(type); > > - } > > -} > > - > > /** > > * Create a MOV to read the timestamp register. > > * > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > > index 14415bd..5c356d6 100644 > > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > > @@ -36,8 +36,7 @@ fs_reg * > > fs_visitor::emit_vs_system_value(int location) > > { > > fs_reg *reg = new(this->mem_ctx) > > - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) + > > - _mesa_bitcount_64(nir->info- > >double_inputs_read)), > > + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read), > > BRW_REGISTER_TYPE_D); > > struct brw_vs_prog_data *vs_prog_data = > brw_vs_prog_data(prog_data); > > > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c > b/src/mesa/drivers/dri/i965/brw_nir.c > > index 763e3ec..24334c8 100644 > > --- a/src/mesa/drivers/dri/i965/brw_nir.c > > +++ b/src/mesa/drivers/dri/i965/brw_nir.c > > @@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info > *nir_info) > > int attr = intrin->const_index[0]; > > int slot = _mesa_bitcount_64(nir_info->inputs_read & > > BITFIELD64_MASK(attr)); > > - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read > & > > - BITFIELD64_MASK(attr)); > > - intrin->const_index[0] = 4 * (slot + dslot); > > + intrin->const_index[0] = 4 * slot; > > } > > } > > return true; > > @@ -204,7 +202,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir, > > * loaded as one vec4 or dvec4 per element (or matrix column), > depending on > > * whether it is a double-precision type or not. > > */ > > - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0); > > + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); > > > > /* This pass needs actual constants */ > > nir_opt_constant_folding(nir); > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.h > b/src/mesa/drivers/dri/i965/brw_nir.h > > index 3c774d0..04c7ef8 100644 > > --- a/src/mesa/drivers/dri/i965/brw_nir.h > > +++ b/src/mesa/drivers/dri/i965/brw_nir.h > > @@ -34,7 +34,6 @@ extern "C" { > > int type_size_scalar(const struct glsl_type *type); > > int type_size_vec4(const struct glsl_type *type); > > int type_size_dvec4(const struct glsl_type *type); > > -int type_size_vs_input(const struct glsl_type *type); > > > > static inline int > > type_size_scalar_bytes(const struct glsl_type *type) > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > > index b9e592f..852e88f 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > > @@ -2127,7 +2127,7 @@ brw_compile_vs(const struct brw_compiler > *compiler, void *log_data, > > ((1 << shader->info->cull_distance_array_size) - 1) << > > shader->info->clip_distance_array_size; > > > > - unsigned nr_attributes = _mesa_bitcount_64(prog_data- > >inputs_read); > > + unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data- > >inputs_read); > > > > /* gl_VertexID and gl_InstanceID are system values, but arrive > via an > > * incoming vertex attribute. So, add an extra slot. > > @@ -2137,18 +2137,17 @@ brw_compile_vs(const struct brw_compiler > *compiler, void *log_data, > > BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | > > BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | > > BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { > > - nr_attributes++; > > + nr_attribute_slots++; > > } > > > > /* gl_DrawID has its very own vec4 */ > > if (shader->info->system_values_read & > > BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { > > - nr_attributes++; > > + nr_attribute_slots++; > > } > > > > - unsigned nr_attribute_slots = > > - nr_attributes + > > - _mesa_bitcount_64(shader->info->double_inputs_read); > > + unsigned nr_attributes = nr_attribute_slots - > > + DIV_ROUND_UP(_mesa_bitcount_64(shader->info- > >double_inputs_read), 2); > > > > /* The 3DSTATE_VS documentation lists the lower bound on "Vertex > URB Entry > > * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. > Empirically, in > > -- > > 2.9.3 > > > > _______________________________________________ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev