+Ken On Thu, Dec 1, 2016 at 10:17 PM, Jason Ekstrand <ja...@jlekstrand.net> wrote:
> I'm not sure how I feel about this one. It seems like it would almost be > easier to just pick one convention or the other for NIR and adjust one of > the drivers accordingly. I don't know that I have a huge preference which > convention we choose. I guess the Vulkan convention matches our hardware a > bit better. In either case, converting from one to the other should be a > simple matter of building a remap table or a creative use of popcount. > > On Fri, Nov 25, 2016 at 12:52 AM, Juan A. Suarez Romero < > jasua...@igalia.com> wrote: > >> One difference between OpenGL and Vulkan regarding 64-bit vertex >> attribute types is that dvec3 and dvec4 consumes just one location in >> OpenGL, while in Vulkan it consumes two locations. >> >> Thus, in OpenGL for each dvec3/dvec4 vertex attrib we mark just one bit >> in our internal inputs_read bitmap (and also the corresponding bit in >> double_inputs_read bitmap) while in Vulkan we mark two consecutive bits >> in both bitmaps. >> >> This is handled with a nir option called "dvec3_consumes_two_locations", >> which is set to True for Vulkan code. And all the computation regarding >> emitting vertices as well as the mapping between attributes and physical >> registers use this option to correctly do the work. >> --- >> src/amd/vulkan/radv_pipeline.c | 1 + >> src/compiler/nir/nir.h | 5 +++ >> src/compiler/nir/nir_gather_info.c | 6 +-- >> src/gallium/drivers/freedreno/ir3/ir3_nir.c | 1 + >> src/intel/vulkan/anv_device.c | 2 +- >> src/intel/vulkan/genX_pipeline.c | 62 >> +++++++++++++++++----------- >> src/mesa/drivers/dri/i965/brw_compiler.c | 23 ++++++++++- >> src/mesa/drivers/dri/i965/brw_compiler.h | 2 +- >> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 14 +++++-- >> src/mesa/drivers/dri/i965/brw_nir.c | 18 +++++--- >> src/mesa/drivers/dri/i965/brw_vec4.cpp | 13 ++++-- >> src/mesa/drivers/dri/i965/intel_screen.c | 3 +- >> 12 files changed, 105 insertions(+), 45 deletions(-) >> >> diff --git a/src/amd/vulkan/radv_pipeline.c >> b/src/amd/vulkan/radv_pipeline.c >> index ee5d812..90d4650 100644 >> --- a/src/amd/vulkan/radv_pipeline.c >> +++ b/src/amd/vulkan/radv_pipeline.c >> @@ -59,6 +59,7 @@ static const struct nir_shader_compiler_options >> nir_options = { >> .lower_unpack_unorm_4x8 = true, >> .lower_extract_byte = true, >> .lower_extract_word = true, >> + .dvec3_consumes_two_locations = true, >> }; >> >> VkResult radv_CreateShaderModule( >> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h >> index 1679d89..0fc8f39 100644 >> --- a/src/compiler/nir/nir.h >> +++ b/src/compiler/nir/nir.h >> @@ -1794,6 +1794,11 @@ typedef struct nir_shader_compiler_options { >> * information must be inferred from the list of input nir_variables. >> */ >> bool use_interpolated_input_intrinsics; >> + >> + /** >> + * In Vulkan, a dvec3/dvec4 consumes two locations instead just one. >> + */ >> + bool dvec3_consumes_two_locations; >> } nir_shader_compiler_options; >> >> typedef struct nir_shader { >> diff --git a/src/compiler/nir/nir_gather_info.c >> b/src/compiler/nir/nir_gather_info.c >> index 07c9949..8c80671 100644 >> --- a/src/compiler/nir/nir_gather_info.c >> +++ b/src/compiler/nir/nir_gather_info.c >> @@ -96,7 +96,7 @@ mark_whole_variable(nir_shader *shader, nir_variable >> *var) >> >> const unsigned slots = >> var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4) >> - : glsl_count_attribute_slots(type, >> is_vertex_input); >> + : glsl_count_attribute_slots(type, >> is_vertex_input && !shader->options->dvec3_consumes_two_locations); >> > > This makes no sense, why are we passing is_vertex_input && > !dvec3_consumes_two_locations to an argument labled is_vertex_input? > > >> >> set_io_mask(shader, var, 0, slots); >> } >> @@ -168,7 +168,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var >> *deref) >> var->data.mode == nir_var_shader_in) >> is_vertex_input = true; >> >> - unsigned offset = get_io_offset(deref, is_vertex_input); >> + unsigned offset = get_io_offset(deref, is_vertex_input && >> !shader->options->dvec3_consumes_two_locations); >> > > Same here > > >> if (offset == -1) >> return false; >> >> @@ -184,7 +184,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var >> *deref) >> } >> >> /* double element width for double types that takes two slots */ >> - if (!is_vertex_input && >> + if ((!is_vertex_input || shader->options->dvec3_consumes_two_locations) >> && >> glsl_type_is_dual_slot(glsl_without_array(type))) { >> > > This makes a bit more sense but I'm still not quite seeing it. > > >> elem_width *= 2; >> } >> diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c >> b/src/gallium/drivers/freedreno/ir3/ir3_nir.c >> index 2d86a52..5c5c9ad 100644 >> --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c >> +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c >> @@ -50,6 +50,7 @@ static const nir_shader_compiler_options options = { >> .vertex_id_zero_based = true, >> .lower_extract_byte = true, >> .lower_extract_word = true, >> + .dvec3_consumes_two_locations = false, >> }; >> >> struct nir_shader * >> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device. >> c >> index 2c8ac49..725848f 100644 >> --- a/src/intel/vulkan/anv_device.c >> +++ b/src/intel/vulkan/anv_device.c >> @@ -167,7 +167,7 @@ anv_physical_device_init(struct anv_physical_device >> *device, >> >> brw_process_intel_debug_variable(); >> >> - device->compiler = brw_compiler_create(NULL, &device->info); >> + device->compiler = brw_compiler_create(NULL, &device->info, true); >> if (device->compiler == NULL) { >> result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); >> goto fail; >> diff --git a/src/intel/vulkan/genX_pipeline.c >> b/src/intel/vulkan/genX_pipeline.c >> index cb164ad..97c40b8 100644 >> --- a/src/intel/vulkan/genX_pipeline.c >> +++ b/src/intel/vulkan/genX_pipeline.c >> @@ -33,26 +33,33 @@ >> static uint32_t >> vertex_element_comp_control(enum isl_format format, unsigned comp) >> { >> - uint8_t bits; >> switch (comp) { >> - case 0: bits = isl_format_layouts[format].channels.r.bits; break; >> - case 1: bits = isl_format_layouts[format].channels.g.bits; break; >> - case 2: bits = isl_format_layouts[format].channels.b.bits; break; >> - case 3: bits = isl_format_layouts[format].channels.a.bits; break; >> - default: unreachable("Invalid component"); >> - } >> - >> - if (bits) { >> - return VFCOMP_STORE_SRC; >> - } else if (comp < 3) { >> - return VFCOMP_STORE_0; >> - } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || >> - isl_format_layouts[format].channels.r.type == ISL_SINT) { >> - assert(comp == 3); >> - return VFCOMP_STORE_1_INT; >> - } else { >> - assert(comp == 3); >> - return VFCOMP_STORE_1_FP; >> + case 0: >> + return isl_format_layouts[format].channels.r.bits ? >> + VFCOMP_STORE_SRC : VFCOMP_STORE_0; >> + case 1: >> + return isl_format_layouts[format].channels.g.bits ? >> + VFCOMP_STORE_SRC : VFCOMP_STORE_0; >> + case 2: >> + return isl_format_layouts[format].channels.b.bits ? >> + VFCOMP_STORE_SRC : ((isl_format_layouts[format].channels.r.type >> == ISL_RAW) ? >> + VFCOMP_NOSTORE : VFCOMP_STORE_0); >> + case 3: >> + if (isl_format_layouts[format].channels.a.bits) >> + return VFCOMP_STORE_SRC; >> + else >> + switch (isl_format_layouts[format].channels.r.type) { >> + case ISL_RAW: >> + return isl_format_layouts[format].channels.b.bits ? >> + VFCOMP_STORE_0 : VFCOMP_NOSTORE; >> + case ISL_UINT: >> + case ISL_SINT: >> + return VFCOMP_STORE_1_INT; >> + default: >> + return VFCOMP_STORE_1_FP; >> + } >> + default: >> + unreachable("Invalid component"); >> } >> } >> >> @@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, >> >> /* Pull inputs_read out of the VS prog data */ >> const uint64_t inputs_read = vs_prog_data->inputs_read; >> + const uint64_t double_inputs_read = vs_prog_data->double_inputs_read; >> assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); >> const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; >> + const uint32_t elements_double = double_inputs_read >> >> VERT_ATTRIB_GENERIC0; >> >> #if GEN_GEN >= 8 >> /* On BDW+, we only need to allocate space for base ids. Setting up >> @@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline, >> vs_prog_data->uses_baseinstance; >> #endif >> >> - uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem; >> - if (elem_count == 0) >> + uint32_t elem_count = >> + __builtin_popcount(elements) - >> DIV_ROUND_UP(__builtin_popcount(elements_double), >> 2); >> + >> + uint32_t total_elems = elem_count + needs_svgs_elem; >> + if (total_elems == 0) >> return; >> >> uint32_t *p; >> >> - const uint32_t num_dwords = 1 + elem_count * 2; >> + const uint32_t num_dwords = 1 + total_elems * 2; >> p = anv_batch_emitn(&pipeline->batch, num_dwords, >> GENX(3DSTATE_VERTEX_ELEMENTS)); >> memset(p + 1, 0, (num_dwords - 1) * 4); >> @@ -107,7 +119,9 @@ emit_vertex_input(struct anv_pipeline *pipeline, >> if ((elements & (1 << desc->location)) == 0) >> continue; /* Binding unused */ >> >> - uint32_t slot = __builtin_popcount(elements & ((1 << >> desc->location) - 1)); >> + uint32_t slot = >> + __builtin_popcount(elements & ((1 << desc->location) - 1)) - >> + DIV_ROUND_UP(__builtin_popcount(elements_double & ((1 << >> desc->location) - 1)), 2); >> >> struct GENX(VERTEX_ELEMENT_STATE) element = { >> .VertexBufferIndex = desc->binding, >> @@ -137,7 +151,7 @@ emit_vertex_input(struct anv_pipeline *pipeline, >> #endif >> } >> >> - const uint32_t id_slot = __builtin_popcount(elements); >> + const uint32_t id_slot = elem_count; >> if (needs_svgs_elem) { >> /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: >> * "Within a VERTEX_ELEMENT_STATE structure, if a Component >> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c >> b/src/mesa/drivers/dri/i965/brw_compiler.c >> index 1aa72bc..b9eceeb 100644 >> --- a/src/mesa/drivers/dri/i965/brw_compiler.c >> +++ b/src/mesa/drivers/dri/i965/brw_compiler.c >> @@ -55,6 +55,22 @@ static const struct nir_shader_compiler_options >> scalar_nir_options = { >> .lower_unpack_snorm_4x8 = true, >> .lower_unpack_unorm_2x16 = true, >> .lower_unpack_unorm_4x8 = true, >> + .dvec3_consumes_two_locations = false, >> +}; >> + >> +static const struct nir_shader_compiler_options >> vulkan_scalar_nir_options = { >> + COMMON_OPTIONS, >> + .lower_pack_half_2x16 = true, >> + .lower_pack_snorm_2x16 = true, >> + .lower_pack_snorm_4x8 = true, >> + .lower_pack_unorm_2x16 = true, >> + .lower_pack_unorm_4x8 = true, >> + .lower_unpack_half_2x16 = true, >> + .lower_unpack_snorm_2x16 = true, >> + .lower_unpack_snorm_4x8 = true, >> + .lower_unpack_unorm_2x16 = true, >> + .lower_unpack_unorm_4x8 = true, >> + .dvec3_consumes_two_locations = true, >> }; >> >> static const struct nir_shader_compiler_options vector_nir_options = { >> @@ -75,6 +91,7 @@ static const struct nir_shader_compiler_options >> vector_nir_options = { >> .lower_unpack_unorm_2x16 = true, >> .lower_extract_byte = true, >> .lower_extract_word = true, >> + .dvec3_consumes_two_locations = false, >> }; >> >> static const struct nir_shader_compiler_options vector_nir_options_gen6 >> = { >> @@ -92,10 +109,11 @@ static const struct nir_shader_compiler_options >> vector_nir_options_gen6 = { >> .lower_unpack_unorm_2x16 = true, >> .lower_extract_byte = true, >> .lower_extract_word = true, >> + .dvec3_consumes_two_locations = false, >> }; >> >> struct brw_compiler * >> -brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo) >> +brw_compiler_create(void *mem_ctx, const struct gen_device_info >> *devinfo, bool is_vulkan) >> { >> struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); >> >> @@ -138,7 +156,8 @@ brw_compiler_create(void *mem_ctx, const struct >> gen_device_info *devinfo) >> compiler->glsl_compiler_options[i].EmitNoIndirectSampler = >> true; >> >> if (is_scalar) { >> - compiler->glsl_compiler_options[i].NirOptions = >> &scalar_nir_options; >> + compiler->glsl_compiler_options[i].NirOptions = is_vulkan ? >> &vulkan_scalar_nir_options >> + : >> &scalar_nir_options; >> } else { >> compiler->glsl_compiler_options[i].NirOptions = >> devinfo->gen < 6 ? &vector_nir_options : >> &vector_nir_options_gen6; >> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h >> b/src/mesa/drivers/dri/i965/brw_compiler.h >> index 65a7478..34df343 100644 >> --- a/src/mesa/drivers/dri/i965/brw_compiler.h >> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h >> @@ -760,7 +760,7 @@ DEFINE_PROG_DATA_DOWNCAST(sf) >> /** @} */ >> >> struct brw_compiler * >> -brw_compiler_create(void *mem_ctx, const struct gen_device_info >> *devinfo); >> +brw_compiler_create(void *mem_ctx, const struct gen_device_info >> *devinfo, bool is_vulkan); >> >> /** >> * Compile a vertex shader. >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> index 14415bd..00c5bb4 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> @@ -35,10 +35,16 @@ using namespace brw; >> fs_reg * >> fs_visitor::emit_vs_system_value(int location) >> { >> - fs_reg *reg = new(this->mem_ctx) >> - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) + >> - _mesa_bitcount_64(nir->info->d >> ouble_inputs_read)), >> - BRW_REGISTER_TYPE_D); >> + fs_reg *reg; >> + if (nir->options->dvec3_consumes_two_locations) >> + reg = new(this->mem_ctx) >> + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read), >> + BRW_REGISTER_TYPE_D); >> + else >> + reg = new(this->mem_ctx) >> + fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) + >> + _mesa_bitcount_64(nir->info-> >> double_inputs_read)), >> + BRW_REGISTER_TYPE_D); >> struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); >> >> switch (location) { >> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c >> b/src/mesa/drivers/dri/i965/brw_nir.c >> index 763e3ec..daa22c6 100644 >> --- a/src/mesa/drivers/dri/i965/brw_nir.c >> +++ b/src/mesa/drivers/dri/i965/brw_nir.c >> @@ -96,7 +96,7 @@ add_const_offset_to_base(nir_shader *nir, >> nir_variable_mode mode) >> } >> >> static bool >> -remap_vs_attrs(nir_block *block, shader_info *nir_info) >> +remap_vs_attrs(nir_block *block, nir_shader *nir) >> { >> nir_foreach_instr(instr, block) { >> if (instr->type != nir_instr_type_intrinsic) >> @@ -111,10 +111,13 @@ remap_vs_attrs(nir_block *block, shader_info >> *nir_info) >> * before it and counting the bits. >> */ >> int attr = intrin->const_index[0]; >> - int slot = _mesa_bitcount_64(nir_info->inputs_read & >> + int slot = _mesa_bitcount_64(nir->info->inputs_read & >> + BITFIELD64_MASK(attr)); >> + >> + int dslot = 0; >> + if (!nir->options->dvec3_consumes_two_locations) >> + dslot = _mesa_bitcount_64(nir->info->double_inputs_read & >> BITFIELD64_MASK(attr)); >> - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read & >> - BITFIELD64_MASK(attr)); >> intrin->const_index[0] = 4 * (slot + dslot); >> } >> } >> @@ -204,7 +207,10 @@ brw_nir_lower_vs_inputs(nir_shader *nir, >> * loaded as one vec4 or dvec4 per element (or matrix column), >> depending on >> * whether it is a double-precision type or not. >> */ >> - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0); >> + nir_lower_io(nir, >> + nir_var_shader_in, >> + nir->options->dvec3_consumes_two_locations ? >> type_size_vec4 : type_size_vs_input, >> + 0); >> >> /* This pass needs actual constants */ >> nir_opt_constant_folding(nir); >> @@ -220,7 +226,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir, >> nir_foreach_function(function, nir) { >> if (function->impl) { >> nir_foreach_block(block, function->impl) { >> - remap_vs_attrs(block, nir->info); >> + remap_vs_attrs(block, nir); >> } >> } >> } >> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp >> b/src/mesa/drivers/dri/i965/brw_vec4.cpp >> index b9e592f..0aab97f 100644 >> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp >> @@ -2129,6 +2129,9 @@ brw_compile_vs(const struct brw_compiler *compiler, >> void *log_data, >> >> unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read); >> >> + if (shader->options->dvec3_consumes_two_locations) >> + nr_attributes -= DIV_ROUND_UP(_mesa_bitcount_64 >> (prog_data->double_inputs_read), 2); >> + >> /* gl_VertexID and gl_InstanceID are system values, but arrive via an >> * incoming vertex attribute. So, add an extra slot. >> */ >> @@ -2146,9 +2149,13 @@ brw_compile_vs(const struct brw_compiler >> *compiler, void *log_data, >> nr_attributes++; >> } >> >> - unsigned nr_attribute_slots = >> - nr_attributes + >> - _mesa_bitcount_64(shader->info->double_inputs_read); >> + unsigned nr_attribute_slots = nr_attributes; >> + if (shader->options->dvec3_consumes_two_locations) >> + nr_attribute_slots += >> + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), >> 2); >> + else >> + nr_attribute_slots += >> + _mesa_bitcount_64(shader->info->double_inputs_read); >> >> /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB >> Entry >> * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, >> in >> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c >> b/src/mesa/drivers/dri/i965/intel_screen.c >> index e1c3c19..c050f86 100644 >> --- a/src/mesa/drivers/dri/i965/intel_screen.c >> +++ b/src/mesa/drivers/dri/i965/intel_screen.c >> @@ -1688,7 +1688,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen >> *dri_screen) >> ? screenExtensions : intelRobustScreenExtensions; >> >> screen->compiler = brw_compiler_create(screen, >> - &screen->devinfo); >> + &screen->devinfo, >> + false); >> screen->compiler->shader_debug_log = shader_debug_log_mesa; >> screen->compiler->shader_perf_log = shader_perf_log_mesa; >> screen->program_id = 1; >> -- >> 2.7.4 >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev >> > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev