I think a better approach would be to translate the NIR variables directly to LLVM variables instead of creating one giant LLVM variable. I have a series coming up that does a similar thing for local variables; I can try to convert shared variables too.
On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <asm...@feralinteractive.com> wrote: > Currently every element in shared memory (including individual elements > of an array) are treated as a vec4. For example, the following: > > shared uint array[1024]; > > gets treated as an array of 1024 vec4s with only the first component > ever used. > > This can be highly wasteful of LDS space. We have a shader which only > really requires ~24KB of shared memory, but since scalars are being > padded to vec4, it actually ends up needing 96KB. This is more than > the hardware supports, and leads to an LLVM compilation failure. > > Therefore, instead pack data in LDS according to the std430 layout. > > Signed-off-by: Alex Smith <asm...@feralinteractive.com> > --- > RFC since I'm not sure whether this is the best approach to handle this, > and there may be something I've missed. > > No regressions seen in Mad Max or Dawn of War 3, which both have a few > shaders which use shared memory. Also no regressions in the CTS > 'dEQP-VK.compute.*' tests. > > I've also checked over various test cases using arrays/structs/etc. in > shared memory, and as far as I can tell the generated LLVM/GCN code is > correct. 
> --- > src/amd/common/ac_nir_to_llvm.c | 55 > ++++++++++++++++++++++------------------- > 1 file changed, 29 insertions(+), 26 deletions(-) > > diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c > index 468ce4d..a48bb3b 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct > nir_to_llvm_context *ctx, > LLVMValueRef ptr; > int addr_space; > > - offset = LLVMConstInt(ctx->i32, idx * 16, false); > + offset = LLVMConstInt(ctx->i32, idx, false); > > ptr = ctx->shared_memory; > ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, ""); > @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct > nir_to_llvm_context *ctx, > > static void > radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref, > - bool vs_in, unsigned *vertex_index_out, > + bool vs_in, bool compute_shared, unsigned > *vertex_index_out, > LLVMValueRef *vertex_index_ref, > unsigned *const_out, LLVMValueRef *indir_out) > { > @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, > nir_deref_var *deref, > if (tail->deref_type == nir_deref_type_array) { > nir_deref_array *deref_array = > nir_deref_as_array(tail); > LLVMValueRef index, stride, local_offset; > - unsigned size = > glsl_count_attribute_slots(tail->type, vs_in); > + > + /* For compute LDS, data is packed rather than > treating > + * each individual element as a vec4. The returned > + * index is used to index into an i32 array so divide > + * by 4. > + */ > + unsigned size = compute_shared ? 
> glsl_get_std430_array_stride(tail->type, false) / 4 > + : > glsl_count_attribute_slots(tail->type, vs_in); > > const_offset += size * deref_array->base_offset; > if (deref_array->deref_array_type == > nir_deref_array_type_direct) > @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, > nir_deref_var *deref, > > for (unsigned i = 0; i < deref_struct->index; i++) { > const struct glsl_type *ft = > glsl_get_struct_field(parent_type, i); > - const_offset += > glsl_count_attribute_slots(ft, vs_in); > + > + /* Same as above. */ > + unsigned size = compute_shared ? > glsl_get_std430_array_stride(ft, false) / 4 > + : > glsl_count_attribute_slots(tail->type, vs_in); > + const_offset += size; > } > } else > unreachable("unsupported deref type"); > @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx, > const bool is_compact = instr->variables[0]->var->data.compact; > param = > shader_io_get_unique_index(instr->variables[0]->var->data.location); > radv_get_deref_offset(ctx, instr->variables[0], > - false, NULL, per_vertex ? &vertex_index : NULL, > + false, false, NULL, per_vertex ? &vertex_index > : NULL, > &const_index, &indir_index); > > stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8); > @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx, > const bool is_compact = instr->variables[0]->var->data.compact; > param = > shader_io_get_unique_index(instr->variables[0]->var->data.location); > radv_get_deref_offset(ctx, instr->variables[0], > - false, NULL, per_vertex ? &vertex_index : NULL, > + false, false, NULL, per_vertex ? &vertex_index > : NULL, > &const_index, &indir_index); > > if (!instr->variables[0]->var->data.patch) { > @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx, > const bool is_compact = instr->variables[0]->var->data.compact; > > radv_get_deref_offset(ctx, instr->variables[0], > - false, NULL, per_vertex ? &vertex_index : NULL, > + false, false, NULL, per_vertex ? 
&vertex_index > : NULL, > &const_index, &indir_index); > > param = > shader_io_get_unique_index(instr->variables[0]->var->data.location); > @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx, > const bool is_compact = instr->variables[0]->var->data.compact; > > radv_get_deref_offset(ctx, instr->variables[0], > - false, NULL, per_vertex ? &vertex_index : NULL, > + false, false, NULL, per_vertex ? &vertex_index > : NULL, > &const_index, &indir_index); > param = > shader_io_get_unique_index(instr->variables[0]->var->data.location); > if (instr->variables[0]->var->data.location == > VARYING_SLOT_CLIP_DIST0 && > @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx, > LLVMValueRef value[4], result; > unsigned vertex_index; > radv_get_deref_offset(ctx, instr->variables[0], > - false, &vertex_index, NULL, > + false, false, &vertex_index, NULL, > &const_index, &indir_index); > vtx_offset_param = vertex_index; > assert(vtx_offset_param < 6); > @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct > nir_to_llvm_context *ctx, > unsigned const_index; > bool vs_in = ctx->stage == MESA_SHADER_VERTEX && > instr->variables[0]->var->data.mode == nir_var_shader_in; > - radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL, > - &const_index, &indir_index); > + bool compute_shared = instr->variables[0]->var->data.mode == > nir_var_shared; > + radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared, > + NULL, NULL, &const_index, &indir_index); > > if (instr->dest.ssa.bit_size == 64) > ve *= 2; > @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct > nir_to_llvm_context *ctx, > LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); > LLVMValueRef derived_ptr; > > - if (indir_index) > - indir_index = LLVMBuildMul(ctx->builder, indir_index, > LLVMConstInt(ctx->i32, 4, false), ""); > - > for (unsigned chan = 0; chan < ve; chan++) { > LLVMValueRef index = LLVMConstInt(ctx->i32, chan, > false); > if 
(indir_index) > @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx, > int writemask = instr->const_index[0]; > LLVMValueRef indir_index; > unsigned const_index; > - radv_get_deref_offset(ctx, instr->variables[0], false, > + bool compute_shared = instr->variables[0]->var->data.mode == > nir_var_shared; > + radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared, > NULL, NULL, &const_index, &indir_index); > > if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) { > @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx, > case nir_var_shared: { > LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); > > - if (indir_index) > - indir_index = LLVMBuildMul(ctx->builder, indir_index, > LLVMConstInt(ctx->i32, 4, false), ""); > - > for (unsigned chan = 0; chan < 8; chan++) { > if (!(writemask & (1 << chan))) > continue; > @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context > *ctx) > > static void > handle_shared_compute_var(struct nir_to_llvm_context *ctx, > - struct nir_variable *variable, uint32_t *offset, > int idx) > + struct nir_variable *variable, uint32_t *offset) > { > - unsigned size = glsl_count_attribute_slots(variable->type, false); > + unsigned size = glsl_get_std430_array_stride(variable->type, false); > variable->data.driver_location = *offset; > *offset += size; > } > @@ -5926,16 +5933,12 @@ LLVMModuleRef > ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, > nir_foreach_variable(variable, &nir->shared) > num_shared++; > if (num_shared) { > - int idx = 0; > uint32_t shared_size = 0; > LLVMValueRef var; > LLVMTypeRef i8p = LLVMPointerType(ctx.i8, > LOCAL_ADDR_SPACE); > - nir_foreach_variable(variable, &nir->shared) { > - handle_shared_compute_var(&ctx, variable, > &shared_size, idx); > - idx++; > - } > + nir_foreach_variable(variable, &nir->shared) > + handle_shared_compute_var(&ctx, variable, > &shared_size); > > - shared_size *= 16; > var = LLVMAddGlobalInAddressSpace(ctx.module, > > 
LLVMArrayType(ctx.i8, shared_size), > "compute_lds", > -- > 2.9.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev