On 10.05.2016 05:52, Bas Nieuwenhuizen wrote:
Instead of creating a memory area per patch and per vertex, we put
the same attribute of every vertex & patch together. Most loads
and stores access the same attribute across all lanes, only for
different patches and vertices.

For the TCS this results in tightly packed data for 4-component
stores.

For the TES this is not the case as within a patch the loads
often also access the same vertex. However if there are < 4
vertices/patch, this still results in a reduction of the number
of cache lines. In the LDS situation we only do better than worst
case if the data per patch < 64 bytes, which due to the
tessellation factors is pretty much never.

We do not use hardware swizzling for this. It would slightly reduce
the number of executed VALU instructions, but I had issues with
increased wait times that I haven't been able to solve yet.
Furthermore, the tbuffer_store intrinsic does not support both
VGPR offset and an index, so we have a problem storing
indirectly indexed outputs. This can be solved by temporarily
storing arrays in LDS and then copying them, but I don't think
that is worth the effort. The difference in VALU cycles
hardware swizzling gives is about 0.2% of total busy cycles.
That is without handling the array case.

I chose for attributes instead of components as they are often
accessed together, and the software swizzling takes VALU cycles
for calculating offsets.

Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
---
  src/gallium/drivers/radeonsi/si_shader.c | 143 +++++++++++++++++++++++++++++++
  1 file changed, 143 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index adbee73..90830ee 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -664,6 +664,149 @@ static LLVMValueRef get_dw_address(struct 
si_shader_context *ctx,
                            lp_build_const_int32(gallivm, param * 4), "");
  }

+/* The offchip buffer layout for TCS->TES is
+ *
+ * - attribute 0 of patch 0 vertex 0
+ * - attribute 0 of patch 0 vertex 1
+ * - attribute 0 of patch 0 vertex 2
+ *   ...
+ * - attribute 0 of patch 1 vertex 0
+ * - attribute 0 of patch 1 vertex 1
+ *   ...
+ * - attribute 1 of patch 0 vertex 0
+ * - attribute 1 of patch 0 vertex 1
+ *   ...
+ * - per patch attribute 0 of patch 0 vertex 0
+ * - per patch attribute 0 of patch 0 vertex 1

Should be "... of patch 0" and "... of patch 1", right?

+ *   ...
+ *
+ * Note that every attribute has 4 components.
+ */
+static LLVMValueRef get_buffer_address(struct si_shader_context *ctx,
+                                       LLVMValueRef vertex_index,
+                                       LLVMValueRef param_index)
+{
+       struct gallivm_state *gallivm = 
ctx->radeon_bld.soa.bld_base.base.gallivm;
+       LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
+       LLVMValueRef param_stride, constant16;
+
+       vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 
6);
+       num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
+       total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
+                                     num_patches, "");
+
+       constant16 = lp_build_const_int32(gallivm, 16);
+       if (vertex_index) {
+               base_addr = LLVMBuildMul(gallivm->builder,
+                                         constant16,
+                                        vertices_per_patch, "");
+
+               base_addr = LLVMBuildMul(gallivm->builder, 
get_rel_patch_id(ctx),
+                                        base_addr, "");
+
+               base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+                                        LLVMBuildMul(gallivm->builder, 
vertex_index,
+                                                     constant16, ""), "");

Nitpick, but... apply the distributive law and calculate (rel_patch_id * vertices_per_patch + vertex_idx) * 16.

+
+               param_stride = LLVMBuildMul(gallivm->builder, total_vertices,
+                                           constant16, "");
+       } else {
+               LLVMValueRef patch_data_offset =
+                          unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 
16);
+
+               base_addr = LLVMBuildMul(gallivm->builder, 
get_rel_patch_id(ctx),
+                                        constant16, "");
+
+               base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+                                        patch_data_offset, "");
+
+               param_stride = LLVMBuildMul(gallivm->builder, num_patches,
+                                           constant16, "");
+       }
+
+       base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+                                LLVMBuildMul(gallivm->builder, param_index,
+                                             param_stride, ""), "");
+
+       return base_addr;
+}
+
+static LLVMValueRef get_buffer_address_from_reg(struct si_shader_context *ctx,
+                                       const struct tgsi_full_dst_register 
*dst,
+                                       const struct tgsi_full_src_register 
*src)
+{
+       struct gallivm_state *gallivm = 
ctx->radeon_bld.soa.bld_base.base.gallivm;
+       struct tgsi_shader_info *info = &ctx->shader->selector->info;
+       ubyte *name, *index, *array_first;
+       struct tgsi_full_dst_register reg;
+       LLVMValueRef vertex_index = NULL;
+       LLVMValueRef param_index = NULL;
+       unsigned param_index_base, param_base;
+
+       /* Set the register description. The address computation is the same
+        * for sources and destinations. */
+       if (src) {
+               reg.Register.File = src->Register.File;
+               reg.Register.Index = src->Register.Index;
+               reg.Register.Indirect = src->Register.Indirect;
+               reg.Register.Dimension = src->Register.Dimension;
+               reg.Indirect = src->Indirect;
+               reg.Dimension = src->Dimension;
+               reg.DimIndirect = src->DimIndirect;
+       } else
+               reg = *dst;

You could reverse the roles of src and dst and use tgsi_full_src_register_from_dst, but I don't feel strongly either way.

Apart from the comments above, patches 1-3 and 5-7 are

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>

+
+       /* If the register is 2-dimensional (e.g. an array of vertices
+        * in a primitive), calculate the base address of the vertex. */
+       if (reg.Register.Dimension) {
+
+               if (reg.Dimension.Indirect)
+                       vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
+                                                         reg.Dimension.Index);
+               else
+                       vertex_index = lp_build_const_int32(gallivm,
+                                                           
reg.Dimension.Index);
+       }
+
+       /* Get information about the register. */
+       if (reg.Register.File == TGSI_FILE_INPUT) {
+               name = info->input_semantic_name;
+               index = info->input_semantic_index;
+               array_first = info->input_array_first;
+       } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
+               name = info->output_semantic_name;
+               index = info->output_semantic_index;
+               array_first = info->output_array_first;
+       } else {
+               assert(0);
+               return NULL;
+       }
+
+       if (reg.Register.Indirect) {
+               if (reg.Indirect.ArrayID)
+                       param_base = array_first[reg.Indirect.ArrayID];
+               else
+                       param_base = reg.Register.Index;
+
+               param_index = get_indirect_index(ctx, &reg.Indirect,
+                                                reg.Register.Index - 
param_base);
+
+       } else {
+               param_base = reg.Register.Index;
+               param_index = lp_build_const_int32(gallivm, 0);
+       }
+
+       param_index_base = si_shader_io_get_unique_index(name[param_base],
+                                                        index[param_base]);
+
+       param_index = LLVMBuildAdd(gallivm->builder, param_index,
+                                  lp_build_const_int32(gallivm, 
param_index_base),
+                                  "");
+
+       /* Add the base address of the element. */
+       return get_buffer_address(ctx, vertex_index, param_index);
+}
+
  /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by 
num_channels=1..4.
   * The type of vdata must be one of i32 (num_channels=1), v2i32 
(num_channels=2),
   * or v4i32 (num_channels=3,4). */

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to