[Mesa-dev] [PATCH 06/23] radv: Add PM4 pregeneration for compute pipelines.

Bas Nieuwenhuizen Tue, 16 Jan 2018 17:36:07 -0800

---
 src/amd/vulkan/radv_cmd_buffer.c | 60 ++----------------------------------
 src/amd/vulkan/radv_pipeline.c   | 66 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 58 deletions(-)


diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index cf4633608a..9bca7aa952 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2172,76 +2172,20 @@ VkResult radv_EndCommandBuffer(
 static void
 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
-       struct radv_shader_variant *compute_shader;
        struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-       struct radv_device *device = cmd_buffer->device;
-       unsigned compute_resource_limits;
-       unsigned waves_per_threadgroup;
-       uint64_t va;
 
        if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
                return;
 
        cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
-       compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-       va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
-
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-                                                          cmd_buffer->cs, 19);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
-       radeon_emit(cmd_buffer->cs, va >> 8);
-       radeon_emit(cmd_buffer->cs, va >> 40);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
-       radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
-       radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
-
+       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->pm4.data_word_count);
+       radeon_emit_array(cmd_buffer->cs, pipeline->pm4.data, pipeline->pm4.data_word_count);
 
        cmd_buffer->compute_scratch_size_needed =
                                  MAX2(cmd_buffer->compute_scratch_size_needed,
                                       pipeline->max_waves * pipeline->scratch_bytes_per_wave);
 
-       /* change these once we have scratch support */
-       radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-                         S_00B860_WAVES(pipeline->max_waves) |
-                         S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
-
-       /* Calculate best compute resource limits. */
-       waves_per_threadgroup =
-               DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
-                            compute_shader->info.cs.block_size[1] *
-                            compute_shader->info.cs.block_size[2], 64);
-       compute_resource_limits =
-               S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
-
-       if (device->physical_device->rad_info.chip_class >= CIK) {
-               unsigned num_cu_per_se =
-                       device->physical_device->rad_info.num_good_compute_units /
-                       device->physical_device->rad_info.max_se;
-
-               /* Force even distribution on all SIMDs in CU if the workgroup
-                * size is 64. This has shown some good improvements if # of
-                * CUs per SE is not a multiple of 4.
-                */
-               if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
-                       compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
-       }
-
-       radeon_set_sh_reg(cmd_buffer->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                         compute_resource_limits);
-
-       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
-       radeon_emit(cmd_buffer->cs,
-                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
-
-       assert(cmd_buffer->cs->cdw <= cdw_max);
-
        if (unlikely(cmd_buffer->device->trace_bo))
                radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
 }
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 4761d6b806..22dd6566a1 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3248,6 +3248,69 @@ VkResult radv_CreateGraphicsPipelines(
        return result;
 }
 
+
+static void
+radv_compute_generate_pm4(struct radv_pipeline *pipeline)
+{
+       struct radv_pm4_builder builder;
+       struct radv_shader_variant *compute_shader;
+       struct radv_device *device = pipeline->device;
+       unsigned compute_resource_limits;
+       unsigned waves_per_threadgroup;
+       uint64_t va;
+
+       radv_pm4_init(&builder, &pipeline->pm4);
+
+       compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+       va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
+
+       radv_pm4_start_reg_set(&builder, R_00B830_COMPUTE_PGM_LO, 2);
+       radv_pm4_emit(&builder, va >> 8);
+       radv_pm4_emit(&builder, va >> 40);
+
+       radv_pm4_start_reg_set(&builder, R_00B848_COMPUTE_PGM_RSRC1, 2);
+       radv_pm4_emit(&builder, compute_shader->rsrc1);
+       radv_pm4_emit(&builder, compute_shader->rsrc2);
+
+       radv_pm4_set_reg(&builder, R_00B860_COMPUTE_TMPRING_SIZE,
+                         S_00B860_WAVES(pipeline->max_waves) |
+                         S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
+
+       /* Calculate best compute resource limits. */
+       waves_per_threadgroup =
+               DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
+                            compute_shader->info.cs.block_size[1] *
+                            compute_shader->info.cs.block_size[2], 64);
+       compute_resource_limits =
+               S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
+
+       if (device->physical_device->rad_info.chip_class >= CIK) {
+               unsigned num_cu_per_se =
+                       device->physical_device->rad_info.num_good_compute_units /
+                       device->physical_device->rad_info.max_se;
+
+               /* Force even distribution on all SIMDs in CU if the workgroup
+                * size is 64. This has shown some good improvements if # of
+                * CUs per SE is not a multiple of 4.
+                */
+               if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
+                       compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
+       }
+
+       radv_pm4_set_reg(&builder, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                         compute_resource_limits);
+
+       radv_pm4_start_reg_set(&builder, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+       radv_pm4_emit(&builder,
+                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
+       radv_pm4_emit(&builder,
+                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
+       radv_pm4_emit(&builder,
+                   S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
+
+       radv_pm4_finish(&builder);
+}
+
 static VkResult radv_compute_pipeline_create(
        VkDevice                                    _device,
        VkPipelineCache                             _cache,
@@ -3281,6 +3344,8 @@ static VkResult radv_compute_pipeline_create(
                return result;
        }
 
+       radv_compute_generate_pm4(pipeline);
+
        *pPipeline = radv_pipeline_to_handle(pipeline);
 
        if (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
@@ -3288,6 +3353,7 @@ static VkResult radv_compute_pipeline_create(
        }
        return VK_SUCCESS;
 }
+
 VkResult radv_CreateComputePipelines(
        VkDevice                                    _device,
        VkPipelineCache                             pipelineCache,
-- 
2.15.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 06/23] radv: Add PM4 pregeneration for compute pipelines.

Reply via email to