--- src/amd/vulkan/radv_cmd_buffer.c | 60 ++---------------------------------- src/amd/vulkan/radv_pipeline.c | 66 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 58 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index cf4633608a..9bca7aa952 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2172,76 +2172,20 @@ VkResult radv_EndCommandBuffer( static void radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) { - struct radv_shader_variant *compute_shader; struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; - struct radv_device *device = cmd_buffer->device; - unsigned compute_resource_limits; - unsigned waves_per_threadgroup; - uint64_t va; if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) return; cmd_buffer->state.emitted_compute_pipeline = pipeline; - compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset; - - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, 19); - - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cmd_buffer->cs, va >> 8); - radeon_emit(cmd_buffer->cs, va >> 40); - - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cmd_buffer->cs, compute_shader->rsrc1); - radeon_emit(cmd_buffer->cs, compute_shader->rsrc2); - + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->pm4.data_word_count); + radeon_emit_array(cmd_buffer->cs, pipeline->pm4.data, pipeline->pm4.data_word_count); cmd_buffer->compute_scratch_size_needed = MAX2(cmd_buffer->compute_scratch_size_needed, pipeline->max_waves * pipeline->scratch_bytes_per_wave); - /* change these once we have scratch support */ - radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(pipeline->max_waves) | - S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); - - /* Calculate best compute resource limits. */ - waves_per_threadgroup = - DIV_ROUND_UP(compute_shader->info.cs.block_size[0] * - compute_shader->info.cs.block_size[1] * - compute_shader->info.cs.block_size[2], 64); - compute_resource_limits = - S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); - - if (device->physical_device->rad_info.chip_class >= CIK) { - unsigned num_cu_per_se = - device->physical_device->rad_info.num_good_compute_units / - device->physical_device->rad_info.max_se; - - /* Force even distribution on all SIMDs in CU if the workgroup - * size is 64. This has shown some good improvements if # of - * CUs per SE is not a multiple of 4. - */ - if (num_cu_per_se % 4 && waves_per_threadgroup == 1) - compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); - } - - radeon_set_sh_reg(cmd_buffer->cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - compute_resource_limits); - - radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1])); - radeon_emit(cmd_buffer->cs, - S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); - - assert(cmd_buffer->cs->cdw <= cdw_max); - if (unlikely(cmd_buffer->device->trace_bo)) radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE); } diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 4761d6b806..22dd6566a1 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3248,6 +3248,69 @@ VkResult radv_CreateGraphicsPipelines( return result; } + +static void +radv_compute_generate_pm4(struct radv_pipeline *pipeline) +{ + struct radv_pm4_builder builder; + struct radv_shader_variant *compute_shader; + struct radv_device *device = pipeline->device; + unsigned compute_resource_limits; + unsigned waves_per_threadgroup; + uint64_t va; + + radv_pm4_init(&builder, &pipeline->pm4); + + compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; + va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset; + + radv_pm4_start_reg_set(&builder, R_00B830_COMPUTE_PGM_LO, 2); + radv_pm4_emit(&builder, va >> 8); + radv_pm4_emit(&builder, va >> 40); + + radv_pm4_start_reg_set(&builder, R_00B848_COMPUTE_PGM_RSRC1, 2); + radv_pm4_emit(&builder, compute_shader->rsrc1); + radv_pm4_emit(&builder, compute_shader->rsrc2); + + radv_pm4_set_reg(&builder, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(pipeline->max_waves) | + S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); + + /* Calculate best compute resource limits. */ + waves_per_threadgroup = + DIV_ROUND_UP(compute_shader->info.cs.block_size[0] * + compute_shader->info.cs.block_size[1] * + compute_shader->info.cs.block_size[2], 64); + compute_resource_limits = + S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); + + if (device->physical_device->rad_info.chip_class >= CIK) { + unsigned num_cu_per_se = + device->physical_device->rad_info.num_good_compute_units / + device->physical_device->rad_info.max_se; + + /* Force even distribution on all SIMDs in CU if the workgroup + * size is 64. This has shown some good improvements if # of + * CUs per SE is not a multiple of 4. + */ + if (num_cu_per_se % 4 && waves_per_threadgroup == 1) + compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); + } + + radv_pm4_set_reg(&builder, R_00B854_COMPUTE_RESOURCE_LIMITS, + compute_resource_limits); + + radv_pm4_start_reg_set(&builder, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radv_pm4_emit(&builder, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0])); + radv_pm4_emit(&builder, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1])); + radv_pm4_emit(&builder, + S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); + + radv_pm4_finish(&builder); +} + static VkResult radv_compute_pipeline_create( VkDevice _device, VkPipelineCache _cache, @@ -3281,6 +3344,8 @@ static VkResult radv_compute_pipeline_create( return result; } + radv_compute_generate_pm4(pipeline); + *pPipeline = radv_pipeline_to_handle(pipeline); if (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) { @@ -3288,6 +3353,7 @@ static VkResult radv_compute_pipeline_create( } return VK_SUCCESS; } + VkResult radv_CreateComputePipelines( VkDevice _device, VkPipelineCache pipelineCache, -- 2.15.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev