On 30 January 2017 at 03:26, Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> wrote:
> v2: Create the descriptor BO with CPU access.
>
> Signed-off-by: Bas Nieuwenhuizen <ba...@google.com>

Tom has pushed the spill to llvm 4.0 as well now, so with the checks
changed to 4.0 for the series:

Reviewed-by: Dave Airlie <airl...@redhat.com>
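A note for anyone following along: this patch reads
device->llvm_supports_spill, which is set earlier in the series rather
than in this diff. With the spill support now in LLVM 4.0, the gate
presumably ends up looking something like the sketch below (HAVE_LLVM
is Mesa's packed LLVM version macro; the exact placement during device
creation is my assumption, not part of this patch):

    /* Hypothetical sketch: enable scratch/spill support only when the
     * compiler can actually spill, i.e. LLVM >= 4.0. HAVE_LLVM packs
     * the version as 0xMMmm, so 4.0 is 0x0400. */
    device->llvm_supports_spill = HAVE_LLVM >= 0x0400;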
> ---
>  src/amd/vulkan/radv_device.c   | 186 ++++++++++++++++++++++++++++++++++++++++-
>  src/amd/vulkan/radv_pipeline.c |  11 +--
>  src/amd/vulkan/radv_private.h  |   8 ++
>  3 files changed, 199 insertions(+), 6 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 0df9e783af9..6d3a8100f74 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -32,6 +32,7 @@
>  #include <fcntl.h>
>  #include <sys/stat.h>
>  #include "radv_private.h"
> +#include "radv_cs.h"
>  #include "util/strtod.h"
>
>  #include <xf86drm.h>
> @@ -752,6 +753,15 @@ radv_queue_finish(struct radv_queue *queue)
>  {
>  	if (queue->hw_ctx)
>  		queue->device->ws->ctx_destroy(queue->hw_ctx);
> +
> +	if (queue->preamble_cs)
> +		queue->device->ws->cs_destroy(queue->preamble_cs);
> +	if (queue->descriptor_bo)
> +		queue->device->ws->buffer_destroy(queue->descriptor_bo);
> +	if (queue->scratch_bo)
> +		queue->device->ws->buffer_destroy(queue->scratch_bo);
> +	if (queue->compute_scratch_bo)
> +		queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
>  }
>
>  VkResult radv_CreateDevice(
> @@ -1001,6 +1011,159 @@ static void radv_dump_trace(struct radv_device *device,
>  	fclose(f);
>  }
>
> +static VkResult
> +radv_get_preamble_cs(struct radv_queue *queue,
> +                     uint32_t scratch_size,
> +                     uint32_t compute_scratch_size,
> +                     struct radeon_winsys_cs **preamble_cs)
> +{
> +	struct radeon_winsys_bo *scratch_bo = NULL;
> +	struct radeon_winsys_bo *descriptor_bo = NULL;
> +	struct radeon_winsys_bo *compute_scratch_bo = NULL;
> +	struct radeon_winsys_cs *cs = NULL;
> +
> +	if (!scratch_size && !compute_scratch_size) {
> +		*preamble_cs = NULL;
> +		return VK_SUCCESS;
> +	}
> +
> +	if (scratch_size <= queue->scratch_size &&
> +	    compute_scratch_size <= queue->compute_scratch_size) {
> +		*preamble_cs = queue->preamble_cs;
> +		return VK_SUCCESS;
> +	}
> +
> +	if (scratch_size > queue->scratch_size) {
> +		scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
> +		                                              scratch_size,
> +		                                              4096,
> +		                                              RADEON_DOMAIN_VRAM,
> +		                                              RADEON_FLAG_NO_CPU_ACCESS);
> +		if (!scratch_bo)
> +			goto fail;
> +	} else
> +		scratch_bo = queue->scratch_bo;
> +
> +	if (compute_scratch_size > queue->compute_scratch_size) {
> +		compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
> +		                                                      compute_scratch_size,
> +		                                                      4096,
> +		                                                      RADEON_DOMAIN_VRAM,
> +		                                                      RADEON_FLAG_NO_CPU_ACCESS);
> +		if (!compute_scratch_bo)
> +			goto fail;
> +
> +	} else
> +		compute_scratch_bo = queue->compute_scratch_bo;
> +
> +	if (scratch_bo != queue->scratch_bo) {
> +		descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
> +		                                                 8,
> +		                                                 4096,
> +		                                                 RADEON_DOMAIN_VRAM,
> +		                                                 RADEON_FLAG_CPU_ACCESS);
> +		if (!descriptor_bo)
> +			goto fail;
> +	} else
> +		descriptor_bo = queue->descriptor_bo;
> +
> +	cs = queue->device->ws->cs_create(queue->device->ws,
> +	                                  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
> +	if (!cs)
> +		goto fail;
> +
> +
> +	if (scratch_bo)
> +		queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
> +
> +	if (descriptor_bo)
> +		queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
> +
> +	if (descriptor_bo != queue->descriptor_bo) {
> +		uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
> +		uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +		                 S_008F04_SWIZZLE_ENABLE(1);
> +
> +		uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
> +
> +		map[0] = scratch_va;
> +		map[1] = rsrc1;
> +
> +		queue->device->ws->buffer_unmap(descriptor_bo);
> +	}
> +
> +	if (descriptor_bo) {
> +		uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
> +		                   R_00B130_SPI_SHADER_USER_DATA_VS_0,
> +		                   R_00B230_SPI_SHADER_USER_DATA_GS_0,
> +		                   R_00B330_SPI_SHADER_USER_DATA_ES_0,
> +		                   R_00B430_SPI_SHADER_USER_DATA_HS_0,
> +		                   R_00B530_SPI_SHADER_USER_DATA_LS_0};
> +
> +		uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
> +
> +		for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
> +			radeon_set_sh_reg_seq(cs, regs[i], 2);
> +			radeon_emit(cs, va);
> +			radeon_emit(cs, va >> 32);
> +		}
> +	}
> +
> +	if (compute_scratch_bo) {
> +		uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
> +		uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +		                 S_008F04_SWIZZLE_ENABLE(1);
> +
> +		queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
> +
> +		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
> +		radeon_emit(cs, scratch_va);
> +		radeon_emit(cs, rsrc1);
> +	}
> +
> +	if (!queue->device->ws->cs_finalize(cs))
> +		goto fail;
> +
> +	if (queue->preamble_cs)
> +		queue->device->ws->cs_destroy(queue->preamble_cs);
> +
> +	queue->preamble_cs = cs;
> +
> +	if (scratch_bo != queue->scratch_bo) {
> +		if (queue->scratch_bo)
> +			queue->device->ws->buffer_destroy(queue->scratch_bo);
> +		queue->scratch_bo = scratch_bo;
> +		queue->scratch_size = scratch_size;
> +	}
> +
> +	if (compute_scratch_bo != queue->compute_scratch_bo) {
> +		if (queue->compute_scratch_bo)
> +			queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
> +		queue->compute_scratch_bo = compute_scratch_bo;
> +		queue->compute_scratch_size = compute_scratch_size;
> +	}
> +
> +	if (descriptor_bo != queue->descriptor_bo) {
> +		if (queue->descriptor_bo)
> +			queue->device->ws->buffer_destroy(queue->descriptor_bo);
> +
> +		queue->descriptor_bo = descriptor_bo;
> +	}
> +
> +	*preamble_cs = cs;
> +	return VK_SUCCESS;
> +fail:
> +	if (cs)
> +		queue->device->ws->cs_destroy(cs);
> +	if (descriptor_bo && descriptor_bo != queue->descriptor_bo)
> +		queue->device->ws->buffer_destroy(descriptor_bo);
> +	if (scratch_bo && scratch_bo != queue->scratch_bo)
> +		queue->device->ws->buffer_destroy(scratch_bo);
> +	if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
> +		queue->device->ws->buffer_destroy(compute_scratch_bo);
> +	return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> +}
> +
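Worth spelling out why the descriptor BO exists at all, since that is
what the v2 CPU-access change is about: the rsrc words are written by
the CPU through buffer_map, the graphics stages then receive a 64-bit
pointer to those words in their first two user SGPRs, while compute
receives the two rsrc dwords directly. A side-by-side sketch condensed
from the function above (the summary comments are mine):

    /* Graphics: point each stage's user SGPRs at the descriptor BO;
     * the shader loads the scratch rsrc words from there. */
    radeon_set_sh_reg_seq(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0, 2);
    radeon_emit(cs, va);        /* descriptor BO VA, low 32 bits */
    radeon_emit(cs, va >> 32);  /* descriptor BO VA, high 32 bits */

    /* Compute: no indirection; the rsrc words themselves go into the
     * user SGPRs. */
    radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
    radeon_emit(cs, scratch_va);  /* rsrc dword 0: scratch VA[31:0] */
    radeon_emit(cs, rsrc1);       /* rsrc dword 1: VA[39:32] | swizzle */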
>  VkResult radv_QueueSubmit(
>  	VkQueue                                     _queue,
>  	uint32_t                                    submitCount,
> @@ -1013,6 +1176,27 @@ VkResult radv_QueueSubmit(
>  	struct radeon_winsys_ctx *ctx = queue->hw_ctx;
>  	int ret;
>  	uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
> +	uint32_t scratch_size = 0;
> +	uint32_t compute_scratch_size = 0;
> +	struct radeon_winsys_cs *preamble_cs = NULL;
> +	VkResult result;
> +
> +	/* Do this first so failing to allocate scratch buffers can't result in
> +	 * partially executed submissions. */
> +	for (uint32_t i = 0; i < submitCount; i++) {
> +		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
> +			RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
> +			                 pSubmits[i].pCommandBuffers[j]);
> +
> +			scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
> +			compute_scratch_size = MAX2(compute_scratch_size,
> +			                            cmd_buffer->compute_scratch_size_needed);
> +		}
> +	}
> +
> +	result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, &preamble_cs);
> +	if (result != VK_SUCCESS)
> +		return result;
>
>  	for (uint32_t i = 0; i < submitCount; i++) {
>  		struct radeon_winsys_cs **cs_array;
> @@ -1045,7 +1229,7 @@ VkResult radv_QueueSubmit(
>  			*queue->device->trace_id_ptr = 0;
>
>  			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
> -			                                   advance, NULL,
> +			                                   advance, preamble_cs,
>  			                                   (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
>  			                                   b ? pSubmits[i].waitSemaphoreCount : 0,
>  			                                   (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
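One comment on the submit path: gathering the scratch sizes across all
command buffers before the first cs_submit means an allocation failure
returns cleanly, with nothing partially executed. The winsys is then
expected to run the preamble ahead of each submission; roughly like
the following (this is my reading of the cs_submit contract, not code
from this patch):

    /* Hypothetical winsys-side sketch: place the preamble IB first in
     * every kernel submission, so the scratch registers are programmed
     * before any user commands execute. */
    struct radeon_winsys_cs *ibs[count + 1];
    unsigned n = 0;
    if (preamble_cs)
            ibs[n++] = preamble_cs;
    for (unsigned i = 0; i < count; i++)
            ibs[n++] = cs_array[i];
    /* ... submit ibs[0..n) to the kernel as one job ... */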
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index e332877e2ba..3f90fb3016f 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -363,12 +363,13 @@ static void radv_fill_shader_variant(struct radv_device *device,
>  					     struct ac_shader_binary *binary,
>  					     gl_shader_stage stage)
>  {
> -	variant->code_size = binary->code_size;
>  	bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
>  	unsigned vgpr_comp_cnt = 0;
>
> -	if (scratch_enabled)
> -		radv_finishme("shader scratch space");
> +	if (scratch_enabled && !device->llvm_supports_spill)
> +		radv_finishme("shader scratch support only available with LLVM 5.0");
> +
> +	variant->code_size = binary->code_size;
>
>  	switch (stage) {
>  	case MESA_SHADER_VERTEX:
> @@ -433,8 +434,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
>  	options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
>  	options.family = chip_family;
>  	options.chip_class = device->physical_device->rad_info.chip_class;
> -	options.supports_spill = false;
> -	tm = ac_create_target_machine(chip_family, false);
> +	options.supports_spill = device->llvm_supports_spill;
> +	tm = ac_create_target_machine(chip_family, options.supports_spill);
>  	ac_compile_nir_shader(tm, &binary, &variant->config,
>  			      &variant->info, shader, &options, dump);
>  	LLVMDisposeTargetMachine(tm);
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 88e05595380..fac5b97153d 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -467,6 +467,14 @@ struct radv_queue {
>  	struct radeon_winsys_ctx *hw_ctx;
>  	int queue_family_index;
>  	int queue_idx;
> +
> +	uint32_t scratch_size;
> +	uint32_t compute_scratch_size;
> +
> +	struct radeon_winsys_bo *scratch_bo;
> +	struct radeon_winsys_bo *descriptor_bo;
> +	struct radeon_winsys_bo *compute_scratch_bo;
> +	struct radeon_winsys_cs *preamble_cs;
>  };
>
>  struct radv_device {
> --
> 2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev