Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> --- src/intel/vulkan/anv_cmd_buffer.c | 53 ++++++++++++++++++++------------------ src/intel/vulkan/anv_private.h | 1 - src/intel/vulkan/gen7_cmd_buffer.c | 15 +++++------ src/intel/vulkan/gen8_cmd_buffer.c | 13 ++++------ src/intel/vulkan/genX_cmd_buffer.c | 4 +-- src/intel/vulkan/genX_pipeline.c | 12 ++------- 6 files changed, 43 insertions(+), 55 deletions(-)
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 4d0fd7c..846970ed 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1065,24 +1065,14 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - const unsigned push_constant_data_size = - (local_id_dwords + prog_data->nr_params) * 4; - const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - const unsigned param_aligned_count = - reg_aligned_constant_size / sizeof(uint32_t); - /* If we don't actually have any push constants, bail. */ - if (reg_aligned_constant_size == 0) + if (cs_prog_data->push.total.size == 0) return (struct anv_state) { .offset = 0 }; - const unsigned threads = pipeline->cs_thread_width_max; - const unsigned total_push_constants_size = - reg_aligned_constant_size * threads; const unsigned push_constant_alignment = cmd_buffer->device->info.gen < 8 ? 32 : 64; const unsigned aligned_total_push_constants_size = - ALIGN(total_push_constants_size, push_constant_alignment); + ALIGN(cs_prog_data->push.total.size, push_constant_alignment); struct anv_state state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, aligned_total_push_constants_size, @@ -1091,21 +1081,34 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) /* Walk through the param array and fill the buffer with data */ uint32_t *u32_map = state.map; - brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads, - reg_aligned_constant_size); - - /* Setup uniform data for the first thread */ - for (unsigned i = 0; i < prog_data->nr_params; i++) { - uint32_t offset = (uintptr_t)prog_data->param[i]; - u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset); + if (cs_prog_data->push.cross_thread.size > 0) { + assert(cs_prog_data->thread_local_id_index < 0 || + cs_prog_data->thread_local_id_index >= + cs_prog_data->push.cross_thread.dwords); + for (unsigned i = 0; + i < cs_prog_data->push.cross_thread.dwords; + i++) { + uint32_t offset = (uintptr_t)prog_data->param[i]; + u32_map[i] = *(uint32_t *)((uint8_t *)data + offset); + } } - /* Copy uniform data from the first thread to every other thread */ - const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t); - for (unsigned t = 1; t < threads; t++) { - memcpy(&u32_map[t * param_aligned_count + local_id_dwords], - &u32_map[local_id_dwords], - uniform_data_size); + if (cs_prog_data->push.per_thread.size > 0) { + for (unsigned t = 0; t < cs_prog_data->threads; t++) { + uint32_t *t_u32_map = + &u32_map[8 * (cs_prog_data->push.per_thread.regs * t + + cs_prog_data->push.cross_thread.regs)]; + for (unsigned si = cs_prog_data->push.cross_thread.dwords, di = 0; + si < prog_data->nr_params; + si++, di++) { + if (si != cs_prog_data->thread_local_id_index) { + uint32_t offset = (uintptr_t)prog_data->param[si]; + t_u32_map[di] = *(uint32_t *)((uint8_t *)data + offset); + } else { + t_u32_map[di] = t * cs_prog_data->simd_size; + } + } + } } if (!cmd_buffer->device->info.has_llc) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 7325f3f..26ffbd6 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1474,7 +1474,6 @@ struct anv_pipeline { bool primitive_restart; uint32_t topology; - uint32_t cs_thread_width_max; uint32_t cs_right_mask; struct { diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 331275e..478122b 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -234,12 +234,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = push_state.alloc_size; @@ -264,14 +258,17 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTablePointer = surfaces.offset, .SamplerStatePointer = samplers.offset, .ConstantURBEntryReadLength = - push_constant_regs, -#if !GEN_IS_HASWELL + cs_prog_data->push.per_thread.regs, +#if GEN_IS_HASWELL + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs, +#else .ConstantURBEntryReadOffset = 0, #endif .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - pipeline->cs_thread_width_max); + cs_prog_data->threads); const uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 547fedd..1fdc7d2 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -319,12 +319,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = push_state.alloc_size; @@ -351,12 +345,15 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTableEntryCount = 0, .SamplerStatePointer = samplers.offset, .SamplerCount = 0, - .ConstantIndirectURBEntryReadLength = push_constant_regs, + .ConstantIndirectURBEntryReadLength = + cs_prog_data->push.per_thread.regs, .ConstantURBEntryReadOffset = 0, .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - pipeline->cs_thread_width_max); + cs_prog_data->threads, + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs); uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index e7d322c..d9acf58 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -773,7 +773,7 @@ void genX(CmdDispatch)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.ThreadGroupIDXDimension = x; ggw.ThreadGroupIDYDimension = y; ggw.ThreadGroupIDZDimension = z; @@ -874,7 +874,7 @@ void genX(CmdDispatchIndirect)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.RightExecutionMask = pipeline->cs_right_mask; ggw.BottomExecutionMask = 0xffffffff; } diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 918a9a4..458e80c 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -87,18 +87,9 @@ genX(compute_pipeline_create)( anv_setup_pipeline_l3_config(pipeline); const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; uint32_t group_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; - pipeline->cs_thread_width_max = - DIV_ROUND_UP(group_size, cs_prog_data->simd_size); uint32_t remainder = group_size & (cs_prog_data->simd_size - 1); if (remainder > 0) @@ -107,7 +98,8 @@ genX(compute_pipeline_create)( pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size); const uint32_t vfe_curbe_allocation = - push_constant_regs * pipeline->cs_thread_width_max; + ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + cs_prog_data->push.cross_thread.regs, 2); anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) { vfe.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE]; -- 2.8.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev