[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
It's common in some applications to bind a new graphics pipeline without ending up changing any context registers. This makes a pipeline have two command buffers: one for setting context registers and one for everything else. The context register command buffer is only emitted if it differs from the previous pipeline's. Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 46 +-- src/amd/vulkan/radv_pipeline.c | 217 --- src/amd/vulkan/radv_private.h| 2 + 3 files changed, 150 insertions(+), 115 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f41d6c0b3e7..59903ab64d8 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, } } -static void +static bool radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) { @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->sample_positions_needed = true; if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples) - return; + return false; radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2); radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } + + return true; } static void @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) radeon_emit(cmd_buffer->cs, sx_blend_opt_control); } -static void +static bool radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) - return; + return false; - radv_update_multisample_state(cmd_buffer, pipeline); + bool context_roll = 
radv_update_multisample_state(cmd_buffer, pipeline); cmd_buffer->scratch_size_needed = MAX2(cmd_buffer->scratch_size_needed, @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); + if (!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, + pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw); + context_roll = true; + } + for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { if (!pipeline->shaders[i]) continue; @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.emitted_pipeline = pipeline; cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; + + return context_roll; } static void @@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) return; + assert(!pipeline->ctx_cs.cdw); + cmd_buffer->state.emitted_compute_pipeline = pipeline; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); @@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, * any context registers. */ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, -bool indexed_draw) +bool indexed_draw, +bool pipeline_context_roll) { struct radv_cmd_state *state = &cmd_buffer->state; if (!cmd_buffer->device->physical_device->has_scissor_bug) return false; + if (pipeline_context_roll) + return true; + uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; /* Index, vertex and streamout buffers don't change context regs, and -* pipeline is handle
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
I did and found small improvements in Rise of the Tomb Raider. I measured framerates ~104.3% that of without the changes for the Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% for Prophets Tomb. I found no change with Dota 2 but I've heard it's cpu-bound. On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset wrote: > > Did you benchmark? > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > It's common in some applications to bind a new graphics pipeline without > > ending up changing any context registers. > > > > This has a pipline have two command buffers: one for setting context > > registers and one for everything else. The context register command buffer > > is only emitted if it differs from the previous pipeline's. > > > > Signed-off-by: Rhys Perry > > --- > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > src/amd/vulkan/radv_pipeline.c | 217 --- > > src/amd/vulkan/radv_private.h| 2 + > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > b/src/amd/vulkan/radv_cmd_buffer.c > > index f41d6c0b3e7..59903ab64d8 100644 > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > > *cmd_buffer, > > } > > } > > > > -static void > > +static bool > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > struct radv_pipeline *pipeline) > > { > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > > *cmd_buffer, > > cmd_buffer->sample_positions_needed = true; > > > > if (old_pipeline && num_samples == > > old_pipeline->graphics.ms.num_samples) > > - return; > > + return false; > > > > radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, > > 2); > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > > *cmd_buffer, > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 
0)); > > radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | > > EVENT_INDEX(0)); > > } > > + > > + return true; > > } > > > > static void > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > *cmd_buffer) > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > } > > > > -static void > > +static bool > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > { > > struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > > > > if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > > - return; > > + return false; > > > > - radv_update_multisample_state(cmd_buffer, pipeline); > > + bool context_roll = radv_update_multisample_state(cmd_buffer, > > pipeline); > > > > cmd_buffer->scratch_size_needed = > > MAX2(cmd_buffer->scratch_size_needed, > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > *cmd_buffer) > > > > radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); > > > > + if (!cmd_buffer->state.emitted_pipeline || > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != > > pipeline->ctx_cs.cdw || > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != > > pipeline->ctx_cs_hash || > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { > > + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, > > pipeline->ctx_cs.cdw); > > + context_roll = true; > > + } > > + > > for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { > > if (!pipeline->shaders[i]) > > continue; > > @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > *cmd_buffer) > > cmd_buffer->state.emitted_pipeline = pipeline; > > > > cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; > > + > > + return context_roll;
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
This is with Rise of the Tomb Raider's graphics settings set to "High" by the way. On Mon, 14 Jan 2019 at 16:12, Rhys Perry wrote: > > I did and found small improvements in Rise of the Tomb Raider. I > measured framerates ~104.3% that of without the changes for the > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > for Prophets Tomb. > > I found no change with Dota 2 but I've heard it's cpu-bound. > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > wrote: > > > > Did you benchmark? > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > It's common in some applications to bind a new graphics pipeline without > > > ending up changing any context registers. > > > > > > This has a pipline have two command buffers: one for setting context > > > registers and one for everything else. The context register command buffer > > > is only emitted if it differs from the previous pipeline's. > > > > > > Signed-off-by: Rhys Perry > > > --- > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > src/amd/vulkan/radv_pipeline.c | 217 --- > > > src/amd/vulkan/radv_private.h| 2 + > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > index f41d6c0b3e7..59903ab64d8 100644 > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > > > *cmd_buffer, > > > } > > > } > > > > > > -static void > > > +static bool > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > struct radv_pipeline *pipeline) > > > { > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > > > *cmd_buffer, > > > cmd_buffer->sample_positions_needed = true; > > > > > > if (old_pipeline && num_samples == > > > old_pipeline->graphics.ms.num_samples) > > > - return; > > > + return false; > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > 
> > R_028BDC_PA_SC_LINE_CNTL, 2); > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > > > *cmd_buffer, > > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > > > radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) > > > | EVENT_INDEX(0)); > > > } > > > + > > > + return true; > > > } > > > > > > static void > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > > *cmd_buffer) > > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > > } > > > > > > -static void > > > +static bool > > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > > { > > > struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > > > > > > if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > > > - return; > > > + return false; > > > > > > - radv_update_multisample_state(cmd_buffer, pipeline); > > > + bool context_roll = radv_update_multisample_state(cmd_buffer, > > > pipeline); > > > > > > cmd_buffer->scratch_size_needed = > > > MAX2(cmd_buffer->scratch_size_needed, > > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > > *cmd_buffer) > > > > > > radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, > > > pipeline->cs.cdw); > > > > > > + if (!cmd_buffer->state.emitted_pipeline || > > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != > > > pipeline->ctx_cs.cdw || > > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != > > > pipeline->ctx_cs_hash || > > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, > > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { > > >
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Sure On Mon, 14 Jan 2019 at 16:50, Samuel Pitoiset wrote: > > While you are at it, can you experiment with the tracked ctx stuff that > RadeonSI implements (i.e. SI_TRACKED_XXX)? > > This approach will likely be more costly from the CPU side, but it will > reduce the number of register changes a lot more. > > Not sure if that will improve anything though, but I think it's worth > trying? > > On 1/14/19 5:12 PM, Rhys Perry wrote: > > I did and found small improvements in Rise of the Tomb Raider. I > > measured framerates ~104.3% of those without the changes for the > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > for Prophets Tomb. > > > > I found no change with Dota 2 but I've heard it's CPU-bound. > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > wrote: > >> Did you benchmark? > >> > >> On 1/14/19 5:01 PM, Rhys Perry wrote: > >>> It's common in some applications to bind a new graphics pipeline without > >>> ending up changing any context registers. > >>> > >>> This makes a pipeline have two command buffers: one for setting context > >>> registers and one for everything else. The context register command buffer > >>> is only emitted if it differs from the previous pipeline's. 
> >>> > >>> Signed-off-by: Rhys Perry > >>> --- > >>>src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > >>>src/amd/vulkan/radv_pipeline.c | 217 --- > >>>src/amd/vulkan/radv_private.h| 2 + > >>>3 files changed, 150 insertions(+), 115 deletions(-) > >>> > >>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c > >>> b/src/amd/vulkan/radv_cmd_buffer.c > >>> index f41d6c0b3e7..59903ab64d8 100644 > >>> --- a/src/amd/vulkan/radv_cmd_buffer.c > >>> +++ b/src/amd/vulkan/radv_cmd_buffer.c > >>> @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>} > >>>} > >>> > >>> -static void > >>> +static bool > >>>radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > >>> struct radv_pipeline *pipeline) > >>>{ > >>> @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>cmd_buffer->sample_positions_needed = true; > >>> > >>>if (old_pipeline && num_samples == > >>> old_pipeline->graphics.ms.num_samples) > >>> - return; > >>> + return false; > >>> > >>>radeon_set_context_reg_seq(cmd_buffer->cs, > >>> R_028BDC_PA_SC_LINE_CNTL, 2); > >>>radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > >>> @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > >>>radeon_emit(cmd_buffer->cs, > >>> EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); > >>>} > >>> + > >>> + return true; > >>>} > >>> > >>>static void > >>> @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > >>> *cmd_buffer) > >>>radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > >>>} > >>> > >>> -static void > >>> +static bool > >>>radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > >>>{ > >>>struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > >>> > >>>if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > >>> - return; > >>> + return false; > >>> > >>> - radv_update_multisample_state(cmd_buffer, 
pipeline); > >>> + bool context_roll = radv_update_multisample_state(cmd_buffer, > >>> pipeline); > >>> > >>>cmd_buffer->scratch_size_needed = > >>> MAX2(cmd_buffer->scratch_size_needed, > >>> @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> > >>>radeon_emit_array(c
[Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change
DXVK often sets dynamic state without actually changing it. Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 92 ++-- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 59903ab64d8..56b3c934c2e 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport( assert(firstViewport < MAX_VIEWPORTS); assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); + if (!memcmp(state->dynamic.viewport.viewports + firstViewport, + pViewports, viewportCount * sizeof(*pViewports))) { + return; + } + memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports)); @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor( assert(firstScissor < MAX_SCISSORS); assert(total_count >= 1 && total_count <= MAX_SCISSORS); + if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors, + scissorCount * sizeof(*pScissors))) { + return; + } + memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors)); @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth( float lineWidth) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + if (cmd_buffer->state.dynamic.line_width == lineWidth) + return; + cmd_buffer->state.dynamic.line_width = lineWidth; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; } @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias( float depthBiasSlopeFactor) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor; - cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp; - cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor; + if (state->dynamic.depth_bias.bias == depthBiasConstantFactor && + state->dynamic.depth_bias.clamp == depthBiasClamp && + 
state->dynamic.depth_bias.slope == depthBiasSlopeFactor) { + return; + } - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; + state->dynamic.depth_bias.bias = depthBiasConstantFactor; + state->dynamic.depth_bias.clamp = depthBiasClamp; + state->dynamic.depth_bias.slope = depthBiasSlopeFactor; + + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; } void radv_CmdSetBlendConstants( @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants( const float blendConstants[4]) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - memcpy(cmd_buffer->state.dynamic.blend_constants, - blendConstants, sizeof(float) * 4); + if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4)) + return; + + memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4); - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; } void radv_CmdSetDepthBounds( @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds( float maxDepthBounds) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; - cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; + if (state->dynamic.depth_bounds.min == minDepthBounds && + state->dynamic.depth_bounds.max == maxDepthBounds) { + return; + } + + state->dynamic.depth_bounds.min = minDepthBounds; + state->dynamic.depth_bounds.max = maxDepthBounds; - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; } void radv_CmdSetStencilCompareMask( @@ -3045,13 +3075,21 @@ void radv_CmdSetStencilCompareMask( uint32_tcompareMask) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; + bool front_same = state->dynamic.stencil_compare_mask.front == 
compareMask; + bool back_same = state->dyn
Re: [Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change
I misread some code and forgot to remove it. It was always unrelated to this patch. On Wed, 16 Jan 2019 at 00:22, Bas Nieuwenhuizen wrote: > > On Tue, Jan 15, 2019 at 10:59 PM Rhys Perry wrote: > > > > DXVK often sets dynamic state without actually changing it. > > > > Signed-off-by: Rhys Perry > > --- > > src/amd/vulkan/radv_cmd_buffer.c | 92 ++-- > > 1 file changed, 76 insertions(+), 16 deletions(-) > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > b/src/amd/vulkan/radv_cmd_buffer.c > > index 59903ab64d8..56b3c934c2e 100644 > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport( > > assert(firstViewport < MAX_VIEWPORTS); > > assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); > > > > + if (!memcmp(state->dynamic.viewport.viewports + firstViewport, > > + pViewports, viewportCount * sizeof(*pViewports))) { > > + return; > > + } > > + > > memcpy(state->dynamic.viewport.viewports + firstViewport, > > pViewports, > >viewportCount * sizeof(*pViewports)); > > > > @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor( > > assert(firstScissor < MAX_SCISSORS); > > assert(total_count >= 1 && total_count <= MAX_SCISSORS); > > > > + if (!memcmp(state->dynamic.scissor.scissors + firstScissor, > > pScissors, > > + scissorCount * sizeof(*pScissors))) { > > + return; > > + } > > + > > memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, > >scissorCount * sizeof(*pScissors)); > > > > @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth( > > float lineWidth) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); > > + > > + if (cmd_buffer->state.dynamic.line_width == lineWidth) > > + return; > > + > > cmd_buffer->state.dynamic.line_width = lineWidth; > > cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; > > } > > @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias( > > float depthBiasSlopeFactor) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, 
commandBuffer); > > + struct radv_cmd_state *state = &cmd_buffer->state; > > > > - cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor; > > - cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp; > > - cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor; > > + if (state->dynamic.depth_bias.bias == depthBiasConstantFactor && > > + state->dynamic.depth_bias.clamp == depthBiasClamp && > > + state->dynamic.depth_bias.slope == depthBiasSlopeFactor) { > > + return; > > + } > > > > - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; > > + state->dynamic.depth_bias.bias = depthBiasConstantFactor; > > + state->dynamic.depth_bias.clamp = depthBiasClamp; > > + state->dynamic.depth_bias.slope = depthBiasSlopeFactor; > > + > > + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; > > } > > > > void radv_CmdSetBlendConstants( > > @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants( > > const float blendConstants[4]) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); > > + struct radv_cmd_state *state = &cmd_buffer->state; > > > > - memcpy(cmd_buffer->state.dynamic.blend_constants, > > - blendConstants, sizeof(float) * 4); > > + if (!memcmp(state->dynamic.blend_constants, blendConstants, > > sizeof(float) * 4)) > > + return; > > + > > + memcpy(state->dynamic.blend_constants, blendConstants, > > sizeof(float) * 4); > > > > - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; > > + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; > > } > > > > void radv_CmdSetDepthBounds( > > @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds( > > float maxDepthB
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
I did a before/after comparison during development with multiple runs but only 1 before and after run to produce the numbers I sent. They seemed to match up well enough to the runs during development, so I wasn't too concerned. IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. The kernel/distro was 4.19.13 and Fedora 29. Also "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to "performance" and "/sys/class/drm/card*/device/power_dpm_force_performance_level" was set to "high" while running. I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I get anything too different. On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen wrote: > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry wrote: > > > > I did and found small improvements in Rise of the Tomb Raider. I > > measured framerates ~104.3% that of without the changes for the > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > for Prophets Tomb. > > My main question would be what the statistical significance is. e.g. > did you do one run of each, did you do multiple, and what was your > test setup? > > Just curious because I have tried the exact same thing before and > could not find anything more than noise. > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > wrote: > > > > > > Did you benchmark? > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > It's common in some applications to bind a new graphics pipeline without > > > > ending up changing any context registers. > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > registers and one for everything else. The context register command > > > > buffer > > > > is only emitted if it differs from the previous pipeline's. 
> > > > > > > > Signed-off-by: Rhys Perry > > > > --- > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > src/amd/vulkan/radv_pipeline.c | 217 --- > > > > src/amd/vulkan/radv_private.h| 2 + > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > } > > > > } > > > > > > > > -static void > > > > +static bool > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > struct radv_pipeline *pipeline) > > > > { > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > cmd_buffer->sample_positions_needed = true; > > > > > > > > if (old_pipeline && num_samples == > > > > old_pipeline->graphics.ms.num_samples) > > > > - return; > > > > + return false; > > > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > > > > R_028BDC_PA_SC_LINE_CNTL, 2); > > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > > > > radeon_emit(cmd_buffer->cs, > > > > EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); > > > > } > > > > + > > > > + return true; > > > > } > > > > > > > > static void > > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > > > *cmd_buffer) > > > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > > > } > > > > > > > > -static void > > > > +static bool > > > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > > > { > > > > struct radv_pipeline *pipeline =
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Rise of the Tomb Raider from without to with the change (average of 3 runs): SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%) ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%) GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%) So not much improvement (if any). On Wed, 16 Jan 2019 at 00:39, Rhys Perry wrote: > > I did a before/after comparison during development with multiple runs > but only 1 before and after run to produce the numbers I sent. They > seemed to match up well enough to the runs during development, so I > wasn't too concerned. > > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. > The kernel/distro was 4.19.13 and Fedora 29. Also > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to > "performance" and > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was > set to "high" while running. > > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I > get anything too different. > > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen > wrote: > > > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry wrote: > > > > > > I did and found small improvements in Rise of the Tomb Raider. I > > > measured framerates ~104.3% that of without the changes for the > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > > for Prophets Tomb. > > > > My main question would be what the statistical significance is. e.g. > > did you do one run of each, did you do multiple, and what was your > > test setup? > > > > Just curious because I have tried the exact same thing before and > > could not find anything more than noise. > > > > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > > wrote: > > > > > > > > Did you benchmark? 
> > > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > > It's common in some applications to bind a new graphics pipeline > > > > > without > > > > > ending up changing any context registers. > > > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > > registers and one for everything else. The context register command > > > > > buffer > > > > > is only emitted if it differs from the previous pipeline's. > > > > > > > > > > Signed-off-by: Rhys Perry > > > > > --- > > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > > src/amd/vulkan/radv_pipeline.c | 217 > > > > > --- > > > > > src/amd/vulkan/radv_private.h| 2 + > > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > } > > > > > } > > > > > > > > > > -static void > > > > > +static bool > > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > > struct radv_pipeline *pipeline) > > > > > { > > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > cmd_buffer->sample_positions_needed = true; > > > > > > > > > > if (old_pipeline && num_samples == > > > > > old_pipeline->graphics.ms.num_samples) > > > > > - return; > > > > > + return false; > > > > > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > > > > > R_028BDC_PA_SC_LINE_CNTL, 2); > > > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > radeon_emit(cmd_buffer-&g
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Seems I accidentally had it use Fedora 29's mesa build in both the before and after runs... Running again I get (again, average of 3 runs): GeothermalValley: 58.2 fps -> 59.633 fps (+2.5%) ProphetsTomb: 59 fps -> 60 fps (+1.7%) SpineOfTheMountain: 64 fps -> 64.06667 fps (+0.1%) (1 extreme from "before" run excluded) Sorry for the noise. On Wed, 16 Jan 2019 at 11:46, Rhys Perry wrote: > > Rise of the Tomb Raider from without to with the change (average of 3 runs): > SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%) > ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%) > GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%) > > So not much improvement (if any). > > On Wed, 16 Jan 2019 at 00:39, Rhys Perry wrote: > > > > I did a before/after comparison during development with multiple runs > > but only 1 before and after run to produce the numbers I sent. They > > seemed to match up well enough to the runs during development, so I > > wasn't too concerned. > > > > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. > > The kernel/distro was 4.19.13 and Fedora 29. Also > > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to > > "performance" and > > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was > > set to "high" while running. > > > > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I > > get anything too different. > > > > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen > > wrote: > > > > > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry > > > wrote: > > > > > > > > I did and found small improvements in Rise of the Tomb Raider. I > > > > measured framerates ~104.3% that of without the changes for the > > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > > > for Prophets Tomb. > > > > > > My main question would be what the statistical significance is. e.g. > > > did you do one run of each, did you do multiple, and what was your > > > test setup? 
> > > > > > Just curious because I have tried the exact same thing before and > > > could not find anything more than noise. > > > > > > > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > > > wrote: > > > > > > > > > > Did you benchmark? > > > > > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > > > It's common in some applications to bind a new graphics pipeline > > > > > > without > > > > > > ending up changing any context registers. > > > > > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > > > registers and one for everything else. The context register command > > > > > > buffer > > > > > > is only emitted if it differs from the previous pipeline's. > > > > > > > > > > > > Signed-off-by: Rhys Perry > > > > > > --- > > > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > > > src/amd/vulkan/radv_pipeline.c | 217 > > > > > > --- > > > > > > src/amd/vulkan/radv_private.h| 2 + > > > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > > > radv_cmd_buffer *cmd_buffer, > > > > > > } > > > > > > } > > > > > > > > > > > > -static void > > > > > > +static bool > > > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > > > struct radv_pipeline *pipeline) > > > > > > { > > > > > > @@ -646,7 +646,7 @@ radv_update_multisamp
[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for EXT_shader_image_load_formatted
v3: rebase Signed-off-by: Rhys Perry Reviewed-by: Marek Olšák (v2) --- src/mesa/state_tracker/st_extensions.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 4628079260..b713eed969 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -717,6 +717,7 @@ void st_init_extensions(struct pipe_screen *screen, { o(ARB_shader_clock), PIPE_CAP_TGSI_CLOCK }, { o(ARB_shader_draw_parameters), PIPE_CAP_DRAW_PARAMETERS }, { o(ARB_shader_group_vote),PIPE_CAP_TGSI_VOTE }, + { o(EXT_shader_image_load_formatted), PIPE_CAP_IMAGE_LOAD_FORMATTED }, { o(ARB_shader_stencil_export),PIPE_CAP_SHADER_STENCIL_EXPORT }, { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS }, { o(ARB_shader_texture_lod), PIPE_CAP_SM3 }, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 1/5] gallium: add support for formatted image loads
v3: rebase v3: make use of u_pipe_screen_get_param_defaults Signed-off-by: Rhys Perry --- src/gallium/auxiliary/util/u_screen.c | 1 + src/gallium/docs/source/screen.rst | 1 + src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 + src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 + src/gallium/drivers/swr/swr_screen.cpp | 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/include/pipe/p_defines.h | 1 + 8 files changed, 8 insertions(+) diff --git a/src/gallium/auxiliary/util/u_screen.c b/src/gallium/auxiliary/util/u_screen.c index c14edde859..470632f5ec 100644 --- a/src/gallium/auxiliary/util/u_screen.c +++ b/src/gallium/auxiliary/util/u_screen.c @@ -314,6 +314,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen, case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_TGSI_ATOMFADD: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index 9b75a407db..b2d0c401d5 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -483,6 +483,7 @@ The integer capabilities: * ``PIPE_CAP_TGSI_ATOMFADD``: Atomic floating point adds are supported on images, buffers, and shared memory. * ``PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND``: True if the driver needs blend state to use zero/one instead of destination alpha for RGB/XRGB formats. +* ``PIPE_CAP_IMAGE_LOAD_FORMATTED``: True if a format for image loads does not need to be specified in the shader IR .. 
_pipe_capf: diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2b69a8f696..d6e0f43f6c 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -243,6 +243,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE: case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index d83926f2b1..ff92012894 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -310,6 +310,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: case PIPE_CAP_TGSI_ATOMFADD: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index f5f3cf..b7cf2cd2e4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -334,6 +334,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index de9008ddf6..38b76366cb 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -364,6 +364,7 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) case 
PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: return 32; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c27..22de60f02c 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -293,6 +293,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: +case PIPE_SHADER_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_SHADER_CAP_SCALAR_ISA: return 1; diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index ae53c723c7..5c0652d7a9 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -854,6 +854,7 @@ enum pipe_cap PIPE_CAP_TGSI_ATOMFADD
[Mesa-dev] [PATCH v3 0/5] nvc0: Implement EXT_shader_image_load_formatted
This patch series implements EXT_shader_image_load_formatted on Maxwell+. It should implement all of the spec except, if the extension is enabled, passing image variables without a format qualifier to atomic operations will not raise a compilation error like it should. This is because knowing the format used in an image operation before function inlining can be difficult, because formats don't have to (and currently can't) be specified in the parameter declaration. So this series leaves this issue to hopefully be resolved in a later patch. I tested the second version of this series when it was released in June 2018 but I can't easily test this version. Nothing changed too much though so it should be fine. v2: change from PIPE_SHADER_CAP_* to PIPE_CAP_* v2: fix broken feature detection in the state tracker v2: move code in AlgebraicOpt::handleSULDP() to nv50_ir_ra.cpp v3: rebase v3: make use of u_pipe_screen_get_param_defaults v3: move RA code into its own function Rhys Perry (5): gallium: add support for formatted image loads mesa,glsl: add support for EXT_shader_image_load_formatted st/mesa: add support for EXT_shader_image_load_formatted nv50/ir: use suld.p on GM107+ nvc0,nv50/ir: enable support for formatted image loads on GM107+ src/compiler/glsl/ast_to_hir.cpp | 5 +++ src/compiler/glsl/glsl_parser_extras.cpp | 1 + src/compiler/glsl/glsl_parser_extras.h| 7 src/gallium/auxiliary/util/u_screen.c | 1 + src/gallium/docs/source/screen.rst| 1 + src/gallium/drivers/nouveau/codegen/nv50_ir.h | 4 +++ .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 --- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++ .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 + .../drivers/nouveau/nv30/nv30_screen.c| 1 + .../drivers/nouveau/nv50/nv50_screen.c| 1 + .../drivers/nouveau/nvc0/nvc0_screen.c| 2 ++ src/gallium/drivers/swr/swr_screen.cpp| 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/include/pipe/p_defines.h | 1 + 
src/mesa/main/extensions_table.h | 1 + src/mesa/main/mtypes.h| 1 + src/mesa/state_tracker/st_extensions.c| 1 + 19 files changed, 100 insertions(+), 14 deletions(-) -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 5/5] nvc0, nv50/ir: enable support for formatted image loads on GM107+
v3: rebase Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +-- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c| 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 295497be2f..6c134962b4 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -2414,12 +2414,11 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su) bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), TYPE_U32, bld.mkImm(0), loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); - if (su->op != OP_SUSTP && su->tex.format) { + if (su->op != OP_SUSTP && su->tex.format && su->tex.format->components > 0) { const TexInstruction::ImgFormatDesc *format = su->tex.format; int blockwidth = format->bits[0] + format->bits[1] + format->bits[2] + format->bits[3]; - assert(format->components != 0); // make sure that the format doesn't mismatch when it's not FMT_NONE bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), TYPE_U32, bld.loadImm(NULL, blockwidth / 8), diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index b7cf2cd2e4..c47502cae1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -288,6 +288,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return class_3d >= GM200_3D_CLASS; case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: return class_3d >= GP100_3D_CLASS; + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + return class_3d >= GM107_3D_CLASS; /* unsupported caps */ case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: @@ -334,7 +336,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case 
PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted
v3: rebase Signed-off-by: Rhys Perry Reviewed-by: Marek Olšák (v2) --- src/compiler/glsl/ast_to_hir.cpp | 5 + src/compiler/glsl/glsl_parser_extras.cpp | 1 + src/compiler/glsl/glsl_parser_extras.h | 7 +++ src/mesa/main/extensions_table.h | 1 + src/mesa/main/mtypes.h | 1 + 5 files changed, 15 insertions(+) diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 67a5a8c050..d9a57d37f6 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -3476,6 +3476,11 @@ apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual, } var->data.image_format = qual->image_format; + } else if (state->has_image_load_formatted()) { + if (var->data.mode == ir_var_uniform && + state->EXT_shader_image_load_formatted_warn) { + _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used"); + } } else { if (var->data.mode == ir_var_uniform) { if (state->es_shader) { diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 2048a7f900..1e035e94d8 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -721,6 +721,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(EXT_separate_shader_objects), EXT(EXT_shader_framebuffer_fetch), EXT(EXT_shader_framebuffer_fetch_non_coherent), + EXT(EXT_shader_image_load_formatted), EXT(EXT_shader_implicit_conversions), EXT(EXT_shader_integer_mix), EXT_AEP(EXT_shader_io_blocks), diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index b17b5125e0..63a5cca5d2 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -344,6 +344,11 @@ struct _mesa_glsl_parse_state { return ARB_bindless_texture_enable; } + bool has_image_load_formatted() const + { + return EXT_shader_image_load_formatted_enable; + } + bool has_implicit_conversions() const { return 
EXT_shader_implicit_conversions_enable || is_version(120, 0); @@ -816,6 +821,8 @@ struct _mesa_glsl_parse_state { bool EXT_shader_framebuffer_fetch_warn; bool EXT_shader_framebuffer_fetch_non_coherent_enable; bool EXT_shader_framebuffer_fetch_non_coherent_warn; + bool EXT_shader_image_load_formatted_enable; + bool EXT_shader_image_load_formatted_warn; bool EXT_shader_implicit_conversions_enable; bool EXT_shader_implicit_conversions_warn; bool EXT_shader_integer_mix_enable; diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index dad38124d5..c3eb019f81 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -264,6 +264,7 @@ EXT(EXT_separate_shader_objects , dummy_true EXT(EXT_separate_specular_color , dummy_true , GLL, x , x , x , 1997) EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch , GLL, GLC, x , ES2, 2013) EXT(EXT_shader_framebuffer_fetch_non_coherent, EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC, x, ES2, 2018) +EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted , GLL, GLC, x , x , 2014) EXT(EXT_shader_implicit_conversions , dummy_true , x , x , x , 31, 2013) EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, x , 30, 2013) EXT(EXT_shader_io_blocks, dummy_true , x , x , x , 31, 2014) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 241c2b92f7..bd90727e26 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -4264,6 +4264,7 @@ struct gl_extensions GLboolean EXT_render_snorm; GLboolean EXT_semaphore; GLboolean EXT_semaphore_fd; + GLboolean EXT_shader_image_load_formatted; GLboolean EXT_shader_integer_mix; GLboolean EXT_shader_samples_identical; GLboolean EXT_stencil_two_side; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 4/5] nv50/ir: use suld.p on GM107+
v3: rebase v3: move RA code into its own function Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 4 +++ .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 --- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++ .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 + 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 8085bb2f54..2388f3923c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -224,6 +224,10 @@ enum operation #define NV50_IR_SUBOP_SULD_ZERO0 #define NV50_IR_SUBOP_SULD_TRAP1 #define NV50_IR_SUBOP_SULD_SDCL3 +// These three are only for GM107+ and are set during register allocation +#define NV50_IR_SUBOP_SULDP_RGBA (0 << 2) +#define NV50_IR_SUBOP_SULDP_RG (1 << 2) +#define NV50_IR_SUBOP_SULDP_R (2 << 2) #define NV50_IR_SUBOP_SUBFM_3D 1 #define NV50_IR_SUBOP_SUCLAMP_2D 0x10 #define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 
0x10 : 0)) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index be00db3131..d7f4380b34 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -3257,26 +3257,36 @@ void CodeEmitterGM107::emitSULDx() { const TexInstruction *insn = this->insn->asTex(); - int type = 0; emitInsn(0xeb00); if (insn->op == OP_SULDB) emitField(0x34, 1, 1); emitSUTarget(); - switch (insn->dType) { - case TYPE_S8: type = 1; break; - case TYPE_U16: type = 2; break; - case TYPE_S16: type = 3; break; - case TYPE_U32: type = 4; break; - case TYPE_U64: type = 5; break; - case TYPE_B128: type = 6; break; - default: - assert(insn->dType == TYPE_U8); - break; + if (insn->op == OP_SULDB) { + int type = 0; + switch (insn->dType) { + case TYPE_S8: type = 1; break; + case TYPE_U16: type = 2; break; + case TYPE_S16: type = 3; break; + case TYPE_U32: type = 4; break; + case TYPE_U64: type = 5; break; + case TYPE_B128: type = 6; break; + default: + assert(insn->dType == TYPE_U8); + break; + } + emitField(0x14, 3, type); + } else { + int type = 0; + switch (insn->subOp & 0xc) { + case NV50_IR_SUBOP_SULDP_R:type = 0x1; break; + case NV50_IR_SUBOP_SULDP_RG: type = 0x3; break; + case NV50_IR_SUBOP_SULDP_RGBA: type = 0xf; break; + } + emitField(0x14, 4, type); } emitLDSTc(0x18); - emitField(0x14, 3, type); emitGPR (0x00, insn->def(0)); emitGPR (0x08, insn->src(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index 5dcbf3c3e0..43011c23af 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -246,6 +246,16 @@ static const char *xmadOpCModeStr[] = "clo", "chi", "csfu", "cbcc" }; +static const char *suldOpStr[] = +{ + "zero", "trap", "sdcl" +}; + +static const char *suldSwizzleOpStr[] = +{ + 
"rgba", "rg", "r" +}; + static const char *DataTypeStr[] = { "-", @@ -672,6 +682,13 @@ void Instruction::print() const PRINT("h%d ", (subOp & NV50_IR_SUBOP_XMAD_H1(i)) ? 1 : 0); break; } + case OP_SULDB: + case OP_SULDP: + if ((subOp & 0x3) < ARRAY_SIZE(suldOpStr)) +PRINT("%s ", suldOpStr[subOp & 0x3]); + if (op == OP_SULDP && subOp >> 2 < (int)ARRAY_SIZE(suldSwizzleOpStr)) +PRINT("%s ", suldSwizzleOpStr[subOp >> 2]); + break; default: if (subOp) PRINT("(SUBOP:%u) ", subOp); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 322b79fe62..8e57bda254 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -264,6 +264,7 @@ private: void addHazard(Instruction *i, const ValueRef *src); void textureMask(TexInstruction *); + void suldpMask(TexInstruction *); void addConstraint(Instruction *, int s, int n); bool detectConflict(Instruction *, int s); @@ -1996,6 +1997,33 @@ RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex) tex->setDef(c, NULL); } +void +RegAlloc::InsertConstraintsPass::suldpMask(TexInstruction *tex) +{ + int max = 0; +
[Mesa-dev] [PATCH 1/2] radv: pass radv_draw_info to radv_emit_draw_registers()
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 118 +++ 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f41d6c0b3e7..f430b4f20dd 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2074,10 +2074,60 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); } +struct radv_draw_info { + /** +* Number of vertices. +*/ + uint32_t count; + + /** +* Index of the first vertex. +*/ + int32_t vertex_offset; + + /** +* First instance id. +*/ + uint32_t first_instance; + + /** +* Number of instances. +*/ + uint32_t instance_count; + + /** +* First index (indexed draws only). +*/ + uint32_t first_index; + + /** +* Whether it's an indexed draw. +*/ + bool indexed; + + /** +* Indirect draw parameters resource. +*/ + struct radv_buffer *indirect; + uint64_t indirect_offset; + uint32_t stride; + + /** +* Draw count parameters resource. +*/ + struct radv_buffer *count_buffer; + uint64_t count_buffer_offset; + + /** +* Stream output parameters resource. +*/ + struct radv_buffer *strmout_buffer; + uint64_t strmout_buffer_offset; +}; + static void -radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, -bool instanced_draw, bool indirect_draw, -uint32_t draw_vertex_count) +radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, +const struct radv_draw_info *draw_info) { struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; struct radv_cmd_state *state = &cmd_buffer->state; @@ -2087,8 +2137,9 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, /* Draw state. 
*/ ia_multi_vgt_param = - si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, - indirect_draw, draw_vertex_count); + si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, + draw_info->indirect, + draw_info->indirect ? 0 : draw_info->count); if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { if (info->chip_class >= GFX9) { @@ -2108,7 +2159,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, /* Primitive restart. */ primitive_reset_en = - indexed_draw && state->pipeline->graphics.prim_restart_enable; + draw_info->indexed && state->pipeline->graphics.prim_restart_enable; if (primitive_reset_en != state->last_primitive_reset_en) { state->last_primitive_reset_en = primitive_reset_en; @@ -3411,57 +3462,6 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, } } -struct radv_draw_info { - /** -* Number of vertices. -*/ - uint32_t count; - - /** -* Index of the first vertex. -*/ - int32_t vertex_offset; - - /** -* First instance id. -*/ - uint32_t first_instance; - - /** -* Number of instances. -*/ - uint32_t instance_count; - - /** -* First index (indexed draws only). -*/ - uint32_t first_index; - - /** -* Whether it's an indexed draw. -*/ - bool indexed; - - /** -* Indirect draw parameters resource. -*/ - struct radv_buffer *indirect; - uint64_t indirect_offset; - uint32_t stride; - - /** -* Draw count parameters resource. -*/ - struct radv_buffer *count_buffer; - uint64_t count_buffer_offset; - - /** -* Stream output parameters resource. -*/ - struct radv_buffer *strmout_buffer; - uint64_t strmout_buffer_offset; -}; - static void radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) @@ -3672,9 +3672,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, radv_cmd_buffer_flush_dynamic_state(cmd_buffer); - radv_emit_draw_registers(cmd_buffer, info->indexed, -info->instance_count > 1, info->indirect, -info->indirect ? 
0 : info->count); + radv_emit_draw_registers(cmd_
[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
It's common in some applications to bind a new graphics pipeline without ending up changing any context registers. This has a pipeline have two command buffers: one for setting context registers and one for everything else. The context register command buffer is only emitted if it differs from the previous pipeline's. v2: ensure late scissor emission is done when radv_emit_rbplus_state() is called v2: make use of cmd_buffer->state.workaround_scissor_bug Signed-off-by: Rhys Perry --- This second version depends on the patch "radv: add missed situations for scissor bug workaround". src/amd/vulkan/radv_cmd_buffer.c | 30 - src/amd/vulkan/radv_pipeline.c | 217 --- src/amd/vulkan/radv_private.h| 2 + 3 files changed, 141 insertions(+), 108 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 6d538d7e88a..f406a3a42f3 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -857,10 +859,13 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); } + /* TODO: avoid redundantly setting context registers */ radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); radeon_emit(cmd_buffer->cs, sx_ps_downconvert); radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon); radeon_emit(cmd_buffer->cs, sx_blend_opt_control); + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -884,6 +889,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); + if 
(!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, + pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw); + cmd_buffer->state.workaround_scissor_bug = true; + } + for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { if (!pipeline->shaders[i]) continue; @@ -2939,6 +2953,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) return; + assert(!pipeline->ctx_cs.cdw); + cmd_buffer->state.emitted_compute_pipeline = pipeline; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); @@ -3630,20 +3646,16 @@ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; /* Index, vertex and streamout buffers don't change context regs, and -* pipeline is handled later. +* pipeline is already handled. */ used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE); - /* Assume all state changes except these two can imply context rolls. */ if (cmd_buffer->state.dirty & used_states) return true; - if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) - return true; - if (info->indexed && state->pipeline->graphics.prim_restart_enable && (state->index_type ? 
0xffffffffu : 0xffffu) != state->last_primitive_reset_index) return true; @@ -3655,7 +3667,7 @@ static void radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) { - bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info); + bool late_scissor_emission; if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipel
[Mesa-dev] [PATCH 2/2] radv: add missed situations for scissor bug workaround
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 65 src/amd/vulkan/radv_private.h| 2 + 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f430b4f20dd..6d538d7e88a 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -920,6 +920,8 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.dynamic.scissor.scissors, cmd_buffer->state.dynamic.viewport.viewports, cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband); + + cmd_buffer->state.workaround_scissor_bug = false; } static void @@ -1217,6 +1219,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, radv_update_zrange_precision(cmd_buffer, &att->ds, image, layout, false); } + + cmd_buffer->state.workaround_scissor_bug = true; } /** @@ -1442,6 +1446,8 @@ radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); radeon_emit(cs, color_values[0]); radeon_emit(cs, color_values[1]); + + cmd_buffer->state.workaround_scissor_bug = true; } /** @@ -1704,6 +1710,8 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) } radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -2185,6 +2193,27 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, state->last_primitive_reset_index = primitive_reset_index; } } + + if (draw_info->strmout_buffer) { + uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo); + + va += draw_info->strmout_buffer->offset + + draw_info->strmout_buffer_offset; + + radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, + draw_info->stride); + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG) | + 
COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); + radeon_emit(cs, 0); /* unused */ + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo); + } } static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, @@ -3470,27 +3499,6 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys *ws = cmd_buffer->device->ws; struct radeon_cmdbuf *cs = cmd_buffer->cs; - if (info->strmout_buffer) { - uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo); - - va += info->strmout_buffer->offset + - info->strmout_buffer_offset; - - radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, - info->stride); - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_REG) | - COPY_DATA_WR_CONFIRM); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); - radeon_emit(cs, 0); /* unused */ - - radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo); - } - if (info->indirect) { uint64_t va = radv_buffer_get_va(info->indirect->bo); uint64_t count_va = 0; @@ -3609,13 +3617,16 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, * any context registers. */ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, -bool indexed_draw) +const struct radv_draw_info *info) { struct radv_cmd_state *state = &cmd_buffer->state; if (!cmd_buffer->device->physical_device->has_scissor_bug) return false; + if (cmd_buffer->state.workaround_scissor_bug || info->strmout_buffer) + return true; + uint32_
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
It currently requires review (and possibly rebasing). Marek Olšák sent some feedback for a few of the patches but other than that, it hasn't gotten much attention. Also patch 35 seems to vectorize 32-bit code which can help or hurt shaders quite a bit and seems to hurt shaders overall. I'm not yet sure how to solve this without removing it or changing the result of LLVM's SLP vectorizer significantly. IIRC enabling the SLP vectorizer also uncovered an RA bug with a shader. I think I'll look into the issues with patch 35 again. On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset wrote: > > What's the status of this? > > On 12/7/18 6:21 PM, Rhys Perry wrote: > > This series add support for: > > - VK_KHR_shader_float16_int8 > > - VK_AMD_gpu_shader_half_float > > - VK_AMD_gpu_shader_int16 > > - VK_KHR_8bit_storage > > on VI+. Half floats are currently disabled on LLVM 7 because of a bug > > causing large memory usage and long (or unbounded) compilation times with > > some tests. > > > > It depends on the follow patch series: > > - https://patchwork.freedesktop.org/series/53454/ > > - https://patchwork.freedesktop.org/series/53602/ > > - https://patchwork.freedesktop.org/series/53660/ > > > > An older version was tested on my Polaris card, but due to hardware issues > > I currently can't test the latest version of the series. > > > > deqp-vk has no regressions and none of the newly enabled tests fail. 
> > > > Rhys Perry (38): > >ac: add various helpers for float16/int16/int8 > >ac/nir: implement 8-bit push constant, ssbo and ubo loads > >ac/nir: implement 8-bit ssbo stores > >ac/nir: fix 16-bit ssbo stores > >ac/nir: implement 8-bit nir_load_const_instr > >ac/nir: implement 8-bit conversions > >ac/nir: fix 64-bit nir_op_f2f16_rtz > >ac/nir: make ac_build_clamp work on all bit sizes > >ac/nir: make ac_build_fract work on all bit sizes > >ac/nir: make ac_build_isign work on all bit sizes > >ac/nir: make ac_build_fsign work on all bit sizes > >ac/nir: make ac_build_fdiv support 16-bit floats > >ac/nir: implement half-float nir_op_frcp > >ac/nir: implement half-float nir_op_frsq > >ac/nir: implement half-float nir_op_ldexp > >radv: lower 16-bit flrp > >ac/nir: support half floats in emit_b2f > >ac/nir: make emit_b2i work on all bit sizes > >ac/nir: implement 16-bit shifts > >compiler/nir: add lowering option for 16-bit ffma > >ac/nir: implement 16-bit ac_build_ddxy > >ac/nir: implement 8 and 16 bit ac_build_readlane > >nir: make bitfield_reverse and ifind_msb work with all integers > >ac/nir: make ac_find_lsb work on all bit sizes > >ac/nir: make ac_build_umsb work on all bit sizes > >ac/nir: implement 8 and 16 bit ac_build_imsb > >ac/nir: make ac_build_bit_count work on all bit sizes > >ac/nir: make ac_build_bitfield_reverse work on all bit sizes > >ac/nir: implement 16-bit pack/unpack opcodes > >ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type > >ac/nir,radv: create an array of varying output types > >ac/nir: store all outputs as f32 > >radv: store all fragment shader inputs as f32 > >radv: handle all fragment output types > >ac,radv: run LLVM's SLP vectorizer > >ac/nir: generate better code for nir_op_f2f16_rtz > >ac/nir: have nir_op_f2f16 round to zero > >radv: expose float16, int16 and int8 features and extensions > > > > src/amd/common/ac_llvm_build.c| 355 ++ > > src/amd/common/ac_llvm_build.h| 22 +- > > src/amd/common/ac_llvm_util.c | 9 +- > > 
src/amd/common/ac_llvm_util.h | 1 + > > src/amd/common/ac_nir_to_llvm.c | 258 +++ > > src/amd/common/ac_shader_abi.h| 1 + > > src/amd/vulkan/radv_device.c | 17 ++ > > src/amd/vulkan/radv_extensions.py | 4 + > > src/amd/vulkan/radv_nir_to_llvm.c | 92 --- > > src/amd/vulkan/radv_shader.c | 7 + > > src/broadcom/compiler/nir_to_vir.c| 1 + > > src/compiler/nir/nir.h| 1 + > > src/compiler/nir/nir_opcodes.py | 4 +- > > src/compiler/nir/nir_opt_algebraic.py | 4 +- > > src/gallium/drivers/radeonsi/si_get.c | 1 + > > src/gallium/drivers/vc4/vc4_program.c | 1 + > > 16 files changed, 516 insertions(+), 262 deletions(-) > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
Quite a few of the patches aren't specific to a single extension as many make code size-generic and some of the extensions intersect in functionality. It might still be possible to roughly order the patches by functionality but I'm not sure if it would be very useful (possible order in attachment). I didn't look at the actual content of the patches when creating the attachment; this is from memory and looking at the descriptions. Would you like me to send out a v2 of this series ordered like that? On Tue, 12 Feb 2019 at 17:08, Samuel Pitoiset wrote: > > How about splitting this series in four different parts? One for every > extension? Is this doable without too much troubles? > > On 2/12/19 6:02 PM, Rhys Perry wrote: > > It currently requires review (and possibly rebasing). Marek Olšák send > > some feedback for a few of the patches but other than that, it hasn't > > gotten much attention. > > > > Also patch 35 seems to vectorize 32-bit code which can help or hurt > > shaders quite a bit and seems to hurt shaders overall. I'm not yet > > sure how to solve this without removing it or changing the result of > > LLVM's SLP vectorizer significantly. > > IIRC enabling SLP vectorizer also uncovered a RA bug with a shader. > > > > I think I'll look into the issues with patch 35 again. > > > > On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset > > wrote: > >> What's the status of this? > >> > >> On 12/7/18 6:21 PM, Rhys Perry wrote: > >>> This series add support for: > >>> - VK_KHR_shader_float16_int8 > >>> - VK_AMD_gpu_shader_half_float > >>> - VK_AMD_gpu_shader_int16 > >>> - VK_KHR_8bit_storage > >>> on VI+. Half floats are currently disabled on LLVM 7 because of a bug > >>> causing large memory usage and long (or unbounded) compilation times with > >>> some tests. 
> >>> > >>> It depends on the follow patch series: > >>> - https://patchwork.freedesktop.org/series/53454/ > >>> - https://patchwork.freedesktop.org/series/53602/ > >>> - https://patchwork.freedesktop.org/series/53660/ > >>> > >>> An older version was tested on my Polaris card, but due to hardware issues > >>> I currently can't test the latest version of the series. > >>> > >>> deqp-vk has no regressions and none of the newly enabled tests fail. > >>> > >>> Rhys Perry (38): > >>> ac: add various helpers for float16/int16/int8 > >>> ac/nir: implement 8-bit push constant, ssbo and ubo loads > >>> ac/nir: implement 8-bit ssbo stores > >>> ac/nir: fix 16-bit ssbo stores > >>> ac/nir: implement 8-bit nir_load_const_instr > >>> ac/nir: implement 8-bit conversions > >>> ac/nir: fix 64-bit nir_op_f2f16_rtz > >>> ac/nir: make ac_build_clamp work on all bit sizes > >>> ac/nir: make ac_build_fract work on all bit sizes > >>> ac/nir: make ac_build_isign work on all bit sizes > >>> ac/nir: make ac_build_fsign work on all bit sizes > >>> ac/nir: make ac_build_fdiv support 16-bit floats > >>> ac/nir: implement half-float nir_op_frcp > >>> ac/nir: implement half-float nir_op_frsq > >>> ac/nir: implement half-float nir_op_ldexp > >>> radv: lower 16-bit flrp > >>> ac/nir: support half floats in emit_b2f > >>> ac/nir: make emit_b2i work on all bit sizes > >>> ac/nir: implement 16-bit shifts > >>> compiler/nir: add lowering option for 16-bit ffma > >>> ac/nir: implement 16-bit ac_build_ddxy > >>> ac/nir: implement 8 and 16 bit ac_build_readlane > >>> nir: make bitfield_reverse and ifind_msb work with all integers > >>> ac/nir: make ac_find_lsb work on all bit sizes > >>> ac/nir: make ac_build_umsb work on all bit sizes > >>> ac/nir: implement 8 and 16 bit ac_build_imsb > >>> ac/nir: make ac_build_bit_count work on all bit sizes > >>> ac/nir: make ac_build_bitfield_reverse work on all bit sizes > >>> ac/nir: implement 16-bit pack/unpack opcodes > >>> ac/nir: add 8-bit and 16-bit types 
to glsl_base_to_llvm_type > >>> ac/nir,radv: create an array of varying output types > >>> ac/nir: store all outputs as f32 > >>> radv: store all fragment shader inputs as f32 > >>> radv: handle all fragment output types > >>> ac,ra
[Mesa-dev] [PATCH v2 01/41] radv: bitcast 16-bit outputs to integers
16-bit outputs are stored as 16-bit floats in the outputs array, so they have to be bitcast. Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 7f74678d5f1..a8268c44ecf 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2365,7 +2365,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildZExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2376,7 +2376,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildSExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 89a78b43c6f..b260142c177 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, } else if (num_bytes == 2) { store_name = "llvm.amdgcn.tbuffer.store.i32"; data_type = ctx->ac.i32; + data = LLVMBuildBitCast(ctx->ac.builder, data, ctx->ac.i16, ""); + data = LLVMBuildZExt(ctx->ac.builder, data, data_type, ""); LLVMValueRef tbuffer_params[] = { data, rsrc, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
This series adds support for: - VK_KHR_shader_float16_int8 - VK_AMD_gpu_shader_half_float - VK_AMD_gpu_shader_int16 - VK_KHR_8bit_storage on VI+. Half floats are disabled on LLVM 7 because of a bug causing large memory usage and long (or unbounded) compilation times with some CTS tests. It is written against the following patch series: - https://patchwork.freedesktop.org/series/53454/ (v4) - https://patchwork.freedesktop.org/series/53660/ (v1) With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega and VI except for dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.* which fail or crash because of unrelated radv bugs with 64-bit varyings and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even though radv does not support it. With LLVM 9, there are no reproducible piglit regressions except for glsl-array-bounds-12.shader_test because of an LLVM bug when SLP vectorization is enabled. With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega and VI except for those with LLVM 9 and a couple of tests because of an LLVM bug after the SLP vectorizer and with the current lack of fallback for 16-bit interpolation on LLVM versions before LLVM 9. With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega and VI except for those with LLVM 9 and a couple of tests because of an LLVM bug after the SLP vectorizer. The SLP vectorization patch is marked as WIP because it exposes LLVM bugs with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and a shader-db test for a game I can't remember. It also over-vectorizes 32-bit code which can cause significant worsening in generated code quality. The 16-bit interpolation patch is marked as WIP because it currently requires intrinsics only available in LLVM 9 and does not have a fallback. 
A branch on Github containing this series can be found at: https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2 v2: rebase v2: implement 16-bit interpolation v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass v2: run vectorization unconditionally on GFX9 and later v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() v2: remove ac_int_of_size() v2: fix 64-bit visit_load_var() v2: mark VK_KHR_8bit_storage as DONE in features.txt v2: mark SLP vectorization patch as WIP v2: fix C++ style comment Rhys Perry (41): radv: bitcast 16-bit outputs to integers radv: ensure export arguments are always float ac: add various helpers for float16/int16/int8 ac/nir: implement 8-bit push constant, ssbo and ubo loads ac/nir: implement 8-bit ssbo stores ac/nir: fix 16-bit ssbo stores ac/nir: implement 8-bit nir_load_const_instr ac/nir: implement 8-bit conversions ac/nir: fix 64-bit nir_op_f2f16_rtz ac/nir: make ac_build_clamp work on all bit sizes ac/nir: make ac_build_fract work on all bit sizes ac/nir: make ac_build_isign work on all bit sizes ac/nir: make ac_build_fsign work on all bit sizes ac/nir: make ac_build_fdiv support 16-bit floats ac/nir: implement half-float nir_op_frcp ac/nir: implement half-float nir_op_frsq ac/nir: implement half-float nir_op_ldexp radv: lower 16-bit flrp ac/nir: support half floats in emit_b2f ac/nir: make emit_b2i work on all bit sizes ac/nir: implement 16-bit shifts compiler/nir: add lowering option for 16-bit ffma ac/nir: implement 16-bit ac_build_ddxy ac/nir: implement 8 and 16 bit ac_build_readlane nir: make bitfield_reverse and ifind_msb work with all integers ac/nir: make ac_find_lsb work on all bit sizes ac/nir: make ac_build_umsb work on all bit sizes ac/nir: implement 8 and 16 bit ac_build_imsb ac/nir: make ac_build_bit_count work on all bit sizes ac/nir: make ac_build_bitfield_reverse work on all bit sizes ac/nir: implement 16-bit pack/unpack opcodes ac/nir: add 8-bit types to 
glsl_base_to_llvm_type ac/nir,radv: create an array of varying output types ac/nir: store all outputs as f32 radv: store all fragment shader inputs as f32 radv: handle all fragment output types WIP: radv,ac: implement 16-bit interpolation WIP: ac,radv: run LLVM's SLP vectorizer ac/nir: generate better code for nir_op_f2f16_rtz ac/nir: have nir_op_f2f16 round to zero radv,docs: expose float16, int16 and int8 features and extensions docs/features.txt| 2 +- src/amd/common/ac_llvm_build.c | 325 +++ src/amd/common/ac_llvm_build.h | 18 +- src/amd/common/ac_llvm_util.c| 8 +- src/amd/common/ac_nir_to_llvm.c | 268 +++ src/amd/common/ac_shader_abi.h | 1 + src/amd/vulkan/radv_device.c | 17 ++ src/amd/vulkan/radv_extensions.py| 4 + src/amd/vulkan/radv_nir_to_llvm.c| 123 + src/amd/vulkan/radv_pipeline.c | 19 +- src/amd/vulkan/radv_shader.c | 4 +
[Mesa-dev] [PATCH v2 04/41] ac/nir: implement 8-bit push constant, ssbo and ubo loads
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 37 +++-- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bed52490bad..17d952d1ae8 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1399,7 +1399,30 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr); - if (instr->dest.ssa.bit_size == 16) { + if (instr->dest.ssa.bit_size == 8) { + unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + + LLVMValueRef params[3]; + if (load_dwords > 1) { + LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), ""); + params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), ""); + params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), ""); + } else { + res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); + params[0] = ctx->ac.i32_0; + params[1] = res; + } + params[2] = addr; + res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); + + res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); + if (instr->dest.ssa.num_components > 1) + res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), ""); + return res; + } else if (instr->dest.ssa.bit_size == 16) { unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords); ptr = ac_cast_ptr(&ctx->ac, ptr, 
vec_type); @@ -1676,7 +1699,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); LLVMValueRef ret; - if (load_bytes == 2) { + if (load_bytes <= 2) { ret = ac_build_tbuffer_load_short_byte(&ctx->ac, rsrc, vindex, @@ -1684,7 +1707,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, ctx->ac.i32_0, immoffset, glc, - 2); + load_bytes); } else { const char *load_name; LLVMTypeRef data_type; @@ -1700,6 +1723,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, data_type = ctx->ac.v2f32; break; case 4: + case 3: load_name = "llvm.amdgcn.buffer.load.f32"; data_type = ctx->ac.f32; break; @@ -1746,7 +1770,8 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, if (instr->dest.ssa.bit_size == 64) num_components *= 2; - if (instr->dest.ssa.bit_size == 16) { + if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { + unsigned size = instr->dest.ssa.bit_size / 8; LLVMValueRef results[num_components]; for (unsigned i = 0; i < num_components; ++i) { results[i] = ac_build_tbuffer_load_short_byte(&ctx->ac, @@ -1754,9 +1779,9 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, ctx->ac.i32_0,
[Mesa-dev] [PATCH v2 02/41] radv: ensure export arguments are always float
So that the signature is correct and consistent, the inputs to an export intrinsic should always be 32-bit floats. This and the previous commit fix a large number of crashes from dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_* tests. Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index a8268c44ecf..d3795eec403 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2429,12 +2429,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, } else memcpy(&args->out[0], values, sizeof(values[0]) * 4); - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; - + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 07/41] ac/nir: implement 8-bit nir_load_const_instr
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 1 file changed, 4 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index b260142c177..f39232b91a1 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1114,6 +1114,10 @@ static void visit_load_const(struct ac_nir_context *ctx, for (unsigned i = 0; i < instr->def.num_components; ++i) { switch (instr->def.bit_size) { + case 8: + values[i] = LLVMConstInt(element_type, +instr->value.u8[i], false); + break; case 16: values[i] = LLVMConstInt(element_type, instr->value.u16[i], false); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 03/41] ac: add various helpers for float16/int16/int8
v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() v2: remove ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 55 ++--- src/amd/common/ac_llvm_build.h | 15 +++-- src/amd/common/ac_nir_to_llvm.c | 30 +- 3 files changed, 79 insertions(+), 21 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 9395bd1bbda..b53d9c7ff8c 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); @@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type) static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) { - if (t == ctx->f16 || t == ctx->i16) + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->f16 || t == ctx->i16) return ctx->i16; else if (t == ctx->f32 || t == ctx->i32) return ctx->i32; @@ -281,6 +287,42 @@ ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); } +LLVMTypeRef ac_float_of_size(struct ac_llvm_context *ctx, unsigned bit_size) +{ + switch (bit_size) { + case 16: + return ctx->f16; + case 32: + return ctx->f32; + case 64: + return ctx->f64; + default: + unreachable("Unhandled bit size"); + } +} + +LLVMValueRef 
ac_build_ui_cast(struct ac_llvm_context *ctx, LLVMValueRef v, LLVMTypeRef t) +{ + unsigned new_bit_size = ac_get_elem_bits(ctx, t); + unsigned old_bit_size = ac_get_elem_bits(ctx, LLVMTypeOf(v)); + if (new_bit_size > old_bit_size) + return LLVMBuildZExt(ctx->builder, v, t, ""); + else if (new_bit_size < old_bit_size) + return LLVMBuildTrunc(ctx->builder, v, t, ""); + else + return v; +} + +LLVMValueRef ac_build_reinterpret(struct ac_llvm_context *ctx, LLVMValueRef v, LLVMTypeRef t) +{ + if (LLVMTypeOf(v) == t) + return v; + + v = ac_to_integer(ctx, v); + v = ac_build_ui_cast(ctx, v, ac_to_integer_type(ctx, t)); + return LLVMBuildBitCast(ctx->builder, v, t, ""); +} + LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, @@ -1338,15 +1380,18 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx, } LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, +ac_build_tbuffer_load_short_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset, - LLVMValueRef glc) + LLVMValueRef glc, + unsigned size) { + assert(size == 1 || size == 2); const char *name = "llvm.amdgcn.tbuffer.load.i32"; + int data_format = size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : V_008F0C_BUF_DATA_FORMAT_16; LLVMTypeRef type = ctx->i32; LLVMValueRef params[] = { rsrc, @@ -1354,13 +1399,13 @@ ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, voffset, soffset, immoffset, - LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false), + LLVMConstInt(ctx->i32, data_format, false), LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false), glc, ctx->i1false, }; LLVMValueRef res = ac_build
[Mesa-dev] [PATCH v2 09/41] ac/nir: fix 64-bit nir_op_f2f16_rtz
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 691d444db05..741059b5f1a 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -886,6 +886,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_f2f16_rtz: src[0] = ac_to_float(&ctx->ac, src[0]); + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 05/41] ac/nir: implement 8-bit ssbo stores
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 17d952d1ae8..89a78b43c6f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1524,7 +1524,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, get_src(ctx, instr->src[1]), true); - LLVMValueRef base_data = ac_to_float(&ctx->ac, src_data); + LLVMValueRef base_data = src_data; base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); LLVMValueRef base_offset = get_src(ctx, instr->src[2]); @@ -1565,7 +1565,25 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, offset = LLVMBuildAdd(ctx->ac.builder, base_offset, LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); } - if (num_bytes == 2) { + if (num_bytes == 1) { + store_name = "llvm.amdgcn.tbuffer.store.i32"; + data_type = ctx->ac.i32; + data = LLVMBuildZExt(ctx->ac.builder, data, data_type, ""); + LLVMValueRef tbuffer_params[] = { + data, + rsrc, + ctx->ac.i32_0, /* vindex */ + offset,/* voffset */ + ctx->ac.i32_0, + ctx->ac.i32_0, + LLVMConstInt(ctx->ac.i32, 1, false), // dfmt (= 8bit) + LLVMConstInt(ctx->ac.i32, 4, false), // nfmt (= uint) + glc, + ctx->ac.i1false, + }; + ac_build_intrinsic(&ctx->ac, store_name, + ctx->ac.voidt, tbuffer_params, 10, 0); + } else if (num_bytes == 2) { store_name = "llvm.amdgcn.tbuffer.store.i32"; data_type = ctx->ac.i32; LLVMValueRef tbuffer_params[] = { -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 13/41] ac/nir: make ac_build_fsign work on all bit sizes
v2: don't use ac_get_zerof() and ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 16 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 3b2257e8bf0..23e454385d7 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2079,19 +2079,11 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val, zero, one; - LLVMTypeRef type; - - if (bitsize == 32) { - type = ctx->f32; - zero = ctx->f32_0; - one = ctx->f32_1; - } else { - type = ctx->f64; - zero = ctx->f64_0; - one = ctx->f64_1; - } + LLVMTypeRef type = ac_float_of_size(ctx, bitsize); + LLVMValueRef zero = LLVMConstReal(type, 0.0); + LLVMValueRef one = LLVMConstReal(type, 1.0); + LLVMValueRef cmp, val; cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 20/41] ac/nir: make emit_b2i work on all bit sizes
v2: don't use ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index e459001c1cf..75bb19031bf 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -347,11 +347,7 @@ static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, unsigned bitsize) { LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); - - if (bitsize == 32) - return result; - - return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); + return ac_build_ui_cast(ctx, result, LLVMIntTypeInContext(ctx->context, bitsize)); } static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 08/41] ac/nir: implement 8-bit conversions
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index f39232b91a1..691d444db05 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -858,12 +858,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) src[i] = ac_to_integer(&ctx->ac, src[i]); result = ac_build_gather_values(&ctx->ac, src, num_components); break; + case nir_op_f2i8: case nir_op_f2i16: case nir_op_f2i32: case nir_op_f2i64: src[0] = ac_to_float(&ctx->ac, src[0]); result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); break; + case nir_op_f2u8: case nir_op_f2u16: case nir_op_f2u32: case nir_op_f2u64: @@ -898,15 +900,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) else result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; + case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: src[0] = ac_to_integer(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + result = ac_build_ui_cast(&ctx->ac, src[0], def_type); break; + case nir_op_i2i8: case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 29/41] ac/nir: make ac_build_bit_count work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +++-- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index c986f800fa4..46738faea9d 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2085,35 +2085,16 @@ LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - unsigned bitsize; + unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + char name[64]; + snprintf(name, sizeof(name), "llvm.ctpop.i%d", bitsize); - switch (bitsize) { - case 64: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMValueRef result = ac_build_intrinsic(ctx, name, LLVMTypeOf(src0), +(LLVMValueRef []) { src0 }, 1, +AC_FUNC_ATTR_READNONE); - return result; + return ac_build_ui_cast(ctx, result, ctx->i32); } LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 36/41] radv: handle all fragment output types
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 55 --- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 01b8b097ea1..c46eabf3656 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2297,9 +2297,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (!values) return; - bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { - bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; @@ -2310,6 +2308,28 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, bool hi) = NULL; + if (LLVMTypeOf(values[0]) == ctx->ac.f16 && + col_format != V_028714_SPI_SHADER_FP16_ABGR) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildFPExt(ctx->ac.builder, + values[chan], + ctx->ac.f32, ""); + } + + if (LLVMTypeOf(values[0]) == ctx->ac.i16 || LLVMTypeOf(values[0]) == ctx->ac.i8) { + if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildSExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } else { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildZExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } + } + switch(col_format) { case V_028714_SPI_SHADER_ZERO: args->enabled_channels = 0; /* writemask */ @@ -2335,12 +2355,16 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_FP16_ABGR: args->enabled_channels = 0x5; - packf = ac_build_cvt_pkrtz_f16; - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = 
LLVMBuildFPExt(ctx->ac.builder, - values[chan], - ctx->ac.f32, ""); + if (LLVMTypeOf(values[0]) == ctx->ac.f16) { + packi = ac_build_cvt_pk_u16; + for (unsigned chan = 0; chan < 4; chan++) { + values[chan] = ac_to_integer(&ctx->ac, values[chan]); + values[chan] = LLVMBuildZExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } + } else { + packf = ac_build_cvt_pkrtz_f16; } break; @@ -2357,23 +2381,11 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_UINT16_ABGR: args->enabled_channels = 0x5; packi = ac_build_cvt_pk_u16; - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = LLVMBuildZExt(ctx->ac.builder, - ac_to_integer(&ctx->ac, values[chan]), - ctx->ac.i32, ""); - } break; case V_028714_SPI_SHADER_SIN
[Mesa-dev] [PATCH v2 22/41] compiler/nir: add lowering option for 16-bit ffma
The lowering needs to be disabled for sufficient precision to pass deqp-vk's 16-bit fma test on radv. Signed-off-by: Rhys Perry --- src/broadcom/compiler/nir_to_vir.c| 1 + src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 4 +++- src/gallium/drivers/radeonsi/si_get.c | 1 + src/gallium/drivers/vc4/vc4_program.c | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d983f91e718..6c0a623096a 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -2471,6 +2471,7 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_fdiv = true, .lower_find_lsb = true, .lower_ffma = true, +.lower_ffma16 = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 740c64d2a94..8df275f4aa3 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2111,6 +2111,7 @@ typedef struct nir_function { typedef struct nir_shader_compiler_options { bool lower_fdiv; + bool lower_ffma16; bool lower_ffma; bool fuse_ffma; bool lower_flrp16; diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 71c626e1b3f..63dff878d35 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -136,7 +136,9 @@ optimizations = [ (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a, ('bcsel', c, b, a), 'options->lower_flrp32'), (('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', a, b, c), '!options->lower_flrp32'), (('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', a, b, c), '!options->lower_flrp64'), - (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), + (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'), + (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), + (('ffma@64', a, b, c), ('fadd', ('fmul', a, 
b), c), 'options->lower_ffma'), (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'), (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)), diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index f8ca02d4fcf..5bf107ef6fe 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -491,6 +491,7 @@ static const struct nir_shader_compiler_options nir_options = { .lower_fdiv = true, .lower_sub = true, .lower_ffma = true, + .lower_ffma16 = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, .lower_pack_unorm_2x16 = true, diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 2d0a52bb5fb..8be258cbba4 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2234,6 +2234,7 @@ static const nir_shader_compiler_options nir_options = { .lower_extract_word = true, .lower_fdiv = true, .lower_ffma = true, +.lower_ffma16 = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 27/41] ac/nir: make ac_build_umsb work on all bit sizes
v2: don't use ac_get_zero() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 38 +++--- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 61085db9320..ec87a7b9343 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1555,36 +1555,12 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) { - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef highest_bit; - LLVMValueRef zero; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); - switch (bitsize) { - case 64: - intrin_name = "llvm.ctlz.i64"; - type = ctx->i64; - highest_bit = LLVMConstInt(ctx->i64, 63, false); - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.ctlz.i32"; - type = ctx->i32; - highest_bit = LLVMConstInt(ctx->i32, 31, false); - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.ctlz.i16"; - type = ctx->i16; - highest_bit = LLVMConstInt(ctx->i16, 15, false); - zero = ctx->i16_0; - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMTypeRef type = LLVMTypeOf(arg); + unsigned bitsize = ac_get_elem_bits(ctx, type); + LLVMValueRef highest_bit = LLVMConstInt(type, bitsize - 1, false); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + char intrin_name[64]; + snprintf(intrin_name, sizeof(intrin_name), "llvm.ctlz.i%d", bitsize); LLVMValueRef params[2] = { arg, @@ -1598,7 +1574,7 @@ ac_build_umsb(struct ac_llvm_context *ctx, /* The HW returns the last bit index from MSB, but TGSI/NIR wants * the index from LSB. Invert it by doing "31 - msb". 
*/ msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); - msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, ""); + msb = ac_build_ui_cast(ctx, msb, dst_type); /* check for zero */ return LLVMBuildSelect(ctx->builder, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 24/41] ac/nir: implement 8 and 16 bit ac_build_readlane
v2: don't use ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 71eaac4b7bd..aa92c55c822 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2868,9 +2868,15 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la { LLVMTypeRef src_type = LLVMTypeOf(src); src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + unsigned src_bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + unsigned bits = src_bits; LLVMValueRef ret; + if (bits < 32) { + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + bits = 32; + } + if (bits == 32) { ret = _ac_build_readlane(ctx, src, lane); } else { @@ -2887,6 +2893,10 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la LLVMConstInt(ctx->i32, i, 0), ""); } } + + if (src_bits < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, LLVMIntTypeInContext(ctx->context, src_bits), ""); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 30/41] ac/nir: make ac_build_bitfield_reverse work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 26 ++ 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 46738faea9d..dff369aae7f 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2100,28 +2100,14 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - switch (bitsize) { - case 32: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - default: - unreachable(!"invalid bitsize"); - break; - } + char name[64]; + snprintf(name, sizeof(name), "llvm.bitreverse.i%d", bitsize); - return result; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(src0), + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); } #define AC_EXP_TARGET 0 -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 23/41] ac/nir: implement 16-bit ac_build_ddxy
v2: rebase Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index fb871a47400..71eaac4b7bd 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1481,6 +1481,11 @@ ac_build_ddxy(struct ac_llvm_context *ctx, LLVMValueRef tl, trbl; LLVMValueRef result; + int size = ac_get_type_size(LLVMTypeOf(val)); + + if (size == 2) + val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + for (unsigned i = 0; i < 4; ++i) { tl_lanes[i] = i & mask; trbl_lanes[i] = (i & mask) + idx; @@ -1493,12 +1498,19 @@ ac_build_ddxy(struct ac_llvm_context *ctx, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]); - tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); - trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, ""); + if (size == 2) { + tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); + trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); + } + + LLVMTypeRef type = ac_float_of_size(ctx, size * 8); + tl = LLVMBuildBitCast(ctx->builder, tl, type, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, type, ""); result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); - result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, - &result, 1, 0); + result = ac_build_intrinsic(ctx, + LLVMTypeOf(val) == ctx->f32 ? "llvm.amdgcn.wqm.f32" : "llvm.amdgcn.wqm.f16", type, + &result, 1, 0); return result; } -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 17/41] ac/nir: implement half-float nir_op_ldexp
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 8b0e07d2930..0e5946dfdb3 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -829,8 +829,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_ldexp: src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 32) + if (ac_get_elem_bits(&ctx->ac, def_type) == 32) result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); + else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); else result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); break; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 26/41] ac/nir: make ac_find_lsb work on all bit sizes
v2: don't use ac_get_zero() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 ++--- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index aa92c55c822..61085db9320 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2474,30 +2474,11 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0) { - unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef zero; - - switch (src0_bitsize) { - case 64: - intrin_name = "llvm.cttz.i64"; - type = ctx->i64; - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.cttz.i32"; - type = ctx->i32; - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.cttz.i16"; - type = ctx->i16; - zero = ctx->i16_0; - break; - default: - unreachable(!"invalid bitsize"); - } + LLVMTypeRef type = LLVMTypeOf(src0); + unsigned src0_bitsize = ac_get_elem_bits(ctx, type); + char intrin_name[64]; + LLVMValueRef zero = LLVMConstInt(type, 0, false); + snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", src0_bitsize); LLVMValueRef params[2] = { src0, @@ -2518,9 +2499,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, params, 2, AC_FUNC_ATTR_READNONE); - if (src0_bitsize == 64) { - lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); - } + lsb = ac_build_ui_cast(ctx, lsb, ctx->i32); /* TODO: We need an intrinsic to skip this conditional. */ /* Check for zero: */ -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 28/41] ac/nir: implement 8 and 16 bit ac_build_imsb
v2: fix C++ style comment Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 4 1 file changed, 4 insertions(+) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index ec87a7b9343..c986f800fa4 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1531,6 +1531,10 @@ ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) { + /* TODO: support 64-bit integers */ + if (LLVMTypeOf(arg) != ctx->i32) + arg = LLVMBuildSExt(ctx->builder, arg, ctx->i32, ""); + LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 10/41] ac/nir: make ac_build_clamp work on all bit sizes
v2: don't use ac_get_zerof() and ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index b53d9c7ff8c..667f9700764 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1597,16 +1597,20 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2, + return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2, + return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } @@ -1633,8 +1637,9 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) { - return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0), -ctx->f32_1); + LLVMTypeRef t = LLVMTypeOf(value); + return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), +LLVMConstReal(t, 1.0)); } void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 14/41] ac/nir: make ac_build_fdiv support 16-bit floats
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 23e454385d7..fb871a47400 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -661,7 +661,7 @@ ac_build_fdiv(struct ac_llvm_context *ctx, * If we do (num * (1 / den)), LLVM does: *return num * v_rcp_f32(den); */ - LLVMValueRef one = LLVMTypeOf(num) == ctx->f64 ? ctx->f64_1 : ctx->f32_1; + LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 21/41] ac/nir: implement 16-bit shifts
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 75bb19031bf..bad1c2a990e 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -672,20 +672,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_ishl: result = LLVMBuildShl(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ishr: result = LLVMBuildAShr(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], -LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ushr: result = LLVMBuildLShr(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], -LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ilt32: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 11/41] ac/nir: make ac_build_fract work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 667f9700764..db937eb66fb 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2049,16 +2049,9 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16) LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMTypeRef type; - char *intr; - - if (bitsize == 32) { - intr = "llvm.floor.f32"; - type = ctx->f32; - } else { - intr = "llvm.floor.f64"; - type = ctx->f64; - } + LLVMTypeRef type = ac_float_of_size(ctx, bitsize); + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.floor.f%d", bitsize); LLVMValueRef params[] = { src0, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 16/41] ac/nir: implement half-float nir_op_frsq
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index cba0cec3e8f..8b0e07d2930 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -788,8 +788,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_frsq: result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]); - result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, - result); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result); break; case nir_op_frexp_exp: src[0] = ac_to_float(&ctx->ac, src[0]); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 15/41] ac/nir: implement half-float nir_op_frcp
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 741059b5f1a..cba0cec3e8f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -657,8 +657,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_frcp: src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, - src[0]); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]); break; case nir_op_iand: result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 18/41] radv: lower 16-bit flrp
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_shader.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 1dcb0606246..adba730ad8b 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -53,6 +53,7 @@ static const struct nir_shader_compiler_options nir_options = { .vertex_id_zero_based = true, .lower_scmp = true, + .lower_flrp16 = true, .lower_flrp32 = true, .lower_flrp64 = true, .lower_device_index_to_zero = true, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 12/41] ac/nir: make ac_build_isign work on all bit sizes
v2: don't use ac_get_zero(), ac_get_one() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 27 --- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index db937eb66fb..3b2257e8bf0 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2064,30 +2064,11 @@ LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val, zero, one; - LLVMTypeRef type; - - switch (bitsize) { - case 64: - type = ctx->i64; - zero = ctx->i64_0; - one = ctx->i64_1; - break; - case 32: - type = ctx->i32; - zero = ctx->i32_0; - one = ctx->i32_1; - break; - case 16: - type = ctx->i16; - zero = ctx->i16_0; - one = ctx->i16_1; - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + LLVMValueRef one = LLVMConstInt(type, 1, false); + LLVMValueRef cmp, val; cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 19/41] ac/nir: support half floats in emit_b2f
This seems to generate fine code, even though the IR is a bit ugly. Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 0e5946dfdb3..e459001c1cf 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -316,14 +316,20 @@ static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, unsigned bitsize) { LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, - LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), + LLVMBuildBitCast(ctx->builder, ctx->f32_1, ctx->i32, ""), ""); result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); - if (bitsize == 32) + switch (bitsize) { + case 16: + return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, ""); + case 32: return result; - - return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + case 64: + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + default: + unreachable("Unsupported bit size."); + } } static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 40/41] ac/nir: have nir_op_f2f16 round to zero
This is done in the hope that LLVM will one day be able to generate code with vectorized v_cvt_pkrtz_f16_f32 instructions. Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 8bfc63958ca..7a5e95506f2 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; case nir_op_f2f16_rtz: + case nir_op_f2f16: src[0] = ac_to_float(&ctx->ac, src[0]); if (LLVMTypeOf(src[0]) == ctx->ac.f64) src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); @@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, ""); break; case nir_op_f2f16_rtne: - case nir_op_f2f16: case nir_op_f2f32: case nir_op_f2f64: src[0] = ac_to_float(&ctx->ac, src[0]); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation
v2: add to patch series Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +--- src/amd/common/ac_llvm_build.h | 3 ++- src/amd/common/ac_nir_to_llvm.c | 14 +++--- src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- src/amd/vulkan/radv_pipeline.c | 19 -- src/amd/vulkan/radv_shader.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 2 +- 7 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index dff369aae7f..be2c2251a21 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + int word) { - LLVMValueRef args[5]; + LLVMValueRef args[6]; LLVMValueRef p1; args[0] = i; args[1] = llvm_chan; args[2] = attr_number; - args[3] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[3] = LLVMConstInt(ctx->i1, word, false); + args[4] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f16, args, 5, AC_FUNC_ATTR_READNONE); + } else { + args[3] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + } args[0] = p1; args[1] = j; args[2] = llvm_chan; args[3] = attr_number; - args[4] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[4] = LLVMConstInt(ctx->i1, word, false); + args[5] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); + } else { + args[4] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + } } LLVMValueRef diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 61c9b5e4b6c..655427567c4 100644 --- 
a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + int word); LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bf7024c68e4..939b8eb13de 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef j = LLVMBuildExtractElement( ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + /* This fp16 handling isn't technically correct +* but should be correct for the attributes we +* are actually going to use. */ + bool fp16 = instr->dest.ssa.bit_size == 16; + int word = fp16 ? 0 : -1; v = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); + ctx->abi->prim_mask, i, j, word); + if (fp16) + v = ac_build_reinterpret(&ctx->ac, v, ctx->ac.f32); } else { v = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false), llvm_chan, attr_number, ctx->abi->prim_mask); @@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, attrib_idx, ""); } - retu
[Mesa-dev] [PATCH v2 38/41] WIP: ac, radv: run LLVM's SLP vectorizer
v2: rebase v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass v2: run unconditionally on GFX9 and later v2: mark as WIP because it can make 32-bit code much worse Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_util.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c index 69446863b95..8d78b5a850b 100644 --- a/src/amd/common/ac_llvm_util.c +++ b/src/amd/common/ac_llvm_util.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "c11/threads.h" #include "gallivm/lp_bld_misc.h" #include "util/u_math.h" @@ -175,7 +176,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, } static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, - bool check_ir) + bool check_ir, enum radeon_family family) { LLVMPassManagerRef passmgr = LLVMCreatePassManager(); if (!passmgr) @@ -203,6 +204,9 @@ static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_libr LLVMAddCFGSimplificationPass(passmgr); /* This is recommended by the instruction combining pass. */ LLVMAddEarlyCSEMemSSAPass(passmgr); + /* vectorization is disabled on pre-GFX9 because it's not very useful there */ + if (family >= CHIP_VEGA10) + LLVMAddSLPVectorizePass(passmgr); LLVMAddInstructionCombiningPass(passmgr); return passmgr; } @@ -327,7 +331,7 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, goto fail; compiler->passmgr = ac_create_passmgr(compiler->target_library_info, - tm_options & AC_TM_CHECK_IR); + tm_options & AC_TM_CHECK_IR, family); if (!compiler->passmgr) goto fail; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 39/41] ac/nir: generate better code for nir_op_f2f16_rtz
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 939b8eb13de..8bfc63958ca 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -889,7 +889,9 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + // generates better code than an extractelement with slp vectorization + result = LLVMBuildBitCast(ctx->ac.builder, result, ctx->ac.i32, ""); + result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, ""); break; case nir_op_f2f16_rtne: case nir_op_f2f16: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 41/41] radv, docs: expose float16, int16 and int8 features and extensions
v2: rebase v2: mark VK_KHR_8bit_storage as DONE in features.txt Signed-off-by: Rhys Perry --- docs/features.txt | 2 +- src/amd/vulkan/radv_device.c | 17 + src/amd/vulkan/radv_extensions.py | 4 src/amd/vulkan/radv_shader.c | 3 +++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/features.txt b/docs/features.txt index 6c2b6d59377..ded753b0182 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -439,7 +439,7 @@ Vulkan 1.1 -- all DONE: anv, radv VK_KHR_variable_pointers DONE (anv, radv) Khronos extensions that are not part of any Vulkan version: - VK_KHR_8bit_storage DONE (anv) + VK_KHR_8bit_storage DONE (anv, radv) VK_KHR_android_surfacenot started VK_KHR_create_renderpass2 DONE (anv, radv) VK_KHR_displayDONE (anv, radv) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 0fef92773e1..4137b778466 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -877,6 +877,23 @@ void radv_GetPhysicalDeviceFeatures2( features->bufferDeviceAddressMultiDevice = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceFloat16Int8FeaturesKHR *features = + (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->shaderFloat16 = enabled && HAVE_LLVM >= 0x0800; + features->shaderInt8 = enabled; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { + VkPhysicalDevice8BitStorageFeaturesKHR *features = + (VkPhysicalDevice8BitStorageFeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->storageBuffer8BitAccess = enabled; + features->uniformAndStorageBuffer8BitAccess = enabled; + features->storagePushConstant8 = enabled; + break; + } default: break; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index f218598f123..e38cfcfdcbe 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -91,6 
+91,8 @@ EXTENSIONS = [ Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'), Extension('VK_KHR_multiview', 1, True), Extension('VK_KHR_display', 23, 'VK_USE_PLATFORM_DISPLAY_KHR'), +Extension('VK_KHR_shader_float16_int8', 1, 'device->rad_info.chip_class >= VI'), +Extension('VK_KHR_8bit_storage', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_EXT_direct_mode_display', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_acquire_xlib_display', 1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'), Extension('VK_EXT_buffer_device_address', 1, True), @@ -121,6 +123,8 @@ EXTENSIONS = [ Extension('VK_AMD_shader_core_properties',1, True), Extension('VK_AMD_shader_info', 1, True), Extension('VK_AMD_shader_trinary_minmax', 1, True), +Extension('VK_AMD_gpu_shader_half_float', 1, 'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'), +Extension('VK_AMD_gpu_shader_int16', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_GOOGLE_decorate_string',1, True), Extension('VK_GOOGLE_hlsl_functionality1',1, True), ] diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index adba730ad8b..44dea8e7203 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -249,6 +249,9 @@ radv_shader_compile_to_nir(struct radv_device *device, .transform_feedback = true, .trinary_minmax = true, .variable_pointers = true, + .float16 = true, + .storage_8bit = true, + .int8 =
[Mesa-dev] [PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation
v2: add to patch series Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +--- src/amd/common/ac_llvm_build.h | 3 ++- src/amd/common/ac_nir_to_llvm.c | 14 +++--- src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- src/amd/vulkan/radv_pipeline.c | 19 -- src/amd/vulkan/radv_shader.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 2 +- 7 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index dff369aae7f..be2c2251a21 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + int word) { - LLVMValueRef args[5]; + LLVMValueRef args[6]; LLVMValueRef p1; args[0] = i; args[1] = llvm_chan; args[2] = attr_number; - args[3] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[3] = LLVMConstInt(ctx->i1, word, false); + args[4] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f16, args, 5, AC_FUNC_ATTR_READNONE); + } else { + args[3] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + } args[0] = p1; args[1] = j; args[2] = llvm_chan; args[3] = attr_number; - args[4] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[4] = LLVMConstInt(ctx->i1, word, false); + args[5] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); + } else { + args[4] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + } } LLVMValueRef diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 61c9b5e4b6c..655427567c4 100644 --- 
a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + int word); LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bf7024c68e4..939b8eb13de 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef j = LLVMBuildExtractElement( ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + /* This fp16 handling isn't technically correct +* but should be correct for the attributes we +* are actually going to use. */ + bool fp16 = instr->dest.ssa.bit_size == 16; + int word = fp16 ? 0 : -1; v = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); + ctx->abi->prim_mask, i, j, word); + if (fp16) + v = ac_build_reinterpret(&ctx->ac, v, ctx->ac.f32); } else { v = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false), llvm_chan, attr_number, ctx->abi->prim_mask); @@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, attrib_idx, ""); } - retu
[Mesa-dev] [PATCH v2 34/41] ac/nir: store all outputs as f32
v2: rebase v2: fix 64-bit visit_load_var() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 14 ++ src/amd/vulkan/radv_nir_to_llvm.c | 22 +- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 5821c18aeb1..bf7024c68e4 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -2114,7 +2114,10 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, unreachable("unhandle variable mode"); } ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + if (instr->dest.ssa.bit_size == 16) + return ac_build_reinterpret(&ctx->ac, ret, get_def_type(ctx, &instr->dest.ssa)); + else + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } static void @@ -2152,6 +2155,11 @@ visit_store_var(struct ac_nir_context *ctx, writemask = writemask << comp; + LLVMTypeRef type = ctx->ac.f32; + if (LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMVectorTypeKind) + type = LLVMVectorType(ctx->ac.f32, LLVMGetVectorSize(LLVMTypeOf(src))); + src = ac_build_reinterpret(&ctx->ac, src, type); + switch (deref->mode) { case nir_var_shader_out: @@ -4329,12 +4337,10 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, } } - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? 
ctx->f16 : ctx->f32; for (unsigned i = 0; i < attrib_count; ++i) { for (unsigned chan = 0; chan < 4; chan++) { abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] = - ac_build_alloca_undef(ctx, type, ""); + ac_build_alloca_undef(ctx, ctx->f32, ""); } } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 8fdaee72036..2002a744545 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2305,6 +2305,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { + bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; @@ -2421,16 +2422,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, return; } - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) { - values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, ""); - args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, ""); - } - } else - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - - for (unsigned i = 0; i < 4; ++i) - args->out[i] = ac_to_float(&ctx->ac, args->out[i]); + for (unsigned chan = 0; chan < 4; chan++) + args->out[chan] = ac_build_reinterpret(&ctx->ac, values[chan], ctx->ac.f32); } static void @@ -3137,9 +3130,12 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) if (i < FRAG_RESULT_DATA0) continue; - for (unsigned j = 0; j < 4; j++) - values[j] = ac_to_float(&ctx->ac, - radv_load_output(ctx, i, j)); + for (unsigned j = 0; j < 4; j++) { + values[j] = radv_load_output(ctx, i, j); + unsigned index = ac_llvm_reg_index_soa(i, 0); + LLVMTypeRef new_type = ctx->abi.output_types[index]; + values[j] = ac_build_reinterpret(&ctx->ac, values[j], new_type); + } bool ret = 
si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation
This patch can be ignored. I forgot to delete it and it ended up getting sent. "[PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation" is the correct one. On Sat, 16 Feb 2019 at 00:23, Rhys Perry wrote: > > v2: add to patch series > > Signed-off-by: Rhys Perry > --- > src/amd/common/ac_llvm_build.c | 33 +--- > src/amd/common/ac_llvm_build.h | 3 ++- > src/amd/common/ac_nir_to_llvm.c | 14 +++--- > src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- > src/amd/vulkan/radv_pipeline.c | 19 -- > src/amd/vulkan/radv_shader.h | 1 + > src/gallium/drivers/radeonsi/si_shader.c | 2 +- > 7 files changed, 69 insertions(+), 30 deletions(-) > > diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c > index dff369aae7f..be2c2251a21 100644 > --- a/src/amd/common/ac_llvm_build.c > +++ b/src/amd/common/ac_llvm_build.c > @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, >LLVMValueRef attr_number, >LLVMValueRef params, >LLVMValueRef i, > - LLVMValueRef j) > + LLVMValueRef j, > + int word) > { > - LLVMValueRef args[5]; > + LLVMValueRef args[6]; > LLVMValueRef p1; > > args[0] = i; > args[1] = llvm_chan; > args[2] = attr_number; > - args[3] = params; > - > - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", > - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); > + if (word >= 0) { > + args[3] = LLVMConstInt(ctx->i1, word, false); > + args[4] = params; > + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", > + ctx->f16, args, 5, > AC_FUNC_ATTR_READNONE); > + } else { > + args[3] = params; > + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", > + ctx->f32, args, 4, > AC_FUNC_ATTR_READNONE); > + } > > args[0] = p1; > args[1] = j; > args[2] = llvm_chan; > args[3] = attr_number; > - args[4] = params; > - > - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", > - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); > + if (word >= 0) { > + args[4] = LLVMConstInt(ctx->i1, word, false); > + args[5] = params; > + return ac_build_intrinsic(ctx, 
"llvm.amdgcn.interp.p2.f16", > + ctx->f16, args, 6, > AC_FUNC_ATTR_READNONE); > + } else { > + args[4] = params; > + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", > + ctx->f32, args, 5, > AC_FUNC_ATTR_READNONE); > + } > } > > LLVMValueRef > diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h > index 61c9b5e4b6c..655427567c4 100644 > --- a/src/amd/common/ac_llvm_build.h > +++ b/src/amd/common/ac_llvm_build.h > @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, >LLVMValueRef attr_number, >LLVMValueRef params, >LLVMValueRef i, > - LLVMValueRef j); > + LLVMValueRef j, > + int word); > > LLVMValueRef > ac_build_fs_interp_mov(struct ac_llvm_context *ctx, > diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c > index bf7024c68e4..939b8eb13de 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context > *ctx, > LLVMValueRef j = LLVMBuildExtractElement( > ctx->ac.builder, interp_param, > ctx->ac.i32_1, ""); > > + /* This fp16 handling isn't technically > correct > +* but should be correct for the attributes we > +* are actually going to use. */ > + bool fp16 = instr->dest.ssa.bit_size == 16; > + int word = fp16 ? 0 : -1; > v = ac_build_fs_interp(&ctx->ac, llvm_chan, > attr_number, > -
[Mesa-dev] [PATCH v2 35/41] radv: store all fragment shader inputs as f32
v2: rebase Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 14 -- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 2002a744545..01b8b097ea1 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2056,7 +2056,6 @@ static void interp_fs_input(struct radv_shader_context *ctx, LLVMValueRef attr_number; unsigned chan; LLVMValueRef i, j; - bool interp = !LLVMIsUndef(interp_param); attr_number = LLVMConstInt(ctx->ac.i32, attr, false); @@ -2070,7 +2069,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, * fs.interp cannot be used on integers, because they can be equal * to NaN. */ - if (interp) { + if (interp_param) { interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, ""); @@ -2083,7 +2082,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp_param) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2095,7 +2094,6 @@ static void interp_fs_input(struct radv_shader_context *ctx, attr_number, prim_mask); result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), ""); } } } @@ -2123,10 +2121,6 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? 
ctx->ac.i16 : ctx->ac.i32; - if (interp == NULL) - interp = LLVMGetUndef(type); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; @@ -2187,7 +2181,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (ctx->shader_info->info.ps.uses_input_attachments || ctx->shader_info->info.needs_multiview_view_index) { ctx->input_mask |= 1ull << VARYING_SLOT_LAYER; - ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = LLVMGetUndef(ctx->ac.i32); + ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = NULL; } for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { @@ -2203,7 +2197,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, inputs); - if (LLVMIsUndef(interp_param)) + if (!interp_param) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; if (i >= VARYING_SLOT_VAR0) ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 33/41] ac/nir, radv: create an array of varying output types
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 68 +++ src/amd/common/ac_shader_abi.h| 1 + src/amd/vulkan/radv_nir_to_llvm.c | 3 ++ 3 files changed, 72 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index defbfdf4297..5821c18aeb1 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -4238,6 +4238,68 @@ static void visit_cf_list(struct ac_nir_context *ctx, } } +static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool vs_in, +struct nir_variable *var, unsigned cur_offset, +const struct glsl_type *cur_type, +void (*cb)(struct ac_llvm_context *, unsigned, enum glsl_base_type, void *), +void *cbdata) +{ + if (glsl_type_is_struct(cur_type)) { + for (unsigned i = 0; i < glsl_get_length(cur_type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(cur_type, i); + cur_offset = traverse_var_component_slots(ctx, vs_in, var, cur_offset, ft, cb, cbdata); + } + return (cur_offset + 3) / 4 * 4; + } + + enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array_or_matrix(cur_type)); + + unsigned stride = glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); + if (!var->data.compact) + stride = (stride + 3) / 4 * 4; + unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1); + if (glsl_type_is_array(cur_type)) + arr_len *= glsl_get_aoa_size(cur_type); + for (unsigned i = 0; i < arr_len; i++) { + for (unsigned j = 0; j < glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) { + cb(ctx, cur_offset + var->data.location_frac + j, base_type, cbdata); + } + cur_offset += stride; + } + return cur_offset; +} + +static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, enum glsl_base_type base, void *output_types) +{ + LLVMTypeRef type; + switch (base) { + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + type = ctx->i8; + break; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + type = ctx->i16; + break; + 
case GLSL_TYPE_FLOAT16: + type = ctx->f16; + break; + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + type = ctx->i32; + break; + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: + default: + type = ctx->f32; + break; + } + ((LLVMTypeRef*)output_types)[index] = type; +} + void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi, @@ -4275,6 +4337,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, ac_build_alloca_undef(ctx, type, ""); } } + + traverse_var_component_slots(ctx, false, variable, output_loc * 4, +variable->type, &setup_output_type, abi->output_types); } static void @@ -4328,6 +4393,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++) + ctx.abi->output_types[i] = ac->i32; + nir_foreach_variable(variable, &nir->outputs) ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage); diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index ee18e6c1923..274deeb13a4 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -69,6 +69,7 @@ struct ac_shader_abi { LLVMValueRef view_index; LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; + LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4]; /* For VS and PS: pre-loaded shader inputs. * diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index d3795eec403..8fdaee72036 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -3910,6 +3910,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out; ac_setup_rings(&ctx); + for (unsigned i = 0; i < AC
[Mesa-dev] [PATCH v2 25/41] nir: make bitfield_reverse and ifind_msb work with all integers
Signed-off-by: Rhys Perry --- src/compiler/nir/nir_opcodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index dc4cd9ac63d..0f40bd6c548 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -350,7 +350,7 @@ unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32") # Bit operations, part of ARB_gpu_shader5. -unop("bitfield_reverse", tuint32, """ +unop("bitfield_reverse", tuint, """ /* we're not winning any awards for speed here, but that's ok */ dst = 0; for (unsigned bit = 0; bit < 32; bit++) @@ -374,7 +374,7 @@ for (int bit = bit_size - 1; bit >= 0; bit--) { } """) -unop("ifind_msb", tint32, """ +unop_convert("ifind_msb", tint32, tint, """ dst = -1; for (int bit = 31; bit >= 0; bit--) { /* If src0 < 0, we're looking for the first 0 bit. -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 32/41] ac/nir: add 8-bit types to glsl_base_to_llvm_type
v2: remove 16-bit additions and rebase Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index f6ad1aa7e77..defbfdf4297 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3969,6 +3969,9 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac, case GLSL_TYPE_BOOL: case GLSL_TYPE_SUBROUTINE: return ac->i32; + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; case GLSL_TYPE_INT16: case GLSL_TYPE_UINT16: return ac->i16; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 31/41] ac/nir: implement 16-bit pack/unpack opcodes
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 24 1 file changed, 24 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bad1c2a990e..f6ad1aa7e77 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1015,6 +1015,30 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_1, ""); + break; + } + case nir_op_cube_face_coord: { src[0] = ac_to_float(&ctx->ac, src[0]); LLVMValueRef results[2]; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores
I don't see a 16-bit version of tbuffer.store in IntrinsicsAMDGPU.td, and simply changing "llvm.amdgcn.tbuffer.store.i32" to "llvm.amdgcn.tbuffer.store.i16" and removing the zext doesn't seem to work. On Mon, 18 Feb 2019 at 08:55, Samuel Pitoiset wrote: > > Does this fix anything now? There is a 16-bit version of tbuffer.store, > maybe we should use it? > > On 2/16/19 1:21 AM, Rhys Perry wrote: > > Signed-off-by: Rhys Perry > > --- > > src/amd/common/ac_nir_to_llvm.c | 2 ++ > > 1 file changed, 2 insertions(+) > > > > diff --git a/src/amd/common/ac_nir_to_llvm.c > > b/src/amd/common/ac_nir_to_llvm.c > > index 89a78b43c6f..b260142c177 100644 > > --- a/src/amd/common/ac_nir_to_llvm.c > > +++ b/src/amd/common/ac_nir_to_llvm.c > > @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context > > *ctx, > > } else if (num_bytes == 2) { > > store_name = "llvm.amdgcn.tbuffer.store.i32"; > > data_type = ctx->ac.i32; > > + data = LLVMBuildBitCast(ctx->ac.builder, data, > > ctx->ac.i16, ""); > > + data = LLVMBuildZExt(ctx->ac.builder, data, > > data_type, ""); > > LLVMValueRef tbuffer_params[] = { > > data, > > rsrc, ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
The CTS is buggy because the input_output_float_64_to_16 tests are run even though they shouldn't be run because they try to use an unadvertised (and unimplemented) optional feature. Some of them crash for unrelated reasons though: load_tess_varyings() from ac_nir_to_llvm.c doesn't handle 64-bit varyings. So not all of them would work even if VK_FORMAT_R64_SFLOAT was an implemented vertex format. On Mon, 18 Feb 2019 at 08:53, Samuel Pitoiset wrote: > > > On 2/16/19 1:21 AM, Rhys Perry wrote: > > This series adds support for: > > - VK_KHR_shader_float16_int8 > > - VK_AMD_gpu_shader_half_float > > - VK_AMD_gpu_shader_int16 > > - VK_KHR_8bit_storage > > on VI+. Half floats are disabled on LLVM 7 because of a bug causing large > > memory usage and long (or unbounded) compilation times with some CTS > > tests. > > > > It is written against the following patch series: > > - https://patchwork.freedesktop.org/series/53454/ (v4) > > - https://patchwork.freedesktop.org/series/53660/ (v1) > > > > With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega > > and VI except for > > dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.* > > which fails or crashes because of unrelated radv bugs with 64-bit varyings > > and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even > > though radv does not support it. > > test bug? > > The two NIR related patches (22 and 25) should be sent separately, > otherwise people working on NIR might miss them. > > > > > With LLVM 9, there are no reproducible piglit regressions except for > > glsl-array-bounds-12.shader_test because of an LLVM bug when > > SLP vectorization is enabled. > > > > With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega > > and VI except for those with LLVM 9 and a couple of tests because of an > > LLVM bug after the SLP vectorizer and with the current lack of fallback > > for 16-bit interpolation on LLVM versions before LLVM 9.
> > > > With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega > > and VI except for those with LLVM 9 and a couple of tests because of an > > LLVM bug after the SLP vectorizer. > > > > The SLP vectorization patch is marked as WIP because it exposes LLVM bugs > > with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and > > some shader-db test for a game I can't remember. It also over-vectorizes > > 32-bit code which can cause significant worsening in generated code > > quality. > > > > The 16-bit interpolation patch is marked as WIP because it currently > > requires intrinsics only available in LLVM 9 and does not have a fallback. > > > > A branch on GitHub containing this series can be found at: > > https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2 > > > > v2: rebase > > v2: implement 16-bit interpolation > > v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass > > v2: run vectorization unconditionally on GFX9 and later > > v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() > > v2: remove ac_int_of_size() > > v2: fix 64-bit visit_load_var() > > v2: mark VK_KHR_8bit_storage as DONE in features.txt > > v2: mark SLP vectorization patch as WIP > > v2: fix C++ style comment > > > > Rhys Perry (41): > >radv: bitcast 16-bit outputs to integers > >radv: ensure export arguments are always float > >ac: add various helpers for float16/int16/int8 > >ac/nir: implement 8-bit push constant, ssbo and ubo loads > >ac/nir: implement 8-bit ssbo stores > >ac/nir: fix 16-bit ssbo stores > >ac/nir: implement 8-bit nir_load_const_instr > >ac/nir: implement 8-bit conversions > >ac/nir: fix 64-bit nir_op_f2f16_rtz > >ac/nir: make ac_build_clamp work on all bit sizes > >ac/nir: make ac_build_fract work on all bit sizes > >ac/nir: make ac_build_isign work on all bit sizes > >ac/nir: make ac_build_fsign work on all bit sizes > >ac/nir: make ac_build_fdiv support 16-bit floats > >ac/nir: implement 
half-float nir_op_frcp > >ac/nir: implement half-float nir_op_frsq > >ac/nir: implement half-float nir_op_ldexp > >radv: lower 16-bit flrp > >ac/nir: support half floats in emit_b2f > >ac/nir: make emit_b2i work on all bit sizes > >ac/nir: implement 16-bit shifts > >compiler/nir: add lowering option for 16-bit ffma > >ac/nir: implement 16-bit ac_build_ddxy > >ac/nir: implement 8 and 16 bit ac
[Mesa-dev] [PATCH] nv50/ir, nvc0: add debug options for shader replacement
Changes in v4: - Move code to nv50_ir_dump.cpp - Dump headers of nvc0 programs - Use CRC-32 instead of a truncated SHA1 - Set prog->maxGPR to targ->getFileSize() - 1 and set prog->tlsSize - Don't compile the program if a replacement is offered This has the consequence that a program is not dumped when it's replaced Changes in v3: - Fixed messed up patch description and diff - Use the checksum of the TGSI instead of the binary if possible Changes in v2: - move "#ifdef DEBUG" from above dumpProgram to above createDumpFilename The NV50_PROG_DUMP environment variable specifies an (already created) directory to dump shader binaries, headers and TGSI code. The NV50_PROG_REPLACE environment variable specifies an (already created) directory that is searched to find replacement binaries and headers. This is all much like MESA_SHADER_DUMP_PATH and MESA_SHADER_READ_PATH except using CRC-32 checksums instead of program IDs and chip-specific binaries instead of GLSL. Signed-off-by: Rhys Perry --- src/gallium/auxiliary/tgsi/tgsi_util.h | 1 + src/gallium/drivers/nouveau/Makefile.sources | 2 + src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 40 +++-- .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 + .../drivers/nouveau/codegen/nv50_ir_dump.cpp | 171 + src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h | 70 + src/gallium/drivers/nouveau/meson.build| 2 + src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 138 +++-- 8 files changed, 360 insertions(+), 65 deletions(-) create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.cpp create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h index 686b90f467..81cf955d8f 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.h +++ b/src/gallium/auxiliary/tgsi/tgsi_util.h @@ -28,6 +28,7 @@ #ifndef TGSI_UTIL_H #define TGSI_UTIL_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git 
a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 65f08c7d8d..e867221818 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -114,6 +114,8 @@ NV50_CODEGEN_SOURCES := \ codegen/nv50_ir_build_util.cpp \ codegen/nv50_ir_build_util.h \ codegen/nv50_ir_driver.h \ + codegen/nv50_ir_dump.cpp \ + codegen/nv50_ir_dump.h \ codegen/nv50_ir_emit_nv50.cpp \ codegen/nv50_ir_from_tgsi.cpp \ codegen/nv50_ir_graph.cpp \ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index c987da9908..b1782bb4f2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -23,6 +23,7 @@ #include "codegen/nv50_ir.h" #include "codegen/nv50_ir_target.h" #include "codegen/nv50_ir_driver.h" +#include "codegen/nv50_ir_dump.h" extern "C" { #include "nouveau_debug.h" @@ -1244,30 +1245,35 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) prog->print(); targ->parseDriverInfo(info); - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); - prog->convertToSSA(); + if (!nv50_ir::replaceProgramCode(prog)) { + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); - if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) - prog->print(); + prog->convertToSSA(); - prog->optimizeSSA(info->optLevel); - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); - if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) - prog->print(); + prog->optimizeSSA(info->optLevel); + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); - if (!prog->registerAllocation()) { - ret = -4; - goto out; - } - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + prog->print(); - prog->optimizePostRA(info->optLevel); + if (!prog->registerAllocation()) { + ret = 
-4; + goto out; + } + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); - if (!prog->emitBinary(info)) { - ret = -5; - goto out; + prog->optimizePostRA(info->optLevel); + + if (!prog->emitBinary(info)) { + ret = -5; + goto out; + } + + nv50_ir::dumpProgramCodeAndIR(prog); } out: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
Re: [Mesa-dev] [PATCH v2 4/5] nvc0: add support for programmable sample locations
EvaluateDepthValuesARB()/ResolveDepthValuesNV() is a hint for the driver to decompress the depth buffer if needed. This can be needed because the decompressed result can depend on the current sample locations. Fiddling around with the current state of the patches, I could not find a case where it seemed that compressed depth values depended on the sample locations. I figured the depth values in the test were rather compressible, but I don't know any details about Nvidia's depth compression. I wouldn't mind running a trace of the blob and seeing if it does anything though, if you want to be more sure. As for the MS=1 thing, it's for the unlikely case that someone wants to create a single sample texture through some other API than OpenGL or just direct gallium and wants to program the sample locations. It doesn't matter much, though I think it's pretty harmless. On Mon, May 28, 2018 at 9:05 PM, Ilia Mirkin wrote: > ARB_sample_locations has all this stuff about a resolve of some sort > when you switch around the locations. I don't see anything here about > that. Thoughts? 
> > Also some more specific comments inline: > > On Thu, May 10, 2018 at 12:28 PM, Rhys Perry wrote: >> Signed-off-by: Rhys Perry >> --- >> .../drivers/nouveau/codegen/nv50_ir_driver.h | 2 + >> .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 7 + >> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 91 +-- >> .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 2 + >> src/gallium/drivers/nouveau/nv50/nv50_miptree.c| 1 + >> src/gallium/drivers/nouveau/nv50/nv50_resource.h | 1 + >> src/gallium/drivers/nouveau/nvc0/nvc0_context.h| 15 +- >> src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c| 1 + >> src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 3 + >> src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 33 +++- >> src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 17 +- >> .../drivers/nouveau/nvc0/nvc0_state_validate.c | 174 >> + >> 12 files changed, 301 insertions(+), 46 deletions(-) >> >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h >> b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h >> index 3d0782f86b..7c835ceab8 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h >> @@ -73,6 +73,7 @@ struct nv50_ir_prog_symbol >> #define NVISA_GK104_CHIPSET0xe0 >> #define NVISA_GK20A_CHIPSET0xea >> #define NVISA_GM107_CHIPSET0x110 >> +#define NVISA_GM200_CHIPSET0x120 >> >> struct nv50_ir_prog_info >> { >> @@ -145,6 +146,7 @@ struct nv50_ir_prog_info >> bool persampleInvocation; >> bool usesSampleMaskIn; >> bool readsFramebuffer; >> + bool readsSampleLocations; >>} fp; >>struct { >> uint32_t inputOffset; /* base address for user args */ >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp >> b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp >> index 3c5bad05fe..d7844d7381 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp >> @@ -1520,6 +1520,10 @@ void 
Source::scanInstructionSrc(const Instruction& >> insn, >> info->out[src.getIndex(0)].oread = 1; >>} >> } >> + if (src.getFile() == TGSI_FILE_SYSTEM_VALUE) { >> + if (info->sv[src.getIndex(0)].sn == TGSI_SEMANTIC_SAMPLEPOS) >> + info->prop.fp.readsSampleLocations = true; >> + } >> if (src.getFile() != TGSI_FILE_INPUT) >>return; >> >> @@ -1560,6 +1564,9 @@ bool Source::scanInstruction(const struct >> tgsi_full_instruction *inst) >> if (insn.getOpcode() == TGSI_OPCODE_FBFETCH) >>info->prop.fp.readsFramebuffer = true; >> >> + if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE) >> + info->prop.fp.readsSampleLocations = true; >> + >> if (insn.dstCount()) { >>Instruction::DstRegister dst = insn.getDst(0); >> >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> index 29f674b451..5f5298777e 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> @@ -2662,17 +2662,33 @@ NVC0LoweringPass::handleRDSV(Instruction *i) >>ld-
[Mesa-dev] [PATCH v5] nv50/ir, nvc0: add debug options for shader replacement
Changes in v5: - Add a forgotten change to fix memory leaks of fname Changes in v4: - Move code to nv50_ir_dump.cpp - Dump headers of nvc0 programs - Use CRC-32 instead of a truncated SHA1 - Set prog->maxGPR to targ->getFileSize() - 1 and set prog->tlsSize - Don't compile the program if a replacement is offered This has the consequence that a program is not dumped when it's replaced Changes in v3: - Fixed messed up patch description and diff - Use the checksum of the TGSI instead of the binary if possible Changes in v2: - move "#ifdef DEBUG" from above dumpProgram to above createDumpFilename The NV50_PROG_DUMP environment variable specifies an (already created) directory to dump shader binaries, headers and tgsi code. The NV50_PROG_REPLACE environment variable specifies an (already created) directory that is searched to find replacement binaries and headers. This is all much like MESA_SHADER_DUMP_PATH and MESA_SHADER_READ_PATH except using CRC-32 checksums instead of program IDs and chip-specific binaries instead of GLSL. 
Signed-off-by: Rhys Perry --- src/gallium/auxiliary/tgsi/tgsi_util.h | 1 + src/gallium/drivers/nouveau/Makefile.sources | 2 + src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 40 +++-- .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 + .../drivers/nouveau/codegen/nv50_ir_dump.cpp | 174 + src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h | 70 + src/gallium/drivers/nouveau/meson.build| 2 + src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 138 ++-- 8 files changed, 363 insertions(+), 65 deletions(-) create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.cpp create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h index 686b90f467..81cf955d8f 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.h +++ b/src/gallium/auxiliary/tgsi/tgsi_util.h @@ -28,6 +28,7 @@ #ifndef TGSI_UTIL_H #define TGSI_UTIL_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 65f08c7d8d..e867221818 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -114,6 +114,8 @@ NV50_CODEGEN_SOURCES := \ codegen/nv50_ir_build_util.cpp \ codegen/nv50_ir_build_util.h \ codegen/nv50_ir_driver.h \ + codegen/nv50_ir_dump.cpp \ + codegen/nv50_ir_dump.h \ codegen/nv50_ir_emit_nv50.cpp \ codegen/nv50_ir_from_tgsi.cpp \ codegen/nv50_ir_graph.cpp \ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index c987da9908..b1782bb4f2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -23,6 +23,7 @@ #include "codegen/nv50_ir.h" #include "codegen/nv50_ir_target.h" #include "codegen/nv50_ir_driver.h" +#include "codegen/nv50_ir_dump.h" extern "C" { #include "nouveau_debug.h" @@ -1244,30 
+1245,35 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) prog->print(); targ->parseDriverInfo(info); - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); - prog->convertToSSA(); + if (!nv50_ir::replaceProgramCode(prog)) { + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); - if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) - prog->print(); + prog->convertToSSA(); - prog->optimizeSSA(info->optLevel); - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); - if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) - prog->print(); + prog->optimizeSSA(info->optLevel); + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); - if (!prog->registerAllocation()) { - ret = -4; - goto out; - } - prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + prog->print(); - prog->optimizePostRA(info->optLevel); + if (!prog->registerAllocation()) { + ret = -4; + goto out; + } + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); - if (!prog->emitBinary(info)) { - ret = -5; - goto out; + prog->optimizePostRA(info->optLevel); + + if (!prog->emitBinary(info)) { + ret = -5; + goto out; + } + + nv50_ir::dumpProgramCodeAndIR(prog); } out: diff --gi
[Mesa-dev] [PATCH v3 0/5] Implement ARB_sample_locations for nvc0
This patch set adds support for GL_ARB_sample_locations in mesa core, gallium, the mesa OpenGL state tracker and the nvc0 driver. Changes in v3: - Fix non-alphabetical order of new extensions in extensions_table.h - Implement glEvaluateDepthValuesARB()/glResolveDepthValuesNV() - Stylistic changes and addition of comments in the nvc0 code - Renamed patch 5 and added GL_*_sample_locations to the release notes Changes in v2: - various minor changes/cleanups (mostly formatting and style changes) - improve error handling - don't expose the ARB_* variant on ES - expose NV_sample_locations so the feature is available on ES - decouple framebuffer and sample location state in the state tracker and nvc0 - rebase to upstream master Rhys Perry (5): mesa: add support for ARB_sample_locations gallium: add support for programmable sample locations st/mesa: add support for ARB_sample_locations nvc0: add support for programmable sample locations docs: document addition of GL_ARB_sample_locations for nvc0 docs/features.txt | 2 +- docs/relnotes/18.2.0.html | 2 +- src/gallium/auxiliary/util/u_framebuffer.c | 30 +++ src/gallium/auxiliary/util/u_framebuffer.h | 5 + src/gallium/docs/source/context.rst| 14 ++ src/gallium/docs/source/screen.rst | 3 + src/gallium/drivers/etnaviv/etnaviv_screen.c | 1 + src/gallium/drivers/freedreno/freedreno_screen.c | 1 + src/gallium/drivers/i915/i915_screen.c | 1 + src/gallium/drivers/llvmpipe/lp_screen.c | 1 + .../drivers/nouveau/codegen/nv50_ir_driver.h | 2 + .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 7 + .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 102 +++- .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 2 + src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 + src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_context.h| 15 +- src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 3 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 32 +++ src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 17 +- 
.../drivers/nouveau/nvc0/nvc0_state_validate.c | 152 +--- src/gallium/drivers/nouveau/nvc0/nvc0_surface.c| 12 + src/gallium/drivers/r300/r300_screen.c | 1 + src/gallium/drivers/r600/r600_pipe.c | 1 + src/gallium/drivers/radeonsi/si_get.c | 1 + src/gallium/drivers/softpipe/sp_screen.c | 1 + src/gallium/drivers/svga/svga_screen.c | 1 + src/gallium/drivers/swr/swr_screen.cpp | 1 + src/gallium/drivers/v3d/v3d_screen.c | 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/drivers/virgl/virgl_screen.c | 1 + src/gallium/include/pipe/p_context.h | 41 +++- src/gallium/include/pipe/p_defines.h | 1 + src/gallium/include/pipe/p_screen.h| 11 + src/gallium/include/pipe/p_state.h | 1 + src/mapi/glapi/gen/gl_API.xml | 104 + src/mesa/main/config.h | 9 + src/mesa/main/dd.h | 8 + src/mesa/main/extensions_table.h | 2 + src/mesa/main/fbobject.c | 256 ++--- src/mesa/main/fbobject.h | 20 ++ src/mesa/main/framebuffer.c| 10 + src/mesa/main/get.c| 31 +++ src/mesa/main/get_hash_params.py | 6 + src/mesa/main/mtypes.h | 9 + src/mesa/main/multisample.c| 18 ++ src/mesa/main/tests/dispatch_sanity.cpp| 10 + src/mesa/state_tracker/st_atom.h | 2 +- src/mesa/state_tracker/st_atom_list.h | 2 +- src/mesa/state_tracker/st_atom_msaa.c | 77 ++- src/mesa/state_tracker/st_cb_fbo.c | 14 ++ src/mesa/state_tracker/st_cb_msaa.c| 27 +++ src/mesa/state_tracker/st_context.c| 7 +- src/mesa/state_tracker/st_context.h| 6 + src/mesa/state_tracker/st_extensions.c | 1 + 55 files changed, 1004 insertions(+), 84 deletions(-) -- 2.14.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 2/5] gallium: add support for programmable sample locations
Signed-off-by: Rhys Perry Reviewed-by: Brian Paul (v2) Reviewed-by: Marek Olšák (v2) --- src/gallium/auxiliary/util/u_framebuffer.c | 30 + src/gallium/auxiliary/util/u_framebuffer.h | 5 +++ src/gallium/docs/source/context.rst | 14 src/gallium/docs/source/screen.rst | 3 ++ src/gallium/drivers/etnaviv/etnaviv_screen.c | 1 + src/gallium/drivers/freedreno/freedreno_screen.c | 1 + src/gallium/drivers/i915/i915_screen.c | 1 + src/gallium/drivers/llvmpipe/lp_screen.c | 1 + src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 + src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 + src/gallium/drivers/r300/r300_screen.c | 1 + src/gallium/drivers/r600/r600_pipe.c | 1 + src/gallium/drivers/radeonsi/si_get.c| 1 + src/gallium/drivers/softpipe/sp_screen.c | 1 + src/gallium/drivers/svga/svga_screen.c | 1 + src/gallium/drivers/swr/swr_screen.cpp | 1 + src/gallium/drivers/v3d/v3d_screen.c | 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/drivers/virgl/virgl_screen.c | 1 + src/gallium/include/pipe/p_context.h | 41 ++-- src/gallium/include/pipe/p_defines.h | 1 + src/gallium/include/pipe/p_screen.h | 11 +++ src/gallium/include/pipe/p_state.h | 1 + 24 files changed, 120 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c index c2948a5cfb..5bafddc726 100644 --- a/src/gallium/auxiliary/util/u_framebuffer.c +++ b/src/gallium/auxiliary/util/u_framebuffer.c @@ -240,3 +240,33 @@ util_framebuffer_get_num_samples(const struct pipe_framebuffer_state *fb) return 1; } + + +/** + * Flip the sample location state along the Y axis. 
+ */ +void +util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height, + unsigned samples, uint8_t *locations) +{ + unsigned row, i, shift, grid_width, grid_height; + uint8_t new_locations[ + PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * + PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32]; + + screen->get_sample_pixel_grid(screen, samples, &grid_width, &grid_height); + + shift = fb_height % grid_height; + + for (row = 0; row < grid_height; row++) { + unsigned row_size = grid_width * samples; + for (i = 0; i < row_size; i++) { + unsigned dest_row = grid_height - row - 1; + /* this relies on unsigned integer wraparound behaviour */ + dest_row = (dest_row - shift) % grid_height; + new_locations[dest_row * row_size + i] = locations[row * row_size + i]; + } + } + + memcpy(locations, new_locations, grid_width * grid_height * samples); +} diff --git a/src/gallium/auxiliary/util/u_framebuffer.h b/src/gallium/auxiliary/util/u_framebuffer.h index c73942c9c1..877e6e393f 100644 --- a/src/gallium/auxiliary/util/u_framebuffer.h +++ b/src/gallium/auxiliary/util/u_framebuffer.h @@ -64,6 +64,11 @@ extern unsigned util_framebuffer_get_num_samples(const struct pipe_framebuffer_state *fb); +extern void +util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height, + unsigned samples, uint8_t *locations); + + #ifdef __cplusplus } #endif diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst index e8e80dcbc3..20d0df7931 100644 --- a/src/gallium/docs/source/context.rst +++ b/src/gallium/docs/source/context.rst @@ -68,6 +68,9 @@ objects. They all follow simple, one-method binding calls, e.g. that this takes effect even if multisampling is not explicitly enabled if the frambuffer surface(s) are multisampled. Also, this mask is AND-ed with the optional fragment shader sample mask output (when emitted). +* ``set_sample_locations`` sets the sample locations used for rasterization. + ```get_sample_position``` still returns the default locations. 
When NULL, + the default locations are used. * ``set_min_samples`` sets the minimum number of samples that must be run. * ``set_clip_state`` * ``set_polygon_stipple`` @@ -270,6 +273,17 @@ format. multi-byte element value starting at offset bytes from resource start, going for size bytes. It is guaranteed that size % clear_value_size == 0. +Evaluating Depth Buffers + + +``evaluate_depth_buffer`` is a hint to decompress the current depth buffer +assuming the current sample locations to avoid problems that could arise when +using programmable sample locations. + +If a depth buffer is rendered with different sample location state than +what is current at the time of reading the depth buffer, the values may differ +because depth buffer compression can depend on the sample locations. + Uploading ^ diff --
[Mesa-dev] [PATCH v3 1/5] mesa: add support for ARB_sample_locations
Signed-off-by: Rhys Perry Reviewed-by: Brian Paul (v2) Reviewed-by: Marek Olšák (v2) --- src/mapi/glapi/gen/gl_API.xml | 104 + src/mesa/main/config.h | 9 ++ src/mesa/main/dd.h | 8 + src/mesa/main/extensions_table.h| 2 + src/mesa/main/fbobject.c| 256 src/mesa/main/fbobject.h| 20 +++ src/mesa/main/framebuffer.c | 10 ++ src/mesa/main/get.c | 31 src/mesa/main/get_hash_params.py| 6 + src/mesa/main/mtypes.h | 9 ++ src/mesa/main/multisample.c | 18 +++ src/mesa/main/tests/dispatch_sanity.cpp | 10 ++ 12 files changed, 455 insertions(+), 28 deletions(-) diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index 8ad45970c9..49807e1ea5 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -10881,6 +10881,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h index 81573bfbf2..6a2f766222 100644 --- a/src/mesa/main/config.h +++ b/src/mesa/main/config.h @@ -315,4 +315,13 @@ #define MAX_CLIPPED_VERTICES ((2 * (6 + MAX_CLIP_PLANES))+1) +/** For GL_ARB_sample_locations - maximum of SAMPLE_LOCATION_PIXEL_GRID_*_ARB */ +#define MAX_SAMPLE_LOCATION_GRID_SIZE 4 + +/* It is theoretically possible for Consts.MaxSamples to be >32 but + * other code seems to assume that is not the case. 
+ */ +#define MAX_SAMPLE_LOCATION_TABLE_SIZE \ + (MAX_SAMPLE_LOCATION_GRID_SIZE * MAX_SAMPLE_LOCATION_GRID_SIZE * 32) + #endif /* MESA_CONFIG_H_INCLUDED */ diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 9f9606ac6b..1b048d3ff8 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -787,6 +787,14 @@ struct dd_function_table { GLenum target, GLsizei numAttachments, const GLenum *attachments); + /** +* \name Functions for GL_ARB_sample_locations +*/ + void (*GetProgrammableSampleCaps)(struct gl_context *ctx, + const struct gl_framebuffer *fb, + GLuint *bits, GLuint *width, GLuint *height); + void (*EvaluateDepthValues)(struct gl_context *ctx); + /** * \name Query objects */ diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 9207e3f8c6..ab1fd170bd 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -103,6 +103,7 @@ EXT(ARB_provoking_vertex, EXT_provoking_vertex EXT(ARB_query_buffer_object , ARB_query_buffer_object , GLL, GLC, x , x , 2013) EXT(ARB_robust_buffer_access_behavior , ARB_robust_buffer_access_behavior , GLL, GLC, x , x , 2012) EXT(ARB_robustness , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_sample_locations, ARB_sample_locations , GLL, GLC, x , x , 2015) EXT(ARB_sample_shading , ARB_sample_shading , GLL, GLC, x , x , 2009) EXT(ARB_sampler_objects , dummy_true , GLL, GLC, x , x , 2009) EXT(ARB_seamless_cube_map , ARB_seamless_cube_map , GLL, GLC, x , x , 2009) @@ -350,6 +351,7 @@ EXT(NV_read_buffer , dummy_true EXT(NV_read_depth , dummy_true , x , x , x , ES2, 2011) EXT(NV_read_depth_stencil , dummy_true , x , x , x , ES2, 2011) EXT(NV_read_stencil , dummy_true , x , x , x , ES2, 2011) +EXT(NV_sample_locations , ARB_sample_locations , GLL, GLC, x , ES2, 2015) EXT(NV_texgen_reflection, dummy_true , GLL, x , x , x , 1999) EXT(NV_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2009) EXT(NV_texture_env_combine4 , NV_texture_env_combine4 , GLL, x , x ,
[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for ARB_sample_locations
Signed-off-by: Rhys Perry Reviewed-by: Brian Paul (v2) Reviewed-by: Marek Olšák (v2) --- src/mesa/state_tracker/st_atom.h | 2 +- src/mesa/state_tracker/st_atom_list.h | 2 +- src/mesa/state_tracker/st_atom_msaa.c | 77 +- src/mesa/state_tracker/st_cb_fbo.c | 14 +++ src/mesa/state_tracker/st_cb_msaa.c| 27 src/mesa/state_tracker/st_context.c| 7 ++-- src/mesa/state_tracker/st_context.h| 6 +++ src/mesa/state_tracker/st_extensions.c | 1 + 8 files changed, 129 insertions(+), 7 deletions(-) diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h index 2567ad30df..96e128d38c 100644 --- a/src/mesa/state_tracker/st_atom.h +++ b/src/mesa/state_tracker/st_atom.h @@ -86,7 +86,7 @@ enum { ST_NEW_CS_SAMPLERS) #define ST_NEW_FRAMEBUFFER (ST_NEW_FB_STATE | \ - ST_NEW_SAMPLE_MASK | \ + ST_NEW_SAMPLE_STATE | \ ST_NEW_SAMPLE_SHADING) #define ST_NEW_VERTEX_PROGRAM(st, p) (p->affected_states | \ diff --git a/src/mesa/state_tracker/st_atom_list.h b/src/mesa/state_tracker/st_atom_list.h index 5391d4710c..e1aebc91e7 100644 --- a/src/mesa/state_tracker/st_atom_list.h +++ b/src/mesa/state_tracker/st_atom_list.h @@ -34,7 +34,7 @@ ST_STATE(ST_NEW_FS_IMAGES, st_bind_fs_images) ST_STATE(ST_NEW_FB_STATE, st_update_framebuffer_state) /* depends on update_*_texture and bind_*_images */ ST_STATE(ST_NEW_BLEND, st_update_blend) /* depends on update_framebuffer_state */ ST_STATE(ST_NEW_RASTERIZER, st_update_rasterizer) /* depends on update_framebuffer_state */ -ST_STATE(ST_NEW_SAMPLE_MASK, st_update_sample_mask) /* depends on update_framebuffer_state */ +ST_STATE(ST_NEW_SAMPLE_STATE, st_update_sample_state) /* depends on update_framebuffer_state */ ST_STATE(ST_NEW_SAMPLE_SHADING, st_update_sample_shading) ST_STATE(ST_NEW_SCISSOR, st_update_scissor) /* depends on update_framebuffer_state */ ST_STATE(ST_NEW_VIEWPORT, st_update_viewport) /* depends on update_framebuffer_state */ diff --git a/src/mesa/state_tracker/st_atom_msaa.c b/src/mesa/state_tracker/st_atom_msaa.c index 
556c7c5889..c6affec552 100644 --- a/src/mesa/state_tracker/st_atom_msaa.c +++ b/src/mesa/state_tracker/st_atom_msaa.c @@ -33,13 +33,84 @@ #include "st_program.h" #include "cso_cache/cso_context.h" +#include "util/u_framebuffer.h" #include "main/framebuffer.h" -/* Update the sample mask for MSAA. +/** + * Update the sample locations + */ +static void +update_sample_locations(struct st_context *st) +{ + struct gl_framebuffer *fb = st->ctx->DrawBuffer; + + if (!st->ctx->Extensions.ARB_sample_locations) + return; + + if (fb->ProgrammableSampleLocations) { + unsigned grid_width, grid_height, size, pixel, sample_index; + unsigned samples = st->state.fb_num_samples; + bool sample_location_pixel_grid = fb->SampleLocationPixelGrid; + uint8_t locations[ + PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * + PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32]; + + st->pipe->screen->get_sample_pixel_grid( + st->pipe->screen, samples, &grid_width, &grid_height); + size = grid_width * grid_height * samples; + + /** + * when a dimension is greater than MAX_SAMPLE_LOCATION_GRID_SIZE, + * st->ctx->Driver.GetSamplePixelGrid() returns 1 for both dimensions. 
+ */ + if (grid_width > MAX_SAMPLE_LOCATION_GRID_SIZE || + grid_height > MAX_SAMPLE_LOCATION_GRID_SIZE) + sample_location_pixel_grid = false; + + for (pixel = 0; pixel < grid_width * grid_height; pixel++) { + for (sample_index = 0; sample_index < samples; sample_index++) { +int table_index = sample_index; +float x = 0.5f, y = 0.5f; +uint8_t loc; +if (sample_location_pixel_grid) + table_index = pixel * samples + sample_index; +if (fb->SampleLocationTable) { + x = fb->SampleLocationTable[table_index*2]; + y = fb->SampleLocationTable[table_index*2+1]; +} +if (st->state.fb_orientation == Y_0_BOTTOM) + y = 1.0 - y; + +loc = roundf(CLAMP(x * 16.0f, 0.0f, 15.0f)); +loc |= (int)roundf(CLAMP(y * 16.0f, 0.0f, 15.0f)) << 4; +locations[pixel * samples + sample_index] = loc; + } + } + + util_sample_locations_flip_y( + st->pipe->screen, st->state.fb_height, samples, locations); + + if (!st->state.enable_sample_locations || + st->state.sample_locations_samples != samples || + memcmp(locations, st->state.sample_locations, size) != 0) { + st->pipe->set_sample_locations( st->pipe, size, locations); +
[Mesa-dev] [PATCH v3 4/5] nvc0: add support for programmable sample locations
Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_driver.h | 2 + .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 7 + .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 102 -- .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 2 + src/gallium/drivers/nouveau/nvc0/nvc0_context.h| 15 +- src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 3 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 33 - src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 17 ++- .../drivers/nouveau/nvc0/nvc0_state_validate.c | 152 + src/gallium/drivers/nouveau/nvc0/nvc0_surface.c| 12 ++ 10 files changed, 299 insertions(+), 46 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 3d0782f86b..7c835ceab8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -73,6 +73,7 @@ struct nv50_ir_prog_symbol #define NVISA_GK104_CHIPSET0xe0 #define NVISA_GK20A_CHIPSET0xea #define NVISA_GM107_CHIPSET0x110 +#define NVISA_GM200_CHIPSET0x120 struct nv50_ir_prog_info { @@ -145,6 +146,7 @@ struct nv50_ir_prog_info bool persampleInvocation; bool usesSampleMaskIn; bool readsFramebuffer; + bool readsSampleLocations; } fp; struct { uint32_t inputOffset; /* base address for user args */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 3c5bad05fe..d7844d7381 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1520,6 +1520,10 @@ void Source::scanInstructionSrc(const Instruction& insn, info->out[src.getIndex(0)].oread = 1; } } + if (src.getFile() == TGSI_FILE_SYSTEM_VALUE) { + if (info->sv[src.getIndex(0)].sn == TGSI_SEMANTIC_SAMPLEPOS) + info->prop.fp.readsSampleLocations = true; + } if (src.getFile() != TGSI_FILE_INPUT) return; @@ -1560,6 +1564,9 @@ bool Source::scanInstruction(const 
struct tgsi_full_instruction *inst) if (insn.getOpcode() == TGSI_OPCODE_FBFETCH) info->prop.fp.readsFramebuffer = true; + if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE) + info->prop.fp.readsSampleLocations = true; + if (insn.dstCount()) { Instruction::DstRegister dst = insn.getDst(0); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 29f674b451..5723847234 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -2662,17 +2662,33 @@ NVC0LoweringPass::handleRDSV(Instruction *i) ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID; break; case SV_SAMPLE_POS: { - Value *off = new_LValue(func, FILE_GPR); - ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0)); + Value *sampleID = bld.getScratch(); + ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0)); ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID; - bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3)); - bld.mkLoad(TYPE_F32, - i->getDef(0), - bld.mkSymbol( - FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, - TYPE_U32, prog->driver->io.sampleInfoBase + - 4 * sym->reg.data.sv.index), - off); + Value *offset = calculateSampleOffset(sampleID); + + assert(prog->driver->prop.fp.readsSampleLocations); + + if (targ->getChipset() >= NVISA_GM200_CHIPSET) { + bld.mkLoad(TYPE_F32, +i->getDef(0), +bld.mkSymbol( + FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, + TYPE_U32, prog->driver->io.sampleInfoBase), +offset); + bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), + bld.mkImm(0x040c + sym->reg.data.sv.index * 16)); + bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0)); + bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f)); + } else { + bld.mkLoad(TYPE_F32, +i->getDef(0), +bld.mkSymbol( + FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, + TYPE_U32, prog->driver->io.sampleInfoBase + + 4 * 
sym->reg.data.sv.index), +offset); + } break; } case SV_SAMPLE_MASK: { @@ -2832,6 +2848,69 @@ NVC0Lo
[Mesa-dev] [PATCH v3 5/5] docs: document addition of GL_ARB_sample_locations for nvc0
Signed-off-by: Rhys Perry Reviewed-by: Brian Paul (v2) --- docs/features.txt | 2 +- docs/relnotes/18.2.0.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index e786bbecf4..2eac14fb32 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -305,7 +305,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_parallel_shader_compilenot started, but Chia-I Wu did some related work in 2014 GL_ARB_post_depth_coverageDONE (i965, nvc0) GL_ARB_robustness_isolation not started - GL_ARB_sample_locations not started + GL_ARB_sample_locations DONE (nvc0) GL_ARB_seamless_cubemap_per_texture DONE (i965, nvc0, radeonsi, r600, softpipe, swr) GL_ARB_shader_ballot DONE (i965/gen8+, nvc0, radeonsi) GL_ARB_shader_clock DONE (i965/gen7+, nv50, nvc0, r600, radeonsi) diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html index f3bdb6605c..1e24d9c9de 100644 --- a/docs/relnotes/18.2.0.html +++ b/docs/relnotes/18.2.0.html @@ -44,7 +44,7 @@ Note: some of the new features are only available with certain drivers. -TBD +GL_ARB_sample_locations and GL_NV_sample_locations on nvc0 (GM200+) Bug fixes -- 2.14.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nv50/ir: fix image stores with indirect handles
Having this if statement here prevented the next if statement from being reached in the case of image stores, which is needed for instructions with indirect bindless handles like "STORE TEMP[ADDR[2].x+1](1) ...". Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 3c5bad05fe..7712963c53 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1563,6 +1563,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) if (insn.dstCount()) { Instruction::DstRegister dst = insn.getDst(0); + if (insn.getOpcode() == TGSI_OPCODE_STORE && + dst.getFile() != TGSI_FILE_MEMORY) { + info->io.globalAccess |= 0x2; + } + if (dst.getFile() == TGSI_FILE_OUTPUT) { if (dst.isIndirect(0)) for (unsigned i = 0; i < info->numOutputs; ++i) @@ -1580,10 +1585,6 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) if (isEdgeFlagPassthrough(insn)) info->io.edgeFlagIn = insn.getSrc(0).getIndex(0); } else - if (dst.getFile() != TGSI_FILE_MEMORY && - insn.getOpcode() == TGSI_OPCODE_STORE) { - info->io.globalAccess |= 0x2; - } else if (dst.getFile() == TGSI_FILE_TEMPORARY) { if (dst.isIndirect(0)) indirectTempArrays.insert(dst.getArrayId()); -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless
Previously, there were some errors in the compiler's implementation of ARB_bindless_texture, mostly related to usage of bound image or sampler handles allowed by ARB_bindless_texture, resulting in assertions or compilation errors. This series fixes the following issues found in mesa: - Assertions when casting bound handles to uvec2 - Compilation errors when using the ?: operator with bound handles - Assertions when creating a constant image/sampler handle - For example: image2D(uvec2(5, 6)) - Inlining of function calls with rvalues other than dereferences to handle uniforms passed into them creates assertion failures - Usage of bound handles as l-values In order to create bindless handles from bound images or samplers, two new TGSI opcodes needed to be added: SAMP2HND and IMG2HND. These are used when casting bound handles or when using them as l-values (e.g. using them with the ?: operator). This series has the following limitations because I don't have the hardware needed to test the needed changes: - radeonsi and gallivm do not handle SAMP2HND and IMG2HND - similar instructions/intrinsics for nir have not been added - the tgsi to nir conversion code does not handle SAMP2HND and IMG2HND - IMG2HND with Kepler is not implemented Usage of bound handles as l-values and casting them is handled better than before though. Some tests for these changes have been posted on the piglit mailing list. 
Rhys Perry (6): gallium: add new SAMP2HND and IMG2HND opcodes nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+ glsl_to_tgsi: allow bound samplers and images to be used as l-values glsl: allow ?: operator with images and samplers when bindless is enabled glsl,glsl_to_tgsi: fix sampler/image constants glsl: fix function inlining with opaque parameters src/compiler/glsl/ast_to_hir.cpp | 8 ++- src/compiler/glsl/ir.cpp | 32 +- src/compiler/glsl/opt_function_inlining.cpp| 52 +--- src/gallium/auxiliary/tgsi/tgsi_info.c | 2 + src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h | 4 +- src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h | 3 + src/gallium/docs/source/tgsi.rst | 25 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 2 + src/gallium/drivers/nouveau/codegen/nv50_ir.h | 2 + .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 22 +++ .../drivers/nouveau/codegen/nv50_ir_inlines.h | 4 +- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 25 .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 1 + .../drivers/nouveau/codegen/nv50_ir_print.cpp | 2 + .../drivers/nouveau/codegen/nv50_ir_target.cpp | 7 ++- src/gallium/include/pipe/p_shader_tokens.h | 2 + src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 69 -- src/mesa/state_tracker/st_glsl_to_tgsi_private.h | 1 + 18 files changed, 239 insertions(+), 24 deletions(-) -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/6] glsl: fix function inlining with opaque parameters
Signed-off-by: Rhys Perry --- src/compiler/glsl/opt_function_inlining.cpp | 52 - 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/src/compiler/glsl/opt_function_inlining.cpp b/src/compiler/glsl/opt_function_inlining.cpp index 04690b6cf4..52f57da936 100644 --- a/src/compiler/glsl/opt_function_inlining.cpp +++ b/src/compiler/glsl/opt_function_inlining.cpp @@ -131,6 +131,18 @@ ir_save_lvalue_visitor::visit_enter(ir_dereference_array *deref) return visit_stop; } +static bool +should_replace_variable(ir_variable *sig_param, ir_rvalue *param) { + /* For opaque types, we want the inlined variable references +* referencing the passed in variable, since that will have +* the location information, which an assignment of an opaque +* variable wouldn't. +*/ + return sig_param->type->contains_opaque() && + param->is_dereference() && + sig_param->data.mode == ir_var_function_in; +} + void ir_call::generate_inline(ir_instruction *next_ir) { @@ -155,12 +167,8 @@ ir_call::generate_inline(ir_instruction *next_ir) ir_rvalue *param = (ir_rvalue *) actual_node; /* Generate a new variable for the parameter. */ - if (sig_param->type->contains_opaque()) { -/* For opaque types, we want the inlined variable references - * referencing the passed in variable, since that will have - * the location information, which an assignment of an opaque - * variable wouldn't. Fix it up below. 
- */ + if (should_replace_variable(sig_param, param)) { + /* Actual replacement happens below */ parameters[i] = NULL; } else { parameters[i] = sig_param->clone(ctx, ht); @@ -242,10 +250,9 @@ ir_call::generate_inline(ir_instruction *next_ir) ir_rvalue *const param = (ir_rvalue *) actual_node; ir_variable *sig_param = (ir_variable *) formal_node; - if (sig_param->type->contains_opaque()) { + if (should_replace_variable(sig_param, param)) { ir_dereference *deref = param->as_dereference(); -assert(deref); do_variable_replacement(&new_instructions, sig_param, deref); } } @@ -351,6 +358,9 @@ public: virtual ir_visitor_status visit_leave(ir_dereference_array *); virtual ir_visitor_status visit_leave(ir_dereference_record *); virtual ir_visitor_status visit_leave(ir_texture *); + virtual ir_visitor_status visit_leave(ir_assignment *); + virtual ir_visitor_status visit_leave(ir_expression *); + virtual ir_visitor_status visit_leave(ir_return *); void replace_deref(ir_dereference **deref); void replace_rvalue(ir_rvalue **rvalue); @@ -391,6 +401,32 @@ ir_variable_replacement_visitor::visit_leave(ir_texture *ir) return visit_continue; } +ir_visitor_status +ir_variable_replacement_visitor::visit_leave(ir_assignment *ir) +{ + replace_deref(&ir->lhs); + replace_rvalue(&ir->rhs); + + return visit_continue; +} + +ir_visitor_status +ir_variable_replacement_visitor::visit_leave(ir_expression *ir) +{ + for (uint8_t i = 0; i < ir->num_operands; i++) + replace_rvalue(&ir->operands[i]); + + return visit_continue; +} + +ir_visitor_status +ir_variable_replacement_visitor::visit_leave(ir_return *ir) +{ + replace_rvalue(&ir->value); + + return visit_continue; +} + ir_visitor_status ir_variable_replacement_visitor::visit_leave(ir_dereference_array *ir) { -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/6] glsl: allow ?: operator with images and samplers when bindless is enabled
Signed-off-by: Rhys Perry --- src/compiler/glsl/ast_to_hir.cpp | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 3bf581571e..8a7dd62506 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -1850,9 +1850,11 @@ ast_expression::do_hir(exec_list *instructions, * expressions; such use results in a compile-time error." */ if (type->contains_opaque()) { - _mesa_glsl_error(&loc, state, "opaque variables cannot be operands " - "of the ?: operator"); - error_emitted = true; + if (!(state->has_bindless() && (type->is_image() || type->is_sampler( { +_mesa_glsl_error(&loc, state, "variables of type %s cannot be " + "operands of the ?: operator", type->name); +error_emitted = true; + } } ir_constant *cond_val = op[0]->constant_expression_value(ctx); -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/6] nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+
Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 2 ++ src/gallium/drivers/nouveau/codegen/nv50_ir.h | 2 ++ .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 22 +++ .../drivers/nouveau/codegen/nv50_ir_inlines.h | 4 ++-- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 25 ++ .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 1 + .../drivers/nouveau/codegen/nv50_ir_print.cpp | 2 ++ .../drivers/nouveau/codegen/nv50_ir_target.cpp | 7 +++--- 8 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index c987da9908..7c1c76a912 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -903,6 +903,8 @@ TexInstruction::TexInstruction(Function *fn, operation op) if (op == OP_TXF) sType = TYPE_U32; + if (op == OP_SAMP2HND || op == OP_IMG2HND) + setType(TYPE_U32); } TexInstruction::~TexInstruction() diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index f4f3c70888..97aa8d1109 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -134,6 +134,8 @@ enum operation OP_SUCLAMP, // clamp surface coordinates OP_SUEAU, // surface effective address OP_SUQ, // surface query + OP_SAMP2HND, // convert bound texture to bindless handle + OP_IMG2HND, // convert bound image to bindless handle OP_MADSP, // special integer multiply-add OP_TEXBAR, // texture dependency barrier OP_DFDX, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 3c5bad05fe..8149c72dd1 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -3570,6 +3570,28 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) handleTXQ(dst0, TXQ_TYPE, 0); 
std::swap(dst0[0], dst0[2]); break; + case TGSI_OPCODE_IMG2HND: + case TGSI_OPCODE_SAMP2HND: + if (!tgsi.getDst(0).isMasked(1)) + mkOp1(OP_MOV, TYPE_U32, dst0[1], mkImm(0)); + + if (!tgsi.getDst(0).isMasked(0)) { + bool is_image = tgsi.getOpcode() == TGSI_OPCODE_IMG2HND; + + TexInstruction *texi = new_TexInstruction( +func, is_image ? OP_IMG2HND : OP_SAMP2HND); + texi->setDef(0, dst0[0]); + if (is_image) +texi->tex.target = tgsi.getImageTarget(); + else +texi->tex.target = tgsi.getTexture(code, 0); + texi->tex.r = tgsi.getSrc(0).getIndex(0); + if (tgsi.getSrc(0).isIndirect(0)) +texi->setIndirectR(fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, NULL)); + + bb->insertTail(texi); + } + break; case TGSI_OPCODE_FBFETCH: handleFBFETCH(dst0); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h index 4cb53ab42e..0262ae9d1f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h @@ -311,14 +311,14 @@ const FlowInstruction *Instruction::asFlow() const TexInstruction *Instruction::asTex() { - if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ) + if ((op >= OP_TEX && op <= OP_SULEA) || (op >= OP_SUQ && op <= OP_IMG2HND)) return static_cast(this); return NULL; } const TexInstruction *Instruction::asTex() const { - if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ) + if ((op >= OP_TEX && op <= OP_SULEA) || (op >= OP_SUQ && op <= OP_IMG2HND)) return static_cast(this); return NULL; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 29f674b451..c2cc120147 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1347,6 +1347,27 @@ NVC0LoweringPass::handleBUFQ(Instruction *bufq) return true; } +bool +NVC0LoweringPass::handle2HND(TexInstruction *i) +{ + 
assert(targ->getChipset() >= NVISA_GK104_CHIPSET); + assert(!i->tex.bindless); + bool is_sampler = i->op == OP_SAMP2HND; + + if (is_sampler || targ->getChipset() >= NVISA_GM107_CHIPSET) { + //Sampler or image on GM107+ + uint16_t slot = (is_sampler ? 0 : 32) + i->tex.r; + Value *hnd = loadTexHandle(i->getIndirectR(), slot); + bld.mkOp1(OP_MOV, TYPE_U32, i->getDef(0), hnd); + } else { + //Image on NVE4/GK104 + assert(!"not implemented&
[Mesa-dev] [PATCH 5/6] glsl, glsl_to_tgsi: fix sampler/image constants
Signed-off-by: Rhys Perry --- src/compiler/glsl/ir.cpp | 32 -- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 14 ++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp index e3134eaa1c..1d1a56ae9a 100644 --- a/src/compiler/glsl/ir.cpp +++ b/src/compiler/glsl/ir.cpp @@ -820,6 +820,10 @@ ir_constant::ir_constant(const struct glsl_type *type, exec_list *value_list) for (unsigned i = 0; i < type->components(); i++) this->value.b[i] = value->value.b[0]; break; +case GLSL_TYPE_SAMPLER: +case GLSL_TYPE_IMAGE: + this->value.u64[0] = value->value.u64[0]; + break; default: assert(!"Should not get here."); break; @@ -939,6 +943,8 @@ ir_constant::get_bool_component(unsigned i) const case GLSL_TYPE_FLOAT: return ((int)this->value.f[i]) != 0; case GLSL_TYPE_BOOL: return this->value.b[i]; case GLSL_TYPE_DOUBLE: return this->value.d[i] != 0.0; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return this->value.u64[i] != 0; case GLSL_TYPE_INT64: return this->value.i64[i] != 0; default: assert(!"Should not get here."); break; @@ -959,6 +965,8 @@ ir_constant::get_float_component(unsigned i) const case GLSL_TYPE_FLOAT: return this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 1.0f : 0.0f; case GLSL_TYPE_DOUBLE: return (float) this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return (float) this->value.u64[i]; case GLSL_TYPE_INT64: return (float) this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -979,6 +987,8 @@ ir_constant::get_double_component(unsigned i) const case GLSL_TYPE_FLOAT: return (double) this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 
1.0 : 0.0; case GLSL_TYPE_DOUBLE: return this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return (double) this->value.u64[i]; case GLSL_TYPE_INT64: return (double) this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -999,6 +1009,8 @@ ir_constant::get_int_component(unsigned i) const case GLSL_TYPE_FLOAT: return (int) this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 1 : 0; case GLSL_TYPE_DOUBLE: return (int) this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return (int) this->value.u64[i]; case GLSL_TYPE_INT64: return (int) this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -1019,6 +1031,8 @@ ir_constant::get_uint_component(unsigned i) const case GLSL_TYPE_FLOAT: return (unsigned) this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 1 : 0; case GLSL_TYPE_DOUBLE: return (unsigned) this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return (unsigned) this->value.u64[i]; case GLSL_TYPE_INT64: return (unsigned) this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -1039,6 +1053,8 @@ ir_constant::get_int64_component(unsigned i) const case GLSL_TYPE_FLOAT: return (int64_t) this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 1 : 0; case GLSL_TYPE_DOUBLE: return (int64_t) this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return (int64_t) this->value.u64[i]; case GLSL_TYPE_INT64: return this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -1059,6 +1075,8 @@ ir_constant::get_uint64_component(unsigned i) const case GLSL_TYPE_FLOAT: return (uint64_t) this->value.f[i]; case GLSL_TYPE_BOOL: return this->value.b[i] ? 
1 : 0; case GLSL_TYPE_DOUBLE: return (uint64_t) this->value.d[i]; + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: return this->value.u64[i]; case GLSL_TYPE_INT64: return (uint64_t) this->value.i64[i]; default: assert(!"Should not get here."); break; @@ -1110,6 +1128,8 @@ ir_constant::copy_offset(ir_constant *src, int offset) case GLSL_TYPE_INT: case GLSL_TYPE_FLOAT: case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: case GLSL_TYPE_UINT64: case GLSL_TYPE_INT64: case GLSL_TYPE_BOOL: { @@ -1132,7 +1152,9 @@ ir_constant::copy_offset(ir_constant *src, int offset) case GLSL_TYPE_DOUBLE: value.d[i+offset] = src->get_double_component(i); break; - case GLSL
[Mesa-dev] [PATCH 3/6] glsl_to_tgsi: allow bound samplers and images to be used as l-values
Signed-off-by: Rhys Perry --- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 +++- src/mesa/state_tracker/st_glsl_to_tgsi_private.h | 1 + 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index b321112cf8..7938753453 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -316,6 +316,7 @@ public: st_src_reg *indirect, unsigned *location); st_src_reg canonicalize_gather_offset(st_src_reg offset); + bool handle_bound_deref(ir_dereference *ir); bool try_emit_mad(ir_expression *ir, int mul_operand); @@ -2439,10 +2440,15 @@ st_translate_interp_loc(ir_variable *var) void glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir) { - variable_storage *entry = find_variable_storage(ir->var); + variable_storage *entry; ir_variable *var = ir->var; bool remove_array; + if (handle_bound_deref(ir->as_dereference())) + return; + + entry = find_variable_storage(ir->var); + if (!entry) { switch (var->data.mode) { case ir_var_uniform: @@ -2669,6 +2675,9 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir) bool is_2D = false; ir_variable *var = ir->variable_referenced(); + if (handle_bound_deref(ir->as_dereference())) + return; + /* We only need the logic provided by st_glsl_storage_type_size() * for arrays of structs. Indirect sampler and image indexing is handled * elsewhere. 
@@ -2768,6 +2777,9 @@ glsl_to_tgsi_visitor::visit(ir_dereference_record *ir) ir_variable *var = ir->record->variable_referenced(); int offset = 0; + if (handle_bound_deref(ir->as_dereference())) + return; + ir->record->accept(this); assert(ir->field_idx >= 0); @@ -4110,6 +4122,45 @@ glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset) return offset; } + +bool +glsl_to_tgsi_visitor::handle_bound_deref(ir_dereference *ir) +{ + ir_variable *var = ir->variable_referenced(); + + if (!var || var->data.mode != ir_var_uniform || var->data.bindless || + !(ir->type->is_image() || ir->type->is_sampler())) + return false; + + //Convert from bound sampler/image to bindless handle + bool is_image = ir->type->is_image(); + st_src_reg resource(is_image ? PROGRAM_IMAGE : PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT); + uint16_t index = 0; + unsigned array_size = 1, base = 0; + st_src_reg reladdr; + get_deref_offsets(ir, &array_size, &base, &index, &reladdr, true); + + resource.index = index; + if (reladdr.file != PROGRAM_UNDEFINED) { + resource.reladdr = ralloc(mem_ctx, st_src_reg); + *resource.reladdr = reladdr; + emit_arl(ir, sampler_reladdr, reladdr); + } + + this->result = get_temp(glsl_type::uvec2_type); + st_dst_reg dst(this->result); + dst.writemask = WRITEMASK_XY; + + glsl_to_tgsi_instruction *inst = emit_asm( + ir, is_image ? 
TGSI_OPCODE_IMG2HND : TGSI_OPCODE_SAMP2HND, dst); + + inst->tex_target = ir->type->sampler_index(); + inst->resource = resource; + inst->sampler_array_size = array_size; + inst->sampler_base = base; + + return true; +} void glsl_to_tgsi_visitor::visit(ir_texture *ir) @@ -5904,6 +5955,7 @@ compile_tgsi_instruction(struct st_translate *t, case TGSI_OPCODE_TXL2: case TGSI_OPCODE_TG4: case TGSI_OPCODE_LODQ: + case TGSI_OPCODE_SAMP2HND: if (inst->resource.file == PROGRAM_SAMPLER) { src[num_src] = t->samplers[inst->resource.index]; } else { @@ -5942,6 +5994,7 @@ compile_tgsi_instruction(struct st_translate *t, case TGSI_OPCODE_ATOMUMAX: case TGSI_OPCODE_ATOMIMIN: case TGSI_OPCODE_ATOMIMAX: + case TGSI_OPCODE_IMG2HND: for (i = num_src - 1; i >= 0; i--) src[i + 1] = src[i]; num_src++; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h index c482828edd..fccb7041cf 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h @@ -179,6 +179,7 @@ is_resource_instruction(unsigned opcode) case TGSI_OPCODE_ATOMUMAX: case TGSI_OPCODE_ATOMIMIN: case TGSI_OPCODE_ATOMIMAX: + case TGSI_OPCODE_IMG2HND: return true; default: return false; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/6] gallium: add new SAMP2HND and IMG2HND opcodes
This commit does not add support for the opcodes in gallivm or tgsi_to_nir.c Signed-off-by: Rhys Perry --- src/gallium/auxiliary/tgsi/tgsi_info.c | 2 ++ src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h | 4 ++-- src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h | 3 +++ src/gallium/docs/source/tgsi.rst | 25 + src/gallium/include/pipe/p_shader_tokens.h | 2 ++ 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c index 4aa658785c..bbe1a21e43 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.c +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -153,6 +153,8 @@ tgsi_opcode_infer_type(enum tgsi_opcode opcode) case TGSI_OPCODE_POPC: case TGSI_OPCODE_LSB: case TGSI_OPCODE_UMSB: + case TGSI_OPCODE_IMG2HND: + case TGSI_OPCODE_SAMP2HND: return TGSI_TYPE_UNSIGNED; case TGSI_OPCODE_ARL: case TGSI_OPCODE_ARR: diff --git a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h index 1b2803cf3f..c3787c2fbb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h +++ b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h @@ -162,8 +162,8 @@ OPCODE(1, 1, COMP, IABS) OPCODE(1, 1, COMP, ISSG) OPCODE(1, 2, OTHR, LOAD) OPCODE(1, 2, OTHR, STORE, .is_store = 1) -OPCODE_GAP(163) /* removed */ -OPCODE_GAP(164) /* removed */ +OPCODE(1, 1, OTHR, IMG2HND) +OPCODE(1, 1, OTHR, SAMP2HND, .is_tex = 1) OPCODE_GAP(165) /* removed */ OPCODE(0, 0, OTHR, BARRIER) diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h index 9a13fa6684..54a1ee15b6 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h +++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h @@ -160,6 +160,9 @@ OP13(UCMP) OP11(IABS) OP11(ISSG) +OP11(IMG2HND) +OP11(SAMP2HND) + OP12(IMUL_HI) OP12(UMUL_HI) diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 9e956586c4..a4a78e6267 100644 --- a/src/gallium/docs/source/tgsi.rst +++ 
b/src/gallium/docs/source/tgsi.rst @@ -2592,6 +2592,31 @@ For these opcodes, the resource can be a BUFFER, IMAGE, or MEMORY. barrier in between. +.. _bindlessopcodes: + +Bindless Opcodes + + +These opcodes are for working with bindless sampler or image handles and +require PIPE_CAP_BINDLESS_TEXTURE. + +.. opcode:: IMG2HND - Get a bindless handle for a image + + Syntax: ``IMG2HND dst, image`` + + Example: ``IMG2HND TEMP[0], IMAGE[0]`` + + Sets 'dst' to a bindless handle for 'image'. + +.. opcode:: SAMP2HND - Get a bindless handle for a sampler view + + Syntax: ``SAMP2HND dst, sampler`` + + Example: ``SAMP2HND TEMP[0], SVIEW[0]`` + + Sets 'dst' to a bindless handle for 'sampler'. + + .. _threadsyncopcodes: Inter-thread synchronization opcodes diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index f4e45c2560..08ed08156e 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -506,6 +506,8 @@ enum tgsi_opcode { TGSI_OPCODE_LOAD = 161, TGSI_OPCODE_STORE = 162, + TGSI_OPCODE_IMG2HND= 163, + TGSI_OPCODE_SAMP2HND = 164, /* gap */ TGSI_OPCODE_BARRIER= 166, -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless
Oops, I meant r-values, not l-values. Seems the meaning of the word in my head changed at some point. On Wed, Jun 6, 2018 at 8:55 PM, Rhys Perry wrote: > Previously, there were some errors in the compiler's implementation of > ARB_bindless_texture, mostly related to usage of bound image or sampler > handles allowed by ARB_bindless_texture, resulting in assertions or > compilation errors. This series fixes following issues found in mesa: > - Assertions when casting bound handles to uvec2 > - Compilation errors when using the ?: operator with bound handles > - Assertions creating a constant image/sampler handle >- For example: image2D(uvec2(5, 6)) > - Inlining of function calls with rvalues other than dereferences to > handle uniforms passed into them creates assertion failures > - Usage of bound handles as l-values > > In order to create bindless handles from bound images or samplers, two new > TGSI opcodes needed to be added: SAMP2HND and IMG2HND. These are used when > casting bound handles or when using them as l-values (e.g. using them with > the ?: operator). > > This series has the following limitations because I don't have the > hardware needed to test the needed changes: > - radeonsi and gallivm do not handle SAMP2HND and IMG2HND > - similar instructions/intrinsics for nir have not been added > - the tgsi to nir conversion code does not handle SAMP2HND and IMG2HND > - IMG2HND with Kepler is not implemented > Usage of bound handles as l-values and casting them is handled better than > before though. > > Some tests for these changes have been posted on the piglit mailing list. 
> > Rhys Perry (6): > gallium: add new SAMP2HND and IMG2HND opcodes > nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+ > glsl_to_tgsi: allow bound samplers and images to be used as l-values > glsl: allow ?: operator with images and samplers when bindless is enabled > glsl,glsl_to_tgsi: fix sampler/image constants > glsl: fix function inlining with opaque parameters > > src/compiler/glsl/ast_to_hir.cpp | 8 ++- > src/compiler/glsl/ir.cpp | 32 +- > src/compiler/glsl/opt_function_inlining.cpp| 52 +--- > src/gallium/auxiliary/tgsi/tgsi_info.c | 2 + > src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h | 4 +- > src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h | 3 + > src/gallium/docs/source/tgsi.rst | 25 > src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 2 + > src/gallium/drivers/nouveau/codegen/nv50_ir.h | 2 + > .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 22 +++ > .../drivers/nouveau/codegen/nv50_ir_inlines.h | 4 +- > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 25 > .../nouveau/codegen/nv50_ir_lowering_nvc0.h| 1 + > .../drivers/nouveau/codegen/nv50_ir_print.cpp | 2 + > .../drivers/nouveau/codegen/nv50_ir_target.cpp | 7 ++- > src/gallium/include/pipe/p_shader_tokens.h | 2 + > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 69 > -- > src/mesa/state_tracker/st_glsl_to_tgsi_private.h | 1 + > 18 files changed, 239 insertions(+), 24 deletions(-) > > -- > 2.14.4 > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 00/16] Move the Mesa Website to Sphinx
Might be good to do something like this: https://codepen.io/anon/pen/ERNdYJ So that those with NoScript or something won't have gears constantly rotating on their screen. On Fri, Jun 8, 2018 at 2:25 PM, Erik Faye-Lund wrote: > On Fri, Jun 8, 2018 at 2:06 PM, Rob Clark wrote: >> On Fri, Jun 8, 2018 at 3:02 AM, Jordan Justen wrote: >>> On Thu, Jun 7, 2018 at 2:56 AM Eero Tamminen >>> wrote: On 07.06.2018 12:01, Erik Faye-Lund wrote: > Just as a fun toy, I decided to give an animated SVG "variation" of > this a go myself: > > https://codepen.io/kusma/pen/vrXppL > > The actual SVG can be found here: > > https://gitlab.freedesktop.org/snippets/492 > > The gears were generated by this python script, based on the glxgears > source code: > > https://gitlab.freedesktop.org/snippets/491 > > Now, dropping this onto the black background doesn't work that well, > as it gets a bit bland, so it's probably better to add back the colors > then. > > Also, I'm not really sure if animation is a good idea or not. Maybe it could be a link target for the static logo? (Kind of website "easter egg"). > But I definitely think logos should be vector rather than raster ;) For Mesa, WebGL would be more fitting implementation than SVG though... >>> >>> https://github.com/gears3d/gears3d.github.io/blob/master/webgl10.js >>> >>> One comment I would have for any animation on the main pages (as >>> opposed to a separate 'easter egg' page), it probably should be >>> significantly slower moving than the traditional 70 degrees / second. >>> The faster animation would be distracting on the main pages. >>> >> >> so one idea, which I think isn't too over the top, is to have the >> static mesa-gears logo in top corner, but clicking on it starts/stops >> the animation (just toggle between static and animated svg, I guess?) > > Good idea. I updated the codepen to do a variation of that; it rotates > as long as the mouse hovers it. 
> ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nv50/ir: Improve performance of signed division by powers of two
Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 29 +++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 39177bd044..7a18a5fe73 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1095,10 +1095,35 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->op = OP_MOV; i->setSrc(1, NULL); } else + if (imm0.reg.data.s32 == -1) { + i->op = OP_NEG; + i->setSrc(1, NULL); + } else if (i->dType == TYPE_U32 && imm0.isPow2()) { i->op = OP_SHR; i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32))); } else + if (i->dType == TYPE_S32 && util_is_power_of_two_or_zero(abs(imm0.reg.data.s32))) { + Value *a = i->getSrc(0); + int32_t b = imm0.reg.data.s32; + + if (b < 0) { +a = bld.getSSA(); +bld.mkOp1(OP_NEG, TYPE_S32, a, i->getSrc(0)); +b = -b; + } + + Value *sign = bld.getSSA(); + Value *tmp0 = bld.getSSA(); + Value *tmp1 = bld.getSSA(); + bld.mkOp2(OP_SHR, TYPE_U32, sign, a, bld.mkImm(31)); + bld.mkOp2(OP_ADD, TYPE_U32, tmp0, a, bld.mkImm(b - 1)); + bld.mkOp3(OP_SELP, TYPE_U32, tmp1, tmp0, a, sign); + + i->op = OP_SHR; + i->setSrc(0, tmp1); + i->setSrc(1, bld.mkImm(util_logbase2(b))); + } else if (i->dType == TYPE_U32) { Instruction *mul; Value *tA, *tB; @@ -1129,10 +1154,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); delete_Instruction(prog, i); - } else - if (imm0.reg.data.s32 == -1) { - i->op = OP_NEG; - i->setSrc(1, NULL); } else { LValue *tA, *tB; LValue *tD; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2] nv50/ir: improve performance of signed division by powers of two
Changes in v2: - Stylistic changes - Use OP_SLCT instead of OP_SELP which only worked by luck - Fix issues in edge cases Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 30 +++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 39177bd044..d636eb130a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1095,10 +1095,36 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->op = OP_MOV; i->setSrc(1, NULL); } else + if (imm0.reg.data.s32 == -1) { + i->op = OP_NEG; + i->setSrc(1, NULL); + } else if (i->dType == TYPE_U32 && imm0.isPow2()) { i->op = OP_SHR; i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32))); } else + if (i->dType == TYPE_S32 && util_is_power_of_two_or_zero(llabs(imm0.reg.data.s32))) { + Value *a = i->getSrc(0); + int64_t absb = llabs(imm0.reg.data.s32); + + Value *sign = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), a, bld.mkImm(31)); + Value *adjusted = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), a, + bld.loadImm(bld.getSSA(), (uint32_t)(absb - 1))); + + Value *selected = bld.getSSA(); + bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, selected, TYPE_U32, adjusted, a, sign); + + if (imm0.reg.data.s32 < 0) { +i->op = OP_NEG; +i->setSrc(0, bld.mkOp2v( + OP_SHR, TYPE_S32, bld.getSSA(), selected, bld.mkImm(util_logbase2(absb; +i->setSrc(1, NULL); + } else { +i->op = OP_SHR; +i->setSrc(0, selected); +i->setSrc(1, bld.mkImm(util_logbase2(absb))); + } + } else if (i->dType == TYPE_U32) { Instruction *mul; Value *tA, *tB; @@ -1129,10 +1155,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); delete_Instruction(prog, i); - } else - if (imm0.reg.data.s32 == -1) { - i->op = OP_NEG; - i->setSrc(1, NULL); } else { LValue *tA, *tB; 
LValue *tD; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nv50/ir: fix TargetNVC0::insnCanLoadOffset()
Previously, TargetNVC0::insnCanLoadOffset() returned whether the offset could be set to a specific value. The IndirectPropagation pass expected it to return whether the offset could be increased. Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 954aec0a2f..8938d19f6c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -415,6 +415,7 @@ bool TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const { const ValueRef& ref = insn->src(s); + offset += insn->src(s).get()->reg.data.offset; if (ref.getFile() == FILE_MEMORY_CONST && (insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS)) return offset >= -0x8000 && offset < 0x8000; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless
Ping to those who seem appropriate for this patch in case it was forgotten or missed. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nv50/ir: handle SHLADD in IndirectPropagation
An alternative solution to the problem fixed in 0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end"). Should be useful in the future and seems to make dolphin ubershaders a bit smaller. total instructions in shared programs : 226722 -> 226464 (-0.11%) total gprs used in shared programs: 19378 -> 19378 (0.00%) total shared used in shared programs : 0 -> 0 (0.00%) total local used in shared programs : 0 -> 0 (0.00%) local sharedgpr inst bytes helped 0 0 0 51 51 hurt 0 0 0 0 0 Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 39177bd044..4d0589214d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -283,6 +283,8 @@ class IndirectPropagation : public Pass { private: virtual bool visit(BasicBlock *); + + BuildUtil bld; }; bool @@ -294,6 +296,8 @@ IndirectPropagation::visit(BasicBlock *bb) for (Instruction *i = bb->getEntry(); i; i = next) { next = i->next; + bld.setPosition(i, false); + for (int s = 0; i->srcExists(s); ++s) { Instruction *insn; ImmediateValue imm; @@ -325,6 +329,14 @@ IndirectPropagation::visit(BasicBlock *bb) i->setIndirect(s, 0, NULL); i->setSrc(s, cloneShallow(func, i->getSrc(s))); i->src(s).get()->reg.data.offset += imm.reg.data.u32; + } else if (insn->op == OP_SHLADD) { +if (!insn->src(2).getImmediate(imm) || +!targ->insnCanLoadOffset(i, s, imm.reg.data.s32)) + continue; +i->setIndirect(s, 0, bld.mkOp2v( + OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1))); +i->setSrc(s, cloneShallow(func, i->getSrc(s))); +i->src(s).get()->reg.data.offset += imm.reg.data.u32; } } } @@ -3797,11 +3809,11 @@ Program::optimizeSSA(int level) RUN_PASS(2, AlgebraicOpt, run); RUN_PASS(2, ModifierFolding, run); // before load propagation -> less 
checks RUN_PASS(1, ConstantFolding, foldAll); + RUN_PASS(2, LateAlgebraicOpt, run); RUN_PASS(1, Split64BitOpPreRA, run); RUN_PASS(1, LoadPropagation, run); RUN_PASS(1, IndirectPropagation, run); RUN_PASS(2, MemoryOpt, run); - RUN_PASS(2, LateAlgebraicOpt, run); RUN_PASS(2, LocalCSE, run); RUN_PASS(0, DeadCodeElim, buryAll); -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 0/2] nv50/ir: SHLADD related improvements
This series implements an alternative solution to the problem fixed in 0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end"). Overall, it slightly helps various shaders while slightly hurting a few others. Effects of both patches: total instructions in shared programs : 5265148 -> 5256901 (-0.16%) total gprs used in shared programs: 624346 -> 624328 (-0.00%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 20952 -> 20952 (0.00%) local sharedgpr inst bytes helped 0 0 7120162016 hurt 0 0 52 19 19 Rhys Perry (2): nv50/ir: handle SHLADD in IndirectPropagation nv50/ir: move LateAlgebraicOpt back to right after ConstantFolding src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 2/2] nv50/ir: move LateAlgebraicOpt back to right after ConstantFolding
Reverts 3072bbe ("nv50/ir: move LateAlgebraicOpt to the very end") since SHLADD is now handled in IndirectPropagation. total instructions in shared programs : 5264804 -> 5256901 (-0.15%) total gprs used in shared programs: 624341 -> 624328 (-0.00%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 20952 -> 20952 (0.00%) local sharedgpr inst bytes helped 0 0 6919931993 hurt 0 0 52 32 32 Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 83fb15ca34..4d0589214d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3809,11 +3809,11 @@ Program::optimizeSSA(int level) RUN_PASS(2, AlgebraicOpt, run); RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks RUN_PASS(1, ConstantFolding, foldAll); + RUN_PASS(2, LateAlgebraicOpt, run); RUN_PASS(1, Split64BitOpPreRA, run); RUN_PASS(1, LoadPropagation, run); RUN_PASS(1, IndirectPropagation, run); RUN_PASS(2, MemoryOpt, run); - RUN_PASS(2, LateAlgebraicOpt, run); RUN_PASS(2, LocalCSE, run); RUN_PASS(0, DeadCodeElim, buryAll); -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 1/2] nv50/ir: handle SHLADD in IndirectPropagation
An alternative solution to the problem fixed in 0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end"). total instructions in shared programs : 5265148 -> 5264804 (-0.01%) total gprs used in shared programs: 624346 -> 624341 (-0.00%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 20952 -> 20952 (0.00%) local sharedgpr inst bytes helped 0 0 2 31 31 hurt 0 0 0 0 0 Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 12 1 file changed, 12 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 39177bd044..83fb15ca34 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -283,6 +283,8 @@ class IndirectPropagation : public Pass { private: virtual bool visit(BasicBlock *); + + BuildUtil bld; }; bool @@ -294,6 +296,8 @@ IndirectPropagation::visit(BasicBlock *bb) for (Instruction *i = bb->getEntry(); i; i = next) { next = i->next; + bld.setPosition(i, false); + for (int s = 0; i->srcExists(s); ++s) { Instruction *insn; ImmediateValue imm; @@ -325,6 +329,14 @@ IndirectPropagation::visit(BasicBlock *bb) i->setIndirect(s, 0, NULL); i->setSrc(s, cloneShallow(func, i->getSrc(s))); i->src(s).get()->reg.data.offset += imm.reg.data.u32; + } else if (insn->op == OP_SHLADD) { +if (!insn->src(2).getImmediate(imm) || +!targ->insnCanLoadOffset(i, s, imm.reg.data.s32)) + continue; +i->setIndirect(s, 0, bld.mkOp2v( + OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1))); +i->setSrc(s, cloneShallow(func, i->getSrc(s))); +i->src(s).get()->reg.data.offset += imm.reg.data.u32; } } } -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] nv50/ir: add preliminary support for OP_XMAD
Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 3 ++- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 14 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 12 +-- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 20 + .../drivers/nouveau/codegen/nv50_ir_target.cpp | 7 +++--- .../nouveau/codegen/nv50_ir_target_gm107.cpp | 1 + .../nouveau/codegen/nv50_ir_target_nv50.cpp| 5 +++-- .../nouveau/codegen/nv50_ir_target_nvc0.cpp| 25 -- 8 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index 49425b98b9..99bf8de370 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -53,7 +53,8 @@ Modifier Modifier::operator*(const Modifier m) const b &= ~NV50_IR_MOD_NEG; a = (this->bits ^ b) & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG); - c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT); + c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT | +NV50_IR_MOD_H1 | NV50_IR_MOD_SEXT); return Modifier(a | c); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index f4f3c70888..4deaf09989 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -58,6 +58,7 @@ enum operation OP_FMA, OP_SAD, // abs(src0 - src1) + src2 OP_SHLADD, + OP_XMAD, // extended multiply-add (GM107+), does a lot of things OP_ABS, OP_NEG, OP_NOT, @@ -251,6 +252,13 @@ enum operation #define NV50_IR_SUBOP_VOTE_ALL 0 #define NV50_IR_SUBOP_VOTE_ANY 1 #define NV50_IR_SUBOP_VOTE_UNI 2 +#define NV50_IR_SUBOP_XMAD_PSL (1 << 0) +#define NV50_IR_SUBOP_XMAD_MRG (1 << 1) +#define NV50_IR_SUBOP_XMAD_CLO (1 << 2) +#define NV50_IR_SUBOP_XMAD_CHI (2 << 2) +#define NV50_IR_SUBOP_XMAD_CSFU (3 << 2) +#define NV50_IR_SUBOP_XMAD_CBCC (4 << 2) +#define NV50_IR_SUBOP_XMAD_CMODE_MASK (0x7 << 2) #define 
NV50_IR_SUBOP_MINMAX_LOW 1 #define NV50_IR_SUBOP_MINMAX_MED 2 @@ -527,6 +535,9 @@ struct Storage #define NV50_IR_MOD_SAT (1 << 2) #define NV50_IR_MOD_NOT (1 << 3) #define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS) +// modifiers only for XMAD +#define NV50_IR_MOD_H1 (1 << 4) +#define NV50_IR_MOD_SEXT (1 << 5) #define NV50_IR_INTERP_MODE_MASK 0x3 #define NV50_IR_INTERP_LINEAR (0 << 0) @@ -556,11 +567,14 @@ public: inline Modifier operator&(const Modifier m) const { return bits & m.bits; } inline Modifier operator|(const Modifier m) const { return bits | m.bits; } inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; } + inline Modifier operator~() const { return ~bits; } operation getOp() const; inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; } inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; } + inline int h1() const { return (bits & NV50_IR_MOD_H1) ? 1 : 0; } + inline int sext() const { return (bits & NV50_IR_MOD_SEXT) ? 1 : 0; } inline operator bool() const { return bits ? true : false; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 4d0589214d..a43b481a01 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -191,9 +191,16 @@ void LoadPropagation::checkSwapSrc01(Instruction *insn) { const Target *targ = prog->getTarget(); - if (!targ->getOpInfo(insn).commutative) - if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB) + if (!targ->getOpInfo(insn).commutative) { + if (insn->op != OP_SET && insn->op != OP_SLCT && + insn->op != OP_SUB && insn->op != OP_XMAD) return; + // XMAD is only commutative if both the CBCC and MRG flags are not set. 
+ if (insn->op == OP_XMAD && (insn->subOp & 0x1c) == NV50_IR_SUBOP_XMAD_CBCC) + return; + if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG)) + return; + } if (insn->src(1).getFile() != FILE_GPR) return; // This is the special OP_SET used for alphatesting, we can't reverse its @@ -488,6 +495,7 @@ Modifier::applyTo(ImmediateValue& imm) const imm.reg.data.s32 = -imm.reg.data.s32; if (bits & NV50_IR_MOD_NOT) imm.reg.data.s32 = ~imm.reg.data.s32; + // NOTE: applying the h1 and sext modifiers is confusing and not very useful break; c
[Mesa-dev] [PATCH 4/4] nv50/ir: further optimize multiplication by immediates
Strongly mitigates the harm from the previous commit, which made many integer multiplications much more heavy on the register and instruction count. total instructions in shared programs : 5294693 -> 5268293 (-0.50%) total gprs used in shared programs: 624962 -> 624196 (-0.12%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 21048 -> 20952 (-0.46%) local sharedgpr inst bytes helped 1 0 36817721772 hurt 0 0 74 23 23 Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 123 ++--- src/util/bitscan.h | 26 + 2 files changed, 135 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 84cb5eb04b..aaad4db479 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -371,6 +371,10 @@ private: void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&); CmpInstruction *findOriginForTestWithZero(Value *); + + Value *createMulMethod1(Value *a, unsigned b, Value *c); + Value *createMulMethod2(Value *a, unsigned b, Value *c); + Value *createMul(Value *a, unsigned b, Value *c); unsigned int foldCount; @@ -946,6 +950,97 @@ ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2) return; } } + +Value * +ConstantFolding::createMulMethod1(Value *a, unsigned b, Value *c) +{ + if (b == 1) + return a; + + // Basically constant folded shift and add multiplication. + Value *res = c ? 
c : bld.loadImm(NULL, 0u); + bool resZero = !c; + unsigned ashift = 0; + while (b) { + if ((b & 1) && ashift) { + if (resZero) +res = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, bld.mkImm(ashift)); + else +res = bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, bld.mkImm(ashift), res); + resZero = false; + } else if (b & 1) { + if (resZero) +res = a; + else +res = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), res, a); + resZero = false; + } + b >>= 1; + ashift++; + } + return res; +} + +Value * +ConstantFolding::createMulMethod2(Value *a, unsigned b, Value *c) +{ + uint64_t b2 = u_next_power_of_two(b); + unsigned b2shift = ffsll(b2) - 1; + if (b2 != b) { // a * b2 - a * (b2 - b) + // mul1 = a * (b2 - b) + Value *mul1 = createMulMethod1(a, b2 - b, NULL); + + if (b2shift < 32 && c) { // a * b2 - mul1 + c (implemented as a * b2 + c - mul1) + return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(), + bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), + a, bld.mkImm(b2shift), c), + mul1); + } else + if (b2shift < 32) { // a * b2 - mul1 + Value *res = bld.getSSA(); + Instruction *i = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(b2shift), mul1); + if (bld.getProgram()->getTarget()->isModSupported(i, 2, NV50_IR_MOD_NEG)) +i->src(2).mod *= Modifier(NV50_IR_MOD_NEG); + else +i->setSrc(2, bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1)); + return res; + } else + if (c) { // - mul1 + c (implemented as c - mul1) + return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(), c, mul1); + } else { // - mul1 + return bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1); + } + } else { + if (c) // a * b2 + c + return bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, bld.mkImm(b2shift), c); + else // a * b2 + return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, bld.loadImm(NULL, b2shift)); + } +} + +Value * +ConstantFolding::createMul(Value *a, unsigned b, Value *c) +{ + unsigned cost[2]; + + // Estimate cost for first method (a << i) + (b << j) + ... 
+ cost[0] = u_bit_count64(b >> 1); + + // Estimate cost for second method (a << i) - ((a << j) + (a << k) + ...) + uint64_t rounded_b = u_next_power_of_two(b); + cost[1] = rounded_b == b ? 1 : (u_bit_count64((rounded_b - b) >> 1) + 2); + if (c) cost[1]++; + + // The general method, multiplication by XMADs, costs three instructions. + // So nothing larger than that or it could be making things worse. + if (cost[0] > 3 && cost[1] > 3) + return NULL; + + if (cost[0] < cost[1]) + return createMulMethod1(a, b, c); + else + return createMulMethod2(a, b, c); +} void ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) @@ -1034,13 +1129,13 @
[Mesa-dev] [PATCH 3/4] nv50/ir: optimize imul/imad to xmads
This hits the shader-db numbers a good bit, though a few xmads is way faster than an imul or imad and the cost is mitigated by the next commit, which optimizes many multiplications by immediates into shorter and less register heavy instructions than the xmads. total instructions in shared programs : 5256901 -> 5294693 (0.72%) total gprs used in shared programs: 624328 -> 624962 (0.10%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 20952 -> 21048 (0.46%) local sharedgpr inst bytes helped 0 0 39 0 0 hurt 1 0 33422772277 Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 53 ++ 1 file changed, 53 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index a43b481a01..84cb5eb04b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -2246,13 +2246,18 @@ AlgebraicOpt::visit(BasicBlock *bb) // = // ADD(SHL(a, b), c) -> SHLADD(a, b, c) +// MUL(a, b) -> a few XMADs +// MAD/FMA(a, b, c) -> a few XMADs class LateAlgebraicOpt : public Pass { private: virtual bool visit(Instruction *); void handleADD(Instruction *); + void handleMULMAD(Instruction *); bool tryADDToSHLADD(Instruction *); + + BuildUtil bld; }; void @@ -2312,6 +2317,49 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add) return true; } + +// MUL(a, b) -> a few XMADs +// MAD/FMA(a, b, c) -> a few XMADs +void +LateAlgebraicOpt::handleMULMAD(Instruction *i) +{ + // TODO: handle NV50_IR_SUBOP_MUL_HIGH + if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32)) + return; + if (isFloatType(i->dType) || typeSizeof(i->dType) != 4) + return; + if (i->subOp || i->usesFlags() || i->flagsDef >= 0) + return; + + assert(!i->src(0).mod); + assert(!i->src(1).mod); + assert(i->op == OP_MUL ? 
1 : !i->src(2).mod); + + bld.setPosition(i, true); + + Value *a = i->getSrc(0); + Value *b = i->getSrc(1); + Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2); + + Value *tmp0 = bld.getSSA(); + Value *tmp1 = bld.getSSA(); + + Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c); + insn->setPredicate(i->cc, i->getPredicate()); + + insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0)); + insn->setPredicate(i->cc, i->getPredicate()); + insn->src(1).mod = NV50_IR_MOD_H1; + insn->subOp = NV50_IR_SUBOP_XMAD_MRG; + + insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0); + insn->setPredicate(i->cc, i->getPredicate()); + insn->src(0).mod = NV50_IR_MOD_H1; + insn->src(1).mod = NV50_IR_MOD_H1; + insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC; + + delete_Instruction(prog, i); +} bool LateAlgebraicOpt::visit(Instruction *i) @@ -2320,6 +2368,11 @@ LateAlgebraicOpt::visit(Instruction *i) case OP_ADD: handleADD(i); break; + case OP_MUL: + case OP_MAD: + case OP_FMA: + handleMULMAD(i); + break; default: break; } -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/4] nv50/ir: Improve Performance of Integer Multiplication
This series improves the performance of integer multiplication by removing much usage of the very slow IMAD and IMUL. It depends on the SHLADD/IndirectPropagation patches. The first and second patches add support for the XMAD instruction in codegen. The third patch replaces most IMADs and IMULs with a sequence of XMADs. This is far faster but increases the total instructions in the shader-db by 0.72%. This number is significantly lowered with the next patch. It replaces many multiplications with instructions that should be as fast or faster than the XMAD approach. They are also typically smaller and less register heavy, so they decrease the total instruction count by 0.50%. This series gives roughly a 50% speedup in fragment-heavy scenarios with Dolphin 5.0. All timings were made with interesting looking fifos from Dolphin's bugtracker: Wind Waker: 18 FPS -> 26 FPS at 3x internal resolution Wind Waker: 8 FPS -> 11 FPS at 5x internal resolution Paper Mario?: 26 FPS -> 42 FPS at 5x internal resolution SpongeBob Movie: 19 FPS -> 30 FPS at 5x internal resolution Unigine Heaven and Unigine Valley seem to run the same at low quality with no anti-aliasing and no tessellation. SuperTuxKart and 0 A.D. also show no change. It's possible these patches may break something, especially the fourth one. Piglit shows no functionality regressions though they should probably be tested for improvements or breakage with actual applications. 
These patches can also be found on my github: https://github.com/pendingchaos/mesa/tree/nv-xmad-v1 The final changes in shader-db are as follows: total instructions in shared programs : 5256901 -> 5268293 (0.22%) total gprs used in shared programs: 624328 -> 624196 (-0.02%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 20952 -> 20952 (0.00%) local sharedgpr inst bytes helped 0 0 255 680 680 hurt 0 0 128 14841484 Rhys Perry (4): nv50/ir: add preliminary support for OP_XMAD gm107/ir: add support for OP_XMAD on GM107+ nv50/ir: optimize imul/imad to xmads nv50/ir: further optimize multiplication by immediates src/gallium/drivers/nouveau/codegen/nv50_ir.cpp| 3 +- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 14 ++ .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 61 +++ .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 188 +++-- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 20 +++ .../drivers/nouveau/codegen/nv50_ir_target.cpp | 7 +- .../nouveau/codegen/nv50_ir_target_gm107.cpp | 5 + .../nouveau/codegen/nv50_ir_target_nv50.cpp| 5 +- .../nouveau/codegen/nv50_ir_target_nvc0.cpp| 26 ++- src/util/bitscan.h | 26 +++ 10 files changed, 331 insertions(+), 24 deletions(-) -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] gm107/ir: add support for OP_XMAD on GM107+
Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 61 ++ .../nouveau/codegen/nv50_ir_target_gm107.cpp | 6 ++- .../nouveau/codegen/nv50_ir_target_nvc0.cpp| 1 + 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 26826d6360..8ace77aa59 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -155,6 +155,7 @@ private: void emitIMUL(); void emitIMAD(); void emitISCADD(); + void emitXMAD(); void emitIMNMX(); void emitICMP(); void emitISET(); @@ -1881,6 +1882,63 @@ CodeEmitterGM107::emitISCADD() emitGPR (0x08, insn->src(0)); emitGPR (0x00, insn->def(0)); } + +void +CodeEmitterGM107::emitXMAD() +{ + assert(insn->src(0).getFile() == FILE_GPR); + + bool constbuf = false; + bool psl_mrg = true; + bool immediate = false; + if (insn->src(2).getFile() == FILE_MEMORY_CONST) { + assert(insn->src(1).getFile() == FILE_GPR); + constbuf = true; + psl_mrg = false; + emitInsn(0x5100); + emitGPR(0x27, insn->src(1)); + emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(2)); + } else if (insn->src(1).getFile() == FILE_MEMORY_CONST) { + assert(insn->src(2).getFile() == FILE_GPR); + constbuf = true; + emitInsn(0x4e00); + emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1)); + emitGPR(0x27, insn->src(2)); + } else if (insn->src(1).getFile() == FILE_IMMEDIATE) { + assert(insn->src(2).getFile() == FILE_GPR); + assert(!insn->src(1).mod.h1()); + immediate = false; + emitInsn(0x3600); + emitIMMD(0x14, 19, insn->src(1)); + emitGPR(0x27, insn->src(2)); + } else { + assert(insn->src(1).getFile() == FILE_GPR); + assert(insn->src(2).getFile() == FILE_GPR); + emitInsn(0x5b00); + emitGPR(0x14, insn->src(1)); + emitGPR(0x27, insn->src(2)); + } + + if (insn->src(0).mod.sext()) + emitField(0x30, 2, insn->src(1).mod.sext() ? 
3 : 1); + else + emitField(0x30, 2, insn->src(1).mod.sext() ? 2 : 0); + emitField(0x35, 1, insn->src(0).mod.h1()); + if (!immediate) + emitField(constbuf ? 0x34 : 0x23, 1, insn->src(1).mod.h1()); + + if (psl_mrg) { + emitField(constbuf ? 0x37 : 0x24, 1, insn->subOp & NV50_IR_SUBOP_XMAD_PSL ? 1 : 0); + emitField(constbuf ? 0x38 : 0x25, 1, insn->subOp & NV50_IR_SUBOP_XMAD_MRG ? 1 : 0); + } + emitField(0x32, constbuf ? 2 : 3, (insn->subOp >> 2) & 0x7); + + emitX(constbuf ? 0x36 : 0x26); + emitCC(0x2f); + + emitGPR(0x0, insn->def(0)); + emitGPR(0x8, insn->src(0)); +} void CodeEmitterGM107::emitIMNMX() @@ -3253,6 +3311,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) case OP_SHLADD: emitISCADD(); break; + case OP_XMAD: + emitXMAD(); + break; case OP_MIN: case OP_MAX: if (isFloatType(insn->dType)) { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp index 24a1cbb8da..f918fbfdd3 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp @@ -60,8 +60,11 @@ TargetGM107::isOpSupported(operation op, DataType ty) const case OP_SQRT: case OP_DIV: case OP_MOD: - case OP_XMAD: return false; + case OP_XMAD: + if (isFloatType(ty)) + return false; + break; default: break; } @@ -230,6 +233,7 @@ TargetGM107::getLatency(const Instruction *insn) const case OP_SUB: case OP_VOTE: case OP_XOR: + case OP_XMAD: if (insn->dType != TYPE_F64) return 6; break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 66efa0135f..3b96c71f44 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -161,6 +161,7 @@ static const struct opProperties _initPropsGM107[] = { { OP_SUSTP, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, { OP_SUREDB, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, { 
OP_SUREDP, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, + { OP_XMAD,0x0, 0x0, 0x0, 0x0, 0x6, 0x2 }, }; void TargetNVC0::initProps(const struct opProperties *props, int size) -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/4] nv50/ir: Improve Performance of Integer Multiplication
Forgot to CC you. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted
Signed-off-by: Rhys Perry --- src/compiler/glsl/ast_to_hir.cpp | 5 + src/compiler/glsl/glsl_parser_extras.cpp | 1 + src/compiler/glsl/glsl_parser_extras.h | 7 +++ src/mesa/main/extensions_table.h | 1 + src/mesa/main/mtypes.h | 1 + 5 files changed, 15 insertions(+) diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index dd60a2a87f..09ce5a44e6 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -3461,6 +3461,11 @@ apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual, } var->data.image_format = qual->image_format; + } else if (state->has_image_load_formatted()) { + if (var->data.mode == ir_var_uniform && + state->EXT_shader_image_load_formatted_warn) { + _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used"); + } } else { if (var->data.mode == ir_var_uniform) { if (state->es_shader) { diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 04eba980e0..187bc0f18e 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -714,6 +714,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(EXT_separate_shader_objects), EXT(EXT_shader_framebuffer_fetch), EXT(EXT_shader_framebuffer_fetch_non_coherent), + EXT(EXT_shader_image_load_formatted), EXT(EXT_shader_integer_mix), EXT_AEP(EXT_shader_io_blocks), EXT(EXT_shader_samples_identical), diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index 59a173418b..2818cdbb07 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -343,6 +343,11 @@ struct _mesa_glsl_parse_state { return ARB_bindless_texture_enable; } + bool has_image_load_formatted() const + { + return EXT_shader_image_load_formatted_enable; + } + void process_version_directive(YYLTYPE *locp, int version, const char *ident); @@ -790,6 +795,8 @@ struct _mesa_glsl_parse_state 
{ bool EXT_shader_framebuffer_fetch_warn; bool EXT_shader_framebuffer_fetch_non_coherent_enable; bool EXT_shader_framebuffer_fetch_non_coherent_warn; + bool EXT_shader_image_load_formatted_enable; + bool EXT_shader_image_load_formatted_warn; bool EXT_shader_integer_mix_enable; bool EXT_shader_integer_mix_warn; bool EXT_shader_io_blocks_enable; diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 79ef228b69..ac6acbb5ad 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -254,6 +254,7 @@ EXT(EXT_separate_shader_objects , dummy_true EXT(EXT_separate_specular_color , dummy_true , GLL, x , x , x , 1997) EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch , GLL, GLC, x , ES2, 2013) EXT(EXT_shader_framebuffer_fetch_non_coherent, EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC, x, ES2, 2018) +EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted , GLL, GLC, x , x , 2014) EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, x , 30, 2013) EXT(EXT_shader_io_blocks, dummy_true , x , x , x , 31, 2014) EXT(EXT_shader_samples_identical, EXT_shader_samples_identical , GLL, GLC, x , 31, 2015) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 482c42a4b2..4d0fdfe8e7 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -4179,6 +4179,7 @@ struct gl_extensions GLboolean EXT_provoking_vertex; GLboolean EXT_semaphore; GLboolean EXT_semaphore_fd; + GLboolean EXT_shader_image_load_formatted; GLboolean EXT_shader_integer_mix; GLboolean EXT_shader_samples_identical; GLboolean EXT_stencil_two_side; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev