[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry
It's common in some applications to bind a new graphics pipeline without
ending up changing any context registers.

This gives each pipeline two command buffers: one for setting context
registers and one for everything else. The context register command buffer
is only emitted if it differs from the previous pipeline's.

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c |  46 +--
 src/amd/vulkan/radv_pipeline.c   | 217 ---
 src/amd/vulkan/radv_private.h|   2 +
 3 files changed, 150 insertions(+), 115 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f41d6c0b3e7..59903ab64d8 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
*cmd_buffer,
}
 }
 
-static void
+static bool
 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
  struct radv_pipeline *pipeline)
 {
@@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
cmd_buffer->sample_positions_needed = true;
 
if (old_pipeline && num_samples == 
old_pipeline->graphics.ms.num_samples)
-   return;
+   return false;
 
radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
@@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
EVENT_INDEX(0));
}
+
+   return true;
 }
 
 static void
@@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
 }
 
-static void
+static bool
 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
 
if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
-   return;
+   return false;
 
-   radv_update_multisample_state(cmd_buffer, pipeline);
+   bool context_roll = radv_update_multisample_state(cmd_buffer, pipeline);
 
cmd_buffer->scratch_size_needed =
  MAX2(cmd_buffer->scratch_size_needed,
@@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
 
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 
+   if (!cmd_buffer->state.emitted_pipeline ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
pipeline->ctx_cs.cdw ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
pipeline->ctx_cs_hash ||
+   memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
+  pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
+   radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
pipeline->ctx_cs.cdw);
+   context_roll = true;
+   }
+
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
if (!pipeline->shaders[i])
continue;
@@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
cmd_buffer->state.emitted_pipeline = pipeline;
 
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
+
+   return context_roll;
 }
 
 static void
@@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
return;
 
+   assert(!pipeline->ctx_cs.cdw);
+
cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 
pipeline->cs.cdw);
@@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
  * any context registers.
  */
 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
-bool indexed_draw)
+bool indexed_draw,
+bool pipeline_context_roll)
 {
struct radv_cmd_state *state = &cmd_buffer->state;
 
if (!cmd_buffer->device->physical_device->has_scissor_bug)
return false;
 
+   if (pipeline_context_roll)
+   return true;
+
uint32_t used_states = 
cmd_buffer->state.pipeline->graphics.needed_dynamic_state | 
~RADV_CMD_DIRTY_DYNAMIC_ALL;
 
/* Index, vertex and streamout buffers don't change context regs, and
-* pipeline is handle

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry
I did and found small improvements in Rise of the Tomb Raider. I
measured framerates ~104.3% of those measured without the changes for the
Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
for Prophets Tomb.

I found no change with Dota 2 but I've heard it's cpu-bound.

On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  wrote:
>
> Did you benchmark?
>
> On 1/14/19 5:01 PM, Rhys Perry wrote:
> > It's common in some applications to bind a new graphics pipeline without
> > ending up changing any context registers.
> >
> > This has a pipline have two command buffers: one for setting context
> > registers and one for everything else. The context register command buffer
> > is only emitted if it differs from the previous pipeline's.
> >
> > Signed-off-by: Rhys Perry 
> > ---
> >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> >   src/amd/vulkan/radv_private.h|   2 +
> >   3 files changed, 150 insertions(+), 115 deletions(-)
> >
> > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > b/src/amd/vulkan/radv_cmd_buffer.c
> > index f41d6c0b3e7..59903ab64d8 100644
> > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   }
> >   }
> >
> > -static void
> > +static bool
> >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > struct radv_pipeline *pipeline)
> >   {
> > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   cmd_buffer->sample_positions_needed = true;
> >
> >   if (old_pipeline && num_samples == 
> > old_pipeline->graphics.ms.num_samples)
> > - return;
> > + return false;
> >
> >   radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 
> > 2);
> >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> >   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
> > EVENT_INDEX(0));
> >   }
> > +
> > + return true;
> >   }
> >
> >   static void
> > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > *cmd_buffer)
> >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> >   }
> >
> > -static void
> > +static bool
> >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> >   {
> >   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >
> >   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> > - return;
> > + return false;
> >
> > - radv_update_multisample_state(cmd_buffer, pipeline);
> > + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> > pipeline);
> >
> >   cmd_buffer->scratch_size_needed =
> > MAX2(cmd_buffer->scratch_size_needed,
> > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > *cmd_buffer)
> >
> >   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
> >
> > + if (!cmd_buffer->state.emitted_pipeline ||
> > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
> > pipeline->ctx_cs.cdw ||
> > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
> > pipeline->ctx_cs_hash ||
> > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
> > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
> > + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
> > pipeline->ctx_cs.cdw);
> > + context_roll = true;
> > + }
> > +
> >   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
> >   if (!pipeline->shaders[i])
> >   continue;
> > @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > *cmd_buffer)
> >   cmd_buffer->state.emitted_pipeline = pipeline;
> >
> >   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
> > +
> > + return context_roll;

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry
This is with Rise of the Tomb Raider's graphics settings set to "High"
by the way.

On Mon, 14 Jan 2019 at 16:12, Rhys Perry  wrote:
>
> I did and found small improvements in Rise of the Tomb Raider. I
> measured framerates ~104.3% that of without the changes for the
> Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> for Prophets Tomb.
>
> I found no change with Dota 2 but I've heard it's cpu-bound.
>
> On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> wrote:
> >
> > Did you benchmark?
> >
> > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > It's common in some applications to bind a new graphics pipeline without
> > > ending up changing any context registers.
> > >
> > > This has a pipline have two command buffers: one for setting context
> > > registers and one for everything else. The context register command buffer
> > > is only emitted if it differs from the previous pipeline's.
> > >
> > > Signed-off-by: Rhys Perry 
> > > ---
> > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> > >   src/amd/vulkan/radv_private.h|   2 +
> > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > >
> > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > index f41d6c0b3e7..59903ab64d8 100644
> > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   }
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > struct radv_pipeline *pipeline)
> > >   {
> > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   cmd_buffer->sample_positions_needed = true;
> > >
> > >   if (old_pipeline && num_samples == 
> > > old_pipeline->graphics.ms.num_samples)
> > > - return;
> > > + return false;
> > >
> > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> > >   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) 
> > > | EVENT_INDEX(0));
> > >   }
> > > +
> > > + return true;
> > >   }
> > >
> > >   static void
> > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > > *cmd_buffer)
> > >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >   {
> > >   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > >
> > >   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> > > - return;
> > > + return false;
> > >
> > > - radv_update_multisample_state(cmd_buffer, pipeline);
> > > + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> > > pipeline);
> > >
> > >   cmd_buffer->scratch_size_needed =
> > > MAX2(cmd_buffer->scratch_size_needed,
> > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > > *cmd_buffer)
> > >
> > >   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, 
> > > pipeline->cs.cdw);
> > >
> > > + if (!cmd_buffer->state.emitted_pipeline ||
> > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
> > > pipeline->ctx_cs.cdw ||
> > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
> > > pipeline->ctx_cs_hash ||
> > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
> > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
> > >

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry
Sure

On Mon, 14 Jan 2019 at 16:50, Samuel Pitoiset  wrote:
>
> While you are at it, can you experiment with the tracked ctx stuff that
> RadeonSI implements (i.e. SI_TRACKED_XXX)?
>
> This approach will likely be more costly from the CPU side, but it will
> reduce the number of register changes a lot more.
>
> Not sure if that will improve anything though, but I think it's worth
> trying?
>
> On 1/14/19 5:12 PM, Rhys Perry wrote:
> > I did and found small improvements in Rise of the Tomb Raider. I
> > measured framerates ~104.3% that of without the changes for the
> > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > for Prophets Tomb.
> >
> > I found no change with Dota 2 but I've heard it's cpu-bound.
> >
> > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > wrote:
> >> Did you benchmark?
> >>
> >> On 1/14/19 5:01 PM, Rhys Perry wrote:
> >>> It's common in some applications to bind a new graphics pipeline without
> >>> ending up changing any context registers.
> >>>
> >>> This has a pipline have two command buffers: one for setting context
> >>> registers and one for everything else. The context register command buffer
> >>> is only emitted if it differs from the previous pipeline's.
> >>>
> >>> Signed-off-by: Rhys Perry 
> >>> ---
> >>>src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> >>>src/amd/vulkan/radv_pipeline.c   | 217 ---
> >>>src/amd/vulkan/radv_private.h|   2 +
> >>>3 files changed, 150 insertions(+), 115 deletions(-)
> >>>
> >>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> >>> b/src/amd/vulkan/radv_cmd_buffer.c
> >>> index f41d6c0b3e7..59903ab64d8 100644
> >>> --- a/src/amd/vulkan/radv_cmd_buffer.c
> >>> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> >>> @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>}
> >>>}
> >>>
> >>> -static void
> >>> +static bool
> >>>radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> >>>  struct radv_pipeline *pipeline)
> >>>{
> >>> @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>cmd_buffer->sample_positions_needed = true;
> >>>
> >>>if (old_pipeline && num_samples == 
> >>> old_pipeline->graphics.ms.num_samples)
> >>> - return;
> >>> + return false;
> >>>
> >>>radeon_set_context_reg_seq(cmd_buffer->cs, 
> >>> R_028BDC_PA_SC_LINE_CNTL, 2);
> >>>radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> >>> @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> >>>radeon_emit(cmd_buffer->cs, 
> >>> EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> >>>}
> >>> +
> >>> + return true;
> >>>}
> >>>
> >>>static void
> >>> @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> >>> *cmd_buffer)
> >>>radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> >>>}
> >>>
> >>> -static void
> >>> +static bool
> >>>radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> >>>{
> >>>struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >>>
> >>>if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> >>> - return;
> >>> + return false;
> >>>
> >>> - radv_update_multisample_state(cmd_buffer, pipeline);
> >>> + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> >>> pipeline);
> >>>
> >>>cmd_buffer->scratch_size_needed =
> >>>  MAX2(cmd_buffer->scratch_size_needed,
> >>> @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> >>> *cmd_buffer)
> >>>
> >>>radeon_emit_array(c

[Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change

2019-01-15 Thread Rhys Perry
DXVK often sets dynamic state without actually changing it.

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 92 ++--
 1 file changed, 76 insertions(+), 16 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 59903ab64d8..56b3c934c2e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2965,6 +2965,11 @@ void radv_CmdSetViewport(
assert(firstViewport < MAX_VIEWPORTS);
assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
 
+   if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
+   pViewports, viewportCount * sizeof(*pViewports))) {
+   return;
+   }
+
memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
   viewportCount * sizeof(*pViewports));
 
@@ -2984,6 +2989,11 @@ void radv_CmdSetScissor(
assert(firstScissor < MAX_SCISSORS);
assert(total_count >= 1 && total_count <= MAX_SCISSORS);
 
+   if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
+   scissorCount * sizeof(*pScissors))) {
+   return;
+   }
+
memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
   scissorCount * sizeof(*pScissors));
 
@@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth(
float   lineWidth)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (cmd_buffer->state.dynamic.line_width == lineWidth)
+   return;
+
cmd_buffer->state.dynamic.line_width = lineWidth;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
 }
@@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias(
float   depthBiasSlopeFactor)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
-   cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
-   cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
+   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
+   state->dynamic.depth_bias.clamp == depthBiasClamp &&
+   state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
+   return;
+   }
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
+   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
+   state->dynamic.depth_bias.clamp = depthBiasClamp;
+   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
+
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
 }
 
 void radv_CmdSetBlendConstants(
@@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants(
const float blendConstants[4])
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   memcpy(cmd_buffer->state.dynamic.blend_constants,
-  blendConstants, sizeof(float) * 4);
+   if (!memcmp(state->dynamic.blend_constants, blendConstants, 
sizeof(float) * 4))
+   return;
+
+   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 
4);
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
 }
 
 void radv_CmdSetDepthBounds(
@@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds(
float   maxDepthBounds)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
-   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+   if (state->dynamic.depth_bounds.min == minDepthBounds &&
+   state->dynamic.depth_bounds.max == maxDepthBounds) {
+   return;
+   }
+
+   state->dynamic.depth_bounds.min = minDepthBounds;
+   state->dynamic.depth_bounds.max = maxDepthBounds;
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
 }
 
 void radv_CmdSetStencilCompareMask(
@@ -3045,13 +3075,21 @@ void radv_CmdSetStencilCompareMask(
uint32_tcompareMask)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
+   bool front_same = state->dynamic.stencil_compare_mask.front == 
compareMask;
+   bool back_same = state->dyn

Re: [Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change

2019-01-15 Thread Rhys Perry
I misread some code and forgot to remove it.

It was always unrelated to this patch.

On Wed, 16 Jan 2019 at 00:22, Bas Nieuwenhuizen  
wrote:
>
> On Tue, Jan 15, 2019 at 10:59 PM Rhys Perry  wrote:
> >
> > DXVK often sets dynamic state without actually changing it.
> >
> > Signed-off-by: Rhys Perry 
> > ---
> >  src/amd/vulkan/radv_cmd_buffer.c | 92 ++--
> >  1 file changed, 76 insertions(+), 16 deletions(-)
> >
> > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > b/src/amd/vulkan/radv_cmd_buffer.c
> > index 59903ab64d8..56b3c934c2e 100644
> > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport(
> > assert(firstViewport < MAX_VIEWPORTS);
> > assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
> >
> > +   if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
> > +   pViewports, viewportCount * sizeof(*pViewports))) {
> > +   return;
> > +   }
> > +
> > memcpy(state->dynamic.viewport.viewports + firstViewport, 
> > pViewports,
> >viewportCount * sizeof(*pViewports));
> >
> > @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor(
> > assert(firstScissor < MAX_SCISSORS);
> > assert(total_count >= 1 && total_count <= MAX_SCISSORS);
> >
> > +   if (!memcmp(state->dynamic.scissor.scissors + firstScissor, 
> > pScissors,
> > +   scissorCount * sizeof(*pScissors))) {
> > +   return;
> > +   }
> > +
> > memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
> >scissorCount * sizeof(*pScissors));
> >
> > @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth(
> > float   lineWidth)
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +
> > +   if (cmd_buffer->state.dynamic.line_width == lineWidth)
> > +   return;
> > +
> > cmd_buffer->state.dynamic.line_width = lineWidth;
> > cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
> >  }
> > @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias(
> > float   depthBiasSlopeFactor)
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +   struct radv_cmd_state *state = &cmd_buffer->state;
> >
> > -   cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
> > -   cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
> > -   cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
> > +   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
> > +   state->dynamic.depth_bias.clamp == depthBiasClamp &&
> > +   state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
> > +   return;
> > +   }
> >
> > -   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
> > +   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
> > +   state->dynamic.depth_bias.clamp = depthBiasClamp;
> > +   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
> > +
> > +   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
> >  }
> >
> >  void radv_CmdSetBlendConstants(
> > @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants(
> > const float blendConstants[4])
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +   struct radv_cmd_state *state = &cmd_buffer->state;
> >
> > -   memcpy(cmd_buffer->state.dynamic.blend_constants,
> > -  blendConstants, sizeof(float) * 4);
> > +   if (!memcmp(state->dynamic.blend_constants, blendConstants, 
> > sizeof(float) * 4))
> > +   return;
> > +
> > +   memcpy(state->dynamic.blend_constants, blendConstants, 
> > sizeof(float) * 4);
> >
> > -   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
> > +   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
> >  }
> >
> >  void radv_CmdSetDepthBounds(
> > @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds(
> > float   maxDepthB

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-15 Thread Rhys Perry
I did a before/after comparison during development with multiple runs
but only 1 before and after run to produce the numbers I sent. They
seemed to match up well enough to the runs during development, so I
wasn't too concerned.

IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
The kernel/distro was 4.19.13 and Fedora 29. Also
"/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
"performance" and
"/sys/class/drm/card*/device/power_dpm_force_performance_level" was
set to "high" while running.

I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
get anything too different.

On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
wrote:
>
> On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  wrote:
> >
> > I did and found small improvements in Rise of the Tomb Raider. I
> > measured framerates ~104.3% that of without the changes for the
> > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > for Prophets Tomb.
>
> My main question would be what the statistical significance is.  e.g.
> did you do one run of each, did you do multiple, and what was your
> test setup?
>
> Just curious because I have tried the exact same thing before and
> could not find anything more than noise.
>
> >
> > I found no change with Dota 2 but I've heard it's cpu-bound.
> >
> > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > wrote:
> > >
> > > Did you benchmark?
> > >
> > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > It's common in some applications to bind a new graphics pipeline without
> > > > ending up changing any context registers.
> > > >
> > > > This has a pipline have two command buffers: one for setting context
> > > > registers and one for everything else. The context register command 
> > > > buffer
> > > > is only emitted if it differs from the previous pipeline's.
> > > >
> > > > Signed-off-by: Rhys Perry 
> > > > ---
> > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> > > >   src/amd/vulkan/radv_private.h|   2 +
> > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > >
> > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   }
> > > >   }
> > > >
> > > > -static void
> > > > +static bool
> > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > struct radv_pipeline *pipeline)
> > > >   {
> > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   cmd_buffer->sample_positions_needed = true;
> > > >
> > > >   if (old_pipeline && num_samples == 
> > > > old_pipeline->graphics.ms.num_samples)
> > > > - return;
> > > > + return false;
> > > >
> > > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> > > >   radeon_emit(cmd_buffer->cs, 
> > > > EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> > > >   }
> > > > +
> > > > + return true;
> > > >   }
> > > >
> > > >   static void
> > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > > > *cmd_buffer)
> > > >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> > > >   }
> > > >
> > > > -static void
> > > > +static bool
> > > >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > > >   {
> > > >   struct radv_pipeline *pipeline = 

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-16 Thread Rhys Perry
Rise of the Tomb Raider from without to with the change (average of 3 runs):
SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%)
ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%)
GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%)

So not much improvement (if any).

On Wed, 16 Jan 2019 at 00:39, Rhys Perry  wrote:
>
> I did a before/after comparison during development with multiple runs
> but only 1 before and after run to produce the numbers I sent. They
> seemed to match up well enough to the runs during development, so I
> wasn't too concerned.
>
> IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
> The kernel/distro was 4.19.13 and Fedora 29. Also
> "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
> "performance" and
> "/sys/class/drm/card*/device/power_dpm_force_performance_level" was
> set to "high" while running.
>
> I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
> get anything too different.
>
> On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
> wrote:
> >
> > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  wrote:
> > >
> > > I did and found small improvements in Rise of the Tomb Raider. I
> > > measured framerates ~104.3% that of without the changes for the
> > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > > for Prophets Tomb.
> >
> > My main question would be what the statistical significance is.  e.g.
> > did you do one run of each, did you do multiple, and what was your
> > test setup?
> >
> > Just curious because I have tried the exact same thing before and
> > could not find anything more than noise.
> >
> > >
> > > I found no change with Dota 2 but I've heard it's cpu-bound.
> > >
> > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > > wrote:
> > > >
> > > > Did you benchmark?
> > > >
> > > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > > It's common in some applications to bind a new graphics pipeline 
> > > > > without
> > > > > ending up changing any context registers.
> > > > >
> > > > > This has a pipline have two command buffers: one for setting context
> > > > > registers and one for everything else. The context register command 
> > > > > buffer
> > > > > is only emitted if it differs from the previous pipeline's.
> > > > >
> > > > > Signed-off-by: Rhys Perry 
> > > > > ---
> > > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > > >   src/amd/vulkan/radv_pipeline.c   | 217 
> > > > > ---
> > > > >   src/amd/vulkan/radv_private.h|   2 +
> > > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > > >
> > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   }
> > > > >   }
> > > > >
> > > > > -static void
> > > > > +static bool
> > > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > > struct radv_pipeline *pipeline)
> > > > >   {
> > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   cmd_buffer->sample_positions_needed = true;
> > > > >
> > > > >   if (old_pipeline && num_samples == 
> > > > > old_pipeline->graphics.ms.num_samples)
> > > > > - return;
> > > > > + return false;
> > > > >
> > > > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > > > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   radeon_emit(cmd_buffer-&g

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-16 Thread Rhys Perry
Seems I accidentally had it use Fedora 29's mesa build in both the
before and after runs...
Running again I get (again, average of 3 runs):
GeothermalValley: 58.2 fps -> 59.633 fps (+2.5%)
ProphetsTomb: 59 fps -> 60 fps (+1.7%)
SpineOfTheMountain: 64 fps -> 64.06667 fps (+0.1%) (1 extreme from
"before" run excluded)

Sorry for the noise.


On Wed, 16 Jan 2019 at 11:46, Rhys Perry  wrote:
>
> Rise of the Tomb Raider, without vs. with the change (average of 3 runs):
> SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%)
> ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%)
> GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%)
>
> So not much improvement (if any).
>
> On Wed, 16 Jan 2019 at 00:39, Rhys Perry  wrote:
> >
> > I did a before/after comparison during development with multiple runs
> > but only 1 before and after run to produce the numbers I sent. They
> > seemed to match up well enough to the runs during development, so I
> > wasn't too concerned.
> >
> > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
> > The kernel/distro was 4.19.13 and Fedora 29. Also
> > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
> > "performance" and
> > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was
> > set to "high" while running.
> >
> > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
> > get anything too different.
> >
> > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
> > wrote:
> > >
> > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  
> > > wrote:
> > > >
> > > > I did and found small improvements in Rise of the Tomb Raider. I
> > > > > measured framerates at ~104.3% of those without the changes for the
> > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > > > for Prophets Tomb.
> > >
> > > My main question would be what the statistical significance is.  e.g.
> > > did you do one run of each, did you do multiple, and what was your
> > > test setup?
> > >
> > > Just curious because I have tried the exact same thing before and
> > > could not find anything more than noise.
> > >
> > > >
> > > > I found no change with Dota 2 but I've heard it's cpu-bound.
> > > >
> > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset 
> > > >  wrote:
> > > > >
> > > > > Did you benchmark?
> > > > >
> > > > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > > > It's common in some applications to bind a new graphics pipeline 
> > > > > > without
> > > > > > ending up changing any context registers.
> > > > > >
> > > > > > > This makes a pipeline have two command buffers: one for setting context
> > > > > > registers and one for everything else. The context register command 
> > > > > > buffer
> > > > > > is only emitted if it differs from the previous pipeline's.
> > > > > >
> > > > > > Signed-off-by: Rhys Perry 
> > > > > > ---
> > > > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > > > >   src/amd/vulkan/radv_pipeline.c   | 217 
> > > > > > ---
> > > > > >   src/amd/vulkan/radv_private.h|   2 +
> > > > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > > > >
> > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > > > radv_cmd_buffer *cmd_buffer,
> > > > > >   }
> > > > > >   }
> > > > > >
> > > > > > -static void
> > > > > > +static bool
> > > > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > > > struct radv_pipeline *pipeline)
> > > > > >   {
> > > > > > @@ -646,7 +646,7 @@ radv_update_multisamp

[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry
v3: rebase

Signed-off-by: Rhys Perry 
Reviewed-by: Marek Olšák  (v2)
---
 src/mesa/state_tracker/st_extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c
index 4628079260..b713eed969 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -717,6 +717,7 @@ void st_init_extensions(struct pipe_screen *screen,
   { o(ARB_shader_clock), PIPE_CAP_TGSI_CLOCK   
},
   { o(ARB_shader_draw_parameters),   PIPE_CAP_DRAW_PARAMETERS  
},
   { o(ARB_shader_group_vote),PIPE_CAP_TGSI_VOTE
},
+  { o(EXT_shader_image_load_formatted),  PIPE_CAP_IMAGE_LOAD_FORMATTED 
},
   { o(ARB_shader_stencil_export),PIPE_CAP_SHADER_STENCIL_EXPORT
},
   { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS
},
   { o(ARB_shader_texture_lod),   PIPE_CAP_SM3  
},
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 1/5] gallium: add support for formatted image loads

2019-01-16 Thread Rhys Perry
v3: rebase
v3: make use of u_pipe_screen_get_param_defaults

Signed-off-by: Rhys Perry 
---
 src/gallium/auxiliary/util/u_screen.c  | 1 +
 src/gallium/docs/source/screen.rst | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 +
 src/gallium/drivers/swr/swr_screen.cpp | 1 +
 src/gallium/drivers/vc4/vc4_screen.c   | 1 +
 src/gallium/include/pipe/p_defines.h   | 1 +
 8 files changed, 8 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_screen.c 
b/src/gallium/auxiliary/util/u_screen.c
index c14edde859..470632f5ec 100644
--- a/src/gallium/auxiliary/util/u_screen.c
+++ b/src/gallium/auxiliary/util/u_screen.c
@@ -314,6 +314,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen 
*pscreen,
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_TGSI_ATOMFADD:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_MAX_GS_INVOCATIONS:
diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index 9b75a407db..b2d0c401d5 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -483,6 +483,7 @@ The integer capabilities:
 * ``PIPE_CAP_TGSI_ATOMFADD``: Atomic floating point adds are supported on
   images, buffers, and shared memory.
 * ``PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND``: True if the driver needs blend 
state to use zero/one instead of destination alpha for RGB/XRGB formats.
+* ``PIPE_CAP_IMAGE_LOAD_FORMATTED``: True if a format for image loads does not 
need to be specified in the shader IR
 
 .. _pipe_capf:
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c 
b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 2b69a8f696..d6e0f43f6c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -243,6 +243,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_MAX_GS_INVOCATIONS:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c 
b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index d83926f2b1..ff92012894 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -310,6 +310,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
case PIPE_CAP_TGSI_ATOMFADD:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index f5f3cf..b7cf2cd2e4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -334,6 +334,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index de9008ddf6..38b76366cb 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -364,6 +364,7 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
case PIPE_CAP_MAX_GS_INVOCATIONS:
   return 32;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c 
b/src/gallium/drivers/vc4/vc4_screen.c
index e7f7c82c27..22de60f02c 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -293,6 +293,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen,
 case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
 case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
 case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+case PIPE_SHADER_CAP_IMAGE_LOAD_FORMATTED:
 return 0;
 case PIPE_SHADER_CAP_SCALAR_ISA:
 return 1;
diff --git a/src/gallium/include/pipe/p_defines.h 
b/src/gallium/include/pipe/p_defines.h
index ae53c723c7..5c0652d7a9 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -854,6 +854,7 @@ enum pipe_cap
PIPE_CAP_TGSI_ATOMFADD

[Mesa-dev] [PATCH v3 0/5] nvc0: Implement EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry
This patch series implements EXT_shader_image_load_formatted on Maxwell+.

It should implement all of the spec except, if the extension is enabled,
passing image variables without a format qualifier to atomic operations
will not raise a compilation error like it should.

This is because knowing the format used in an image operation before
function inlining can be difficult, because formats don't have to (and
currently can't) be specified in the parameter declaration. So this series
leaves this issue to hopefully be resolved in a later patch.

I tested the second version of this series when it was released in June
2018 but I can't easily test this version. Nothing changed too much though
so it should be fine.

v2: change from PIPE_SHADER_CAP_* to PIPE_CAP_*
v2: fix broken feature detection in the state tracker
v2: move code in AlgebraicOpt::handleSULDP() to nv50_ir_ra.cpp
v3: rebase
v3: make use of u_pipe_screen_get_param_defaults
v3: move RA code into its own function

Rhys Perry (5):
  gallium: add support for formatted image loads
  mesa,glsl: add support for EXT_shader_image_load_formatted
  st/mesa: add support for EXT_shader_image_load_formatted
  nv50/ir: use suld.p on GM107+
  nvc0,nv50/ir: enable support for formatted image loads on GM107+

 src/compiler/glsl/ast_to_hir.cpp  |  5 +++
 src/compiler/glsl/glsl_parser_extras.cpp  |  1 +
 src/compiler/glsl/glsl_parser_extras.h|  7 
 src/gallium/auxiliary/util/u_screen.c |  1 +
 src/gallium/docs/source/screen.rst|  1 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  4 +++
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 ---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp |  3 +-
 .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++
 .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 +
 .../drivers/nouveau/nv30/nv30_screen.c|  1 +
 .../drivers/nouveau/nv50/nv50_screen.c|  1 +
 .../drivers/nouveau/nvc0/nvc0_screen.c|  2 ++
 src/gallium/drivers/swr/swr_screen.cpp|  1 +
 src/gallium/drivers/vc4/vc4_screen.c  |  1 +
 src/gallium/include/pipe/p_defines.h  |  1 +
 src/mesa/main/extensions_table.h  |  1 +
 src/mesa/main/mtypes.h|  1 +
 src/mesa/state_tracker/st_extensions.c|  1 +
 19 files changed, 100 insertions(+), 14 deletions(-)

-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 5/5] nvc0, nv50/ir: enable support for formatted image loads on GM107+

2019-01-16 Thread Rhys Perry
v3: rebase

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +--
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c| 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 295497be2f..6c134962b4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2414,12 +2414,11 @@ 
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
   bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
 TYPE_U32, bld.mkImm(0),
 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
-   if (su->op != OP_SUSTP && su->tex.format) {
+   if (su->op != OP_SUSTP && su->tex.format && su->tex.format->components > 0) 
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int blockwidth = format->bits[0] + format->bits[1] +
format->bits[2] + format->bits[3];
 
-  assert(format->components != 0);
   // make sure that the format doesn't mismatch when it's not FMT_NONE
   bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index b7cf2cd2e4..c47502cae1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -288,6 +288,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
   return class_3d >= GM200_3D_CLASS;
case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
   return class_3d >= GP100_3D_CLASS;
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
+  return class_3d >= GM107_3D_CLASS;
 
/* unsupported caps */
case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
@@ -334,7 +336,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
-   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry
v3: rebase

Signed-off-by: Rhys Perry 
Reviewed-by: Marek Olšák  (v2)
---
 src/compiler/glsl/ast_to_hir.cpp | 5 +
 src/compiler/glsl/glsl_parser_extras.cpp | 1 +
 src/compiler/glsl/glsl_parser_extras.h   | 7 +++
 src/mesa/main/extensions_table.h | 1 +
 src/mesa/main/mtypes.h   | 1 +
 5 files changed, 15 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 67a5a8c050..d9a57d37f6 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3476,6 +3476,11 @@ apply_image_qualifier_to_variable(const struct 
ast_type_qualifier *qual,
   }
 
   var->data.image_format = qual->image_format;
+   } else if (state->has_image_load_formatted()) {
+  if (var->data.mode == ir_var_uniform &&
+  state->EXT_shader_image_load_formatted_warn) {
+ _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used");
+  }
} else {
   if (var->data.mode == ir_var_uniform) {
  if (state->es_shader) {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp 
b/src/compiler/glsl/glsl_parser_extras.cpp
index 2048a7f900..1e035e94d8 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -721,6 +721,7 @@ static const _mesa_glsl_extension 
_mesa_glsl_supported_extensions[] = {
EXT(EXT_separate_shader_objects),
EXT(EXT_shader_framebuffer_fetch),
EXT(EXT_shader_framebuffer_fetch_non_coherent),
+   EXT(EXT_shader_image_load_formatted),
EXT(EXT_shader_implicit_conversions),
EXT(EXT_shader_integer_mix),
EXT_AEP(EXT_shader_io_blocks),
diff --git a/src/compiler/glsl/glsl_parser_extras.h 
b/src/compiler/glsl/glsl_parser_extras.h
index b17b5125e0..63a5cca5d2 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -344,6 +344,11 @@ struct _mesa_glsl_parse_state {
   return ARB_bindless_texture_enable;
}
 
+   bool has_image_load_formatted() const
+   {
+  return EXT_shader_image_load_formatted_enable;
+   }
+
bool has_implicit_conversions() const
{
   return EXT_shader_implicit_conversions_enable || is_version(120, 0);
@@ -816,6 +821,8 @@ struct _mesa_glsl_parse_state {
bool EXT_shader_framebuffer_fetch_warn;
bool EXT_shader_framebuffer_fetch_non_coherent_enable;
bool EXT_shader_framebuffer_fetch_non_coherent_warn;
+   bool EXT_shader_image_load_formatted_enable;
+   bool EXT_shader_image_load_formatted_warn;
bool EXT_shader_implicit_conversions_enable;
bool EXT_shader_implicit_conversions_warn;
bool EXT_shader_integer_mix_enable;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index dad38124d5..c3eb019f81 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -264,6 +264,7 @@ EXT(EXT_separate_shader_objects , dummy_true
 EXT(EXT_separate_specular_color , dummy_true   
  , GLL,  x ,  x ,  x , 1997)
 EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch 
  , GLL, GLC,  x , ES2, 2013)
 EXT(EXT_shader_framebuffer_fetch_non_coherent, 
EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC,  x, ES2, 2018)
+EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted  
  , GLL, GLC,  x ,  x , 2014)
 EXT(EXT_shader_implicit_conversions , dummy_true   
  ,  x ,  x ,  x ,  31, 2013)
 EXT(EXT_shader_integer_mix  , EXT_shader_integer_mix   
  , GLL, GLC,  x ,  30, 2013)
 EXT(EXT_shader_io_blocks, dummy_true   
  ,  x ,  x ,  x ,  31, 2014)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 241c2b92f7..bd90727e26 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -4264,6 +4264,7 @@ struct gl_extensions
GLboolean EXT_render_snorm;
GLboolean EXT_semaphore;
GLboolean EXT_semaphore_fd;
+   GLboolean EXT_shader_image_load_formatted;
GLboolean EXT_shader_integer_mix;
GLboolean EXT_shader_samples_identical;
GLboolean EXT_stencil_two_side;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 4/5] nv50/ir: use suld.p on GM107+

2019-01-16 Thread Rhys Perry
v3: rebase
v3: move RA code into its own function

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  4 +++
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 ---
 .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++
 .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 +
 4 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 8085bb2f54..2388f3923c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -224,6 +224,10 @@ enum operation
 #define NV50_IR_SUBOP_SULD_ZERO0
 #define NV50_IR_SUBOP_SULD_TRAP1
 #define NV50_IR_SUBOP_SULD_SDCL3
+// These three are only for GM107+ and are set during register allocation
+#define NV50_IR_SUBOP_SULDP_RGBA   (0 << 2)
+#define NV50_IR_SUBOP_SULDP_RG (1 << 2)
+#define NV50_IR_SUBOP_SULDP_R  (2 << 2)
 #define NV50_IR_SUBOP_SUBFM_3D 1
 #define NV50_IR_SUBOP_SUCLAMP_2D   0x10
 #define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 0x10 : 0))
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index be00db3131..d7f4380b34 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -3257,26 +3257,36 @@ void
 CodeEmitterGM107::emitSULDx()
 {
const TexInstruction *insn = this->insn->asTex();
-   int type = 0;
 
emitInsn(0xeb00);
if (insn->op == OP_SULDB)
   emitField(0x34, 1, 1);
emitSUTarget();
 
-   switch (insn->dType) {
-   case TYPE_S8:   type = 1; break;
-   case TYPE_U16:  type = 2; break;
-   case TYPE_S16:  type = 3; break;
-   case TYPE_U32:  type = 4; break;
-   case TYPE_U64:  type = 5; break;
-   case TYPE_B128: type = 6; break;
-   default:
-  assert(insn->dType == TYPE_U8);
-  break;
+   if (insn->op == OP_SULDB) {
+  int type = 0;
+  switch (insn->dType) {
+  case TYPE_S8:   type = 1; break;
+  case TYPE_U16:  type = 2; break;
+  case TYPE_S16:  type = 3; break;
+  case TYPE_U32:  type = 4; break;
+  case TYPE_U64:  type = 5; break;
+  case TYPE_B128: type = 6; break;
+  default:
+ assert(insn->dType == TYPE_U8);
+ break;
+  }
+  emitField(0x14, 3, type);
+   } else {
+  int type = 0;
+  switch (insn->subOp & 0xc) {
+  case NV50_IR_SUBOP_SULDP_R:type = 0x1; break;
+  case NV50_IR_SUBOP_SULDP_RG:   type = 0x3; break;
+  case NV50_IR_SUBOP_SULDP_RGBA: type = 0xf; break;
+  }
+  emitField(0x14, 4, type);
}
emitLDSTc(0x18);
-   emitField(0x14, 3, type);
emitGPR  (0x00, insn->def(0));
emitGPR  (0x08, insn->src(0));
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 5dcbf3c3e0..43011c23af 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -246,6 +246,16 @@ static const char *xmadOpCModeStr[] =
"clo", "chi", "csfu", "cbcc"
 };
 
+static const char *suldOpStr[] =
+{
+   "zero", "trap", "sdcl"
+};
+
+static const char *suldSwizzleOpStr[] =
+{
+   "rgba", "rg", "r"
+};
+
 static const char *DataTypeStr[] =
 {
"-",
@@ -672,6 +682,13 @@ void Instruction::print() const
 PRINT("h%d ", (subOp & NV50_IR_SUBOP_XMAD_H1(i)) ? 1 : 0);
  break;
   }
+  case OP_SULDB:
+  case OP_SULDP:
+ if ((subOp & 0x3) < ARRAY_SIZE(suldOpStr))
+PRINT("%s ", suldOpStr[subOp & 0x3]);
+ if (op == OP_SULDP && subOp >> 2 < (int)ARRAY_SIZE(suldSwizzleOpStr))
+PRINT("%s ", suldSwizzleOpStr[subOp >> 2]);
+ break;
   default:
  if (subOp)
 PRINT("(SUBOP:%u) ", subOp);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 322b79fe62..8e57bda254 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -264,6 +264,7 @@ private:
 
   void addHazard(Instruction *i, const ValueRef *src);
   void textureMask(TexInstruction *);
+  void suldpMask(TexInstruction *);
   void addConstraint(Instruction *, int s, int n);
   bool detectConflict(Instruction *, int s);
 
@@ -1996,6 +1997,33 @@ 
RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
   tex->setDef(c, NULL);
 }
 
+void
+RegAlloc::InsertConstraintsPass::suldpMask(TexInstruction *tex)
+{
+   int max = 0;
+   

[Mesa-dev] [PATCH 1/2] radv: pass radv_draw_info to radv_emit_draw_registers()

2019-01-19 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 118 +++
 1 file changed, 58 insertions(+), 60 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f41d6c0b3e7..f430b4f20dd 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2074,10 +2074,60 @@ radv_upload_graphics_shader_descriptors(struct 
radv_cmd_buffer *cmd_buffer, bool
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 }
 
+struct radv_draw_info {
+   /**
+* Number of vertices.
+*/
+   uint32_t count;
+
+   /**
+* Index of the first vertex.
+*/
+   int32_t vertex_offset;
+
+   /**
+* First instance id.
+*/
+   uint32_t first_instance;
+
+   /**
+* Number of instances.
+*/
+   uint32_t instance_count;
+
+   /**
+* First index (indexed draws only).
+*/
+   uint32_t first_index;
+
+   /**
+* Whether it's an indexed draw.
+*/
+   bool indexed;
+
+   /**
+* Indirect draw parameters resource.
+*/
+   struct radv_buffer *indirect;
+   uint64_t indirect_offset;
+   uint32_t stride;
+
+   /**
+* Draw count parameters resource.
+*/
+   struct radv_buffer *count_buffer;
+   uint64_t count_buffer_offset;
+
+   /**
+* Stream output parameters resource.
+*/
+   struct radv_buffer *strmout_buffer;
+   uint64_t strmout_buffer_offset;
+};
+
 static void
-radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw,
-bool instanced_draw, bool indirect_draw,
-uint32_t draw_vertex_count)
+radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
+const struct radv_draw_info *draw_info)
 {
struct radeon_info *info = 
&cmd_buffer->device->physical_device->rad_info;
struct radv_cmd_state *state = &cmd_buffer->state;
@@ -2087,8 +2137,9 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer, bool indexed_draw,
 
/* Draw state. */
ia_multi_vgt_param =
-   si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
- indirect_draw, draw_vertex_count);
+   si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count 
> 1,
+ draw_info->indirect,
+ draw_info->indirect ? 0 : 
draw_info->count);
 
if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
if (info->chip_class >= GFX9) {
@@ -2108,7 +2159,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer, bool indexed_draw,
 
/* Primitive restart. */
primitive_reset_en =
-   indexed_draw && state->pipeline->graphics.prim_restart_enable;
+   draw_info->indexed && 
state->pipeline->graphics.prim_restart_enable;
 
if (primitive_reset_en != state->last_primitive_reset_en) {
state->last_primitive_reset_en = primitive_reset_en;
@@ -3411,57 +3462,6 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer 
*cmd_buffer,
}
 }
 
-struct radv_draw_info {
-   /**
-* Number of vertices.
-*/
-   uint32_t count;
-
-   /**
-* Index of the first vertex.
-*/
-   int32_t vertex_offset;
-
-   /**
-* First instance id.
-*/
-   uint32_t first_instance;
-
-   /**
-* Number of instances.
-*/
-   uint32_t instance_count;
-
-   /**
-* First index (indexed draws only).
-*/
-   uint32_t first_index;
-
-   /**
-* Whether it's an indexed draw.
-*/
-   bool indexed;
-
-   /**
-* Indirect draw parameters resource.
-*/
-   struct radv_buffer *indirect;
-   uint64_t indirect_offset;
-   uint32_t stride;
-
-   /**
-* Draw count parameters resource.
-*/
-   struct radv_buffer *count_buffer;
-   uint64_t count_buffer_offset;
-
-   /**
-* Stream output parameters resource.
-*/
-   struct radv_buffer *strmout_buffer;
-   uint64_t strmout_buffer_offset;
-};
-
 static void
 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
   const struct radv_draw_info *info)
@@ -3672,9 +3672,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer 
*cmd_buffer,
 
radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
 
-   radv_emit_draw_registers(cmd_buffer, info->indexed,
-info->instance_count > 1, info->indirect,
-info->indirect ? 0 : info->count);
+   radv_emit_draw_registers(cmd_

[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-19 Thread Rhys Perry
It's common in some applications to bind a new graphics pipeline without
ending up changing any context registers.

This makes a pipeline have two command buffers: one for setting context
registers and one for everything else. The context register command buffer
is only emitted if it differs from the previous pipeline's.

v2: ensure late scissor emission is done when radv_emit_rbplus_state() is
called
v2: make use of cmd_buffer->state.workaround_scissor_bug

Signed-off-by: Rhys Perry 
---
This second version depends on the patch "radv: add missed situations for
scissor bug workaround".

 src/amd/vulkan/radv_cmd_buffer.c |  30 -
 src/amd/vulkan/radv_pipeline.c   | 217 ---
 src/amd/vulkan/radv_private.h|   2 +
 3 files changed, 141 insertions(+), 108 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 6d538d7e88a..f406a3a42f3 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
EVENT_INDEX(0));
}
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -857,10 +859,13 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << 
(i * 4);
sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << 
(i * 4);
}
+   /* TODO: avoid redundantly setting context registers */
radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 
3);
radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -884,6 +889,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
 
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 
+   if (!cmd_buffer->state.emitted_pipeline ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
pipeline->ctx_cs.cdw ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
pipeline->ctx_cs_hash ||
+   memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
+  pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
+   radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
pipeline->ctx_cs.cdw);
+   cmd_buffer->state.workaround_scissor_bug = true;
+   }
+
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
if (!pipeline->shaders[i])
continue;
@@ -2939,6 +2953,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
return;
 
+   assert(!pipeline->ctx_cs.cdw);
+
cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 
pipeline->cs.cdw);
@@ -3630,20 +3646,16 @@ static bool radv_need_late_scissor_emission(struct 
radv_cmd_buffer *cmd_buffer,
uint32_t used_states = 
cmd_buffer->state.pipeline->graphics.needed_dynamic_state | 
~RADV_CMD_DIRTY_DYNAMIC_ALL;
 
/* Index, vertex and streamout buffers don't change context regs, and
-* pipeline is handled later.
+* pipeline is already handled.
 */
used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
 RADV_CMD_DIRTY_VERTEX_BUFFER |
 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
 RADV_CMD_DIRTY_PIPELINE);
 
-   /* Assume all state changes except  these two can imply context rolls. 
*/
if (cmd_buffer->state.dirty & used_states)
return true;
 
-   if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
-   return true;
-
if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
(state->index_type ? 0xffffffffu : 0xffffu) != 
state->last_primitive_reset_index)
return true;
@@ -3655,7 +3667,7 @@ static void
 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
  const struct radv_draw_info *info)
 {
-   bool late_scissor_emission = 
radv_need_late_scissor_emission(cmd_buffer, info);
+   bool late_scissor_emission;
 
if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipel

[Mesa-dev] [PATCH 2/2] radv: add missed situations for scissor bug workaround

2019-01-19 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 65 
 src/amd/vulkan/radv_private.h|  2 +
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f430b4f20dd..6d538d7e88a 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -920,6 +920,8 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
  cmd_buffer->state.dynamic.scissor.scissors,
  cmd_buffer->state.dynamic.viewport.viewports,
  
cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
+
+   cmd_buffer->state.workaround_scissor_bug = false;
 }
 
 static void
@@ -1217,6 +1219,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer 
*cmd_buffer,
radv_update_zrange_precision(cmd_buffer, &att->ds, image,
 layout, false);
}
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 /**
@@ -1442,6 +1446,8 @@ radv_update_bound_fast_clear_color(struct radv_cmd_buffer 
*cmd_buffer,
radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx 
* 0x3c, 2);
radeon_emit(cs, color_values[0]);
radeon_emit(cs, color_values[1]);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 /**
@@ -1704,6 +1710,8 @@ void radv_set_db_count_control(struct radv_cmd_buffer 
*cmd_buffer)
}
 
radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, 
db_count_control);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -2185,6 +2193,27 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer,
state->last_primitive_reset_index = 
primitive_reset_index;
}
}
+
+   if (draw_info->strmout_buffer) {
+   uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
+
+   va += draw_info->strmout_buffer->offset +
+ draw_info->strmout_buffer_offset;
+
+   radeon_set_context_reg(cs, 
R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
+  draw_info->stride);
+
+   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
+   COPY_DATA_DST_SEL(COPY_DATA_REG) |
+   COPY_DATA_WR_CONFIRM);
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
+   radeon_emit(cs, 
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
+   radeon_emit(cs, 0); /* unused */
+
+   radv_cs_add_buffer(cmd_buffer->device->ws, cs, 
draw_info->strmout_buffer->bo);
+   }
 }
 
 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
@@ -3470,27 +3499,6 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
struct radeon_winsys *ws = cmd_buffer->device->ws;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
-   if (info->strmout_buffer) {
-   uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo);
-
-   va += info->strmout_buffer->offset +
- info->strmout_buffer_offset;
-
-   radeon_set_context_reg(cs, 
R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
-  info->stride);
-
-   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
-   COPY_DATA_DST_SEL(COPY_DATA_REG) |
-   COPY_DATA_WR_CONFIRM);
-   radeon_emit(cs, va);
-   radeon_emit(cs, va >> 32);
-   radeon_emit(cs, 
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
-   radeon_emit(cs, 0); /* unused */
-
-   radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo);
-   }
-
if (info->indirect) {
uint64_t va = radv_buffer_get_va(info->indirect->bo);
uint64_t count_va = 0;
@@ -3609,13 +3617,16 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
  * any context registers.
  */
 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
-bool indexed_draw)
+const struct radv_draw_info *info)
 {
struct radv_cmd_state *state = &cmd_buffer->state;
 
if (!cmd_buffer->device->physical_device->has_scissor_bug)
return false;
 
+   if (cmd_buffer->state.workaround_scissor_bug || info->strmout_buffer)
+   return true;
+
uint32_

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-12 Thread Rhys Perry
It currently requires review (and possibly rebasing). Marek Olšák sent
some feedback for a few of the patches but other than that, it hasn't
gotten much attention.

Also patch 35 seems to vectorize 32-bit code which can help or hurt
shaders quite a bit and seems to hurt shaders overall. I'm not yet
sure how to solve this without removing it or changing the result of
LLVM's SLP vectorizer significantly.
IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.

I think I'll look into the issues with patch 35 again.

On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  wrote:
>
> What's the status of this?
>
> On 12/7/18 6:21 PM, Rhys Perry wrote:
> > This series add support for:
> > - VK_KHR_shader_float16_int8
> > - VK_AMD_gpu_shader_half_float
> > - VK_AMD_gpu_shader_int16
> > - VK_KHR_8bit_storage
> > on VI+. Half floats are currently disabled on LLVM 7 because of a bug
> > causing large memory usage and long (or unbounded) compilation times with
> > some tests.
> >
> > It depends on the follow patch series:
> > - https://patchwork.freedesktop.org/series/53454/
> > - https://patchwork.freedesktop.org/series/53602/
> > - https://patchwork.freedesktop.org/series/53660/
> >
> > An older version was tested on my Polaris card, but due to hardware issues
> > I currently can't test the latest version of the series.
> >
> > deqp-vk has no regressions and none of the newly enabled tests fail.
> >
> > Rhys Perry (38):
> >ac: add various helpers for float16/int16/int8
> >ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >ac/nir: implement 8-bit ssbo stores
> >ac/nir: fix 16-bit ssbo stores
> >ac/nir: implement 8-bit nir_load_const_instr
> >ac/nir: implement 8-bit conversions
> >ac/nir: fix 64-bit nir_op_f2f16_rtz
> >ac/nir: make ac_build_clamp work on all bit sizes
> >ac/nir: make ac_build_fract work on all bit sizes
> >ac/nir: make ac_build_isign work on all bit sizes
> >ac/nir: make ac_build_fsign work on all bit sizes
> >ac/nir: make ac_build_fdiv support 16-bit floats
> >ac/nir: implement half-float nir_op_frcp
> >ac/nir: implement half-float nir_op_frsq
> >ac/nir: implement half-float nir_op_ldexp
> >radv: lower 16-bit flrp
> >ac/nir: support half floats in emit_b2f
> >ac/nir: make emit_b2i work on all bit sizes
> >ac/nir: implement 16-bit shifts
> >compiler/nir: add lowering option for 16-bit ffma
> >ac/nir: implement 16-bit ac_build_ddxy
> >ac/nir: implement 8 and 16 bit ac_build_readlane
> >nir: make bitfield_reverse and ifind_msb work with all integers
> >ac/nir: make ac_find_lsb work on all bit sizes
> >ac/nir: make ac_build_umsb work on all bit sizes
> >ac/nir: implement 8 and 16 bit ac_build_imsb
> >ac/nir: make ac_build_bit_count work on all bit sizes
> >ac/nir: make ac_build_bitfield_reverse work on all bit sizes
> >ac/nir: implement 16-bit pack/unpack opcodes
> >ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
> >ac/nir,radv: create an array of varying output types
> >ac/nir: store all outputs as f32
> >radv: store all fragment shader inputs as f32
> >radv: handle all fragment output types
> >ac,radv: run LLVM's SLP vectorizer
> >ac/nir: generate better code for nir_op_f2f16_rtz
> >ac/nir: have nir_op_f2f16 round to zero
> >radv: expose float16, int16 and int8 features and extensions
> >
> >   src/amd/common/ac_llvm_build.c| 355 ++
> >   src/amd/common/ac_llvm_build.h|  22 +-
> >   src/amd/common/ac_llvm_util.c |   9 +-
> >   src/amd/common/ac_llvm_util.h |   1 +
> >   src/amd/common/ac_nir_to_llvm.c   | 258 +++
> >   src/amd/common/ac_shader_abi.h|   1 +
> >   src/amd/vulkan/radv_device.c  |  17 ++
> >   src/amd/vulkan/radv_extensions.py |   4 +
> >   src/amd/vulkan/radv_nir_to_llvm.c |  92 ---
> >   src/amd/vulkan/radv_shader.c  |   7 +
> >   src/broadcom/compiler/nir_to_vir.c|   1 +
> >   src/compiler/nir/nir.h|   1 +
> >   src/compiler/nir/nir_opcodes.py   |   4 +-
> >   src/compiler/nir/nir_opt_algebraic.py |   4 +-
> >   src/gallium/drivers/radeonsi/si_get.c |   1 +
> >   src/gallium/drivers/vc4/vc4_program.c |   1 +
> >   16 files changed, 516 insertions(+), 262 deletions(-)
> >
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-13 Thread Rhys Perry
Quite a few of the patches aren't specific to a single extension as
many make code size-generic and some of the extensions intersect in
functionality.
It might still be possible to roughly order the patches by
functionality but I'm not sure if it would be very useful (possible
order in attachment). I didn't look at the actual content of the
patches when creating the attachment, this is from memory and looking
at the descriptions.
Would you like me to send out a v2 of this series ordered like that?

On Tue, 12 Feb 2019 at 17:08, Samuel Pitoiset  wrote:
>
> How about splitting this series in four different parts? One for every
> extension? Is this doable without too much troubles?
>
> On 2/12/19 6:02 PM, Rhys Perry wrote:
> > It currently requires review (and possibly rebasing). Marek Olšák send
> > some feedback for a few of the patches but other than that, it hasn't
> > gotten much attention.
> >
> > Also patch 35 seems to vectorize 32-bit code which can help or hurt
> > shaders quite a bit and seems to hurt shaders overall. I'm not yet
> > sure how to solve this without removing it or changing the result of
> > LLVM's SLP vectorizer significantly.
> > IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.
> >
> > I think I'll look into the issues with patch 35 again.
> >
> > On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  
> > wrote:
> >> What's the status of this?
> >>
> >> On 12/7/18 6:21 PM, Rhys Perry wrote:
> >>> This series add support for:
> >>> - VK_KHR_shader_float16_int8
> >>> - VK_AMD_gpu_shader_half_float
> >>> - VK_AMD_gpu_shader_int16
> >>> - VK_KHR_8bit_storage
> >>> on VI+. Half floats are currently disabled on LLVM 7 because of a bug
> >>> causing large memory usage and long (or unbounded) compilation times with
> >>> some tests.
> >>>
> >>> It depends on the follow patch series:
> >>> - https://patchwork.freedesktop.org/series/53454/
> >>> - https://patchwork.freedesktop.org/series/53602/
> >>> - https://patchwork.freedesktop.org/series/53660/
> >>>
> >>> An older version was tested on my Polaris card, but due to hardware issues
> >>> I currently can't test the latest version of the series.
> >>>
> >>> deqp-vk has no regressions and none of the newly enabled tests fail.
> >>>
> >>> Rhys Perry (38):
> >>> ac: add various helpers for float16/int16/int8
> >>> ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >>> ac/nir: implement 8-bit ssbo stores
> >>> ac/nir: fix 16-bit ssbo stores
> >>> ac/nir: implement 8-bit nir_load_const_instr
> >>> ac/nir: implement 8-bit conversions
> >>> ac/nir: fix 64-bit nir_op_f2f16_rtz
> >>> ac/nir: make ac_build_clamp work on all bit sizes
> >>> ac/nir: make ac_build_fract work on all bit sizes
> >>> ac/nir: make ac_build_isign work on all bit sizes
> >>> ac/nir: make ac_build_fsign work on all bit sizes
> >>> ac/nir: make ac_build_fdiv support 16-bit floats
> >>> ac/nir: implement half-float nir_op_frcp
> >>> ac/nir: implement half-float nir_op_frsq
> >>> ac/nir: implement half-float nir_op_ldexp
> >>> radv: lower 16-bit flrp
> >>> ac/nir: support half floats in emit_b2f
> >>> ac/nir: make emit_b2i work on all bit sizes
> >>> ac/nir: implement 16-bit shifts
> >>> compiler/nir: add lowering option for 16-bit ffma
> >>> ac/nir: implement 16-bit ac_build_ddxy
> >>> ac/nir: implement 8 and 16 bit ac_build_readlane
> >>> nir: make bitfield_reverse and ifind_msb work with all integers
> >>> ac/nir: make ac_find_lsb work on all bit sizes
> >>> ac/nir: make ac_build_umsb work on all bit sizes
> >>> ac/nir: implement 8 and 16 bit ac_build_imsb
> >>> ac/nir: make ac_build_bit_count work on all bit sizes
> >>> ac/nir: make ac_build_bitfield_reverse work on all bit sizes
> >>> ac/nir: implement 16-bit pack/unpack opcodes
> >>> ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
> >>> ac/nir,radv: create an array of varying output types
> >>> ac/nir: store all outputs as f32
> >>> radv: store all fragment shader inputs as f32
> >>> radv: handle all fragment output types
> >>> ac,ra

[Mesa-dev] [PATCH v2 01/41] radv: bitcast 16-bit outputs to integers

2019-02-15 Thread Rhys Perry
16-bit outputs are stored as 16-bit floats in the outputs array, so they
have to be bitcast.

Fixes: b722b29f10d ('radv: add support for 16bit input/output')
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 7f74678d5f1..a8268c44ecf 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2365,7 +2365,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
- 
values[chan],
+ 
ac_to_integer(&ctx->ac, values[chan]),
  
ctx->ac.i32, "");
}
break;
@@ -2376,7 +2376,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = 
LLVMBuildSExt(ctx->ac.builder,
- 
values[chan],
+ 
ac_to_integer(&ctx->ac, values[chan]),
  
ctx->ac.i32, "");
}
break;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 89a78b43c6f..b260142c177 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
} else if (num_bytes == 2) {
store_name = "llvm.amdgcn.tbuffer.store.i32";
data_type = ctx->ac.i32;
+   data = LLVMBuildBitCast(ctx->ac.builder, data, 
ctx->ac.i16, "");
+   data = LLVMBuildZExt(ctx->ac.builder, data, data_type, 
"");
LLVMValueRef tbuffer_params[] = {
data,
rsrc,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-15 Thread Rhys Perry
This series adds support for:
- VK_KHR_shader_float16_int8
- VK_AMD_gpu_shader_half_float
- VK_AMD_gpu_shader_int16
- VK_KHR_8bit_storage
on VI+. Half floats are disabled on LLVM 7 because of a bug causing large
memory usage and long (or unbounded) compilation times with some CTS
tests.

It is written against the following patch series:
- https://patchwork.freedesktop.org/series/53454/ (v4)
- https://patchwork.freedesktop.org/series/53660/ (v1)

With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega
and VI except for
dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.*
which fails or crashes because of unrelated radv bugs with 64-bit varyings
and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even
though radv does not support it.

With LLVM 9, there are no reproducible piglit regressions except for
glsl-array-bounds-12.shader_test because of an LLVM bug when
SLP vectorization is enabled.

With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega
and VI except for those with LLVM 9 and a couple of tests because of a
LLVM bug after the SLP vectorizer and with the current lack of fallback
for 16-bit interpolation on LLVM versions before LLVM 9.

With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega
and VI except for those with LLVM 9 and a couple of tests because of a
LLVM bug after the SLP vectorizer.

The SLP vectorization patch is marked as WIP because it exposes LLVM bugs
with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and
some shader-db test for a game I can't remember. It also over-vectorizes
32-bit code which can cause significant worsening in generated code
quality.

The 16-bit interpolation patch is marked as WIP because it currently
requires intrinsics only available in LLVM 9 and does not have a fallback.

A branch on Github containing this series can be found at:
https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2

v2: rebase
v2: implement 16-bit interpolation
v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
v2: run vectorization unconditionally on GFX9 and later
v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
v2: remove ac_int_of_size()
v2: fix 64-bit visit_load_var()
v2: mark VK_KHR_8bit_storage as DONE in features.txt
v2: mark SLP vectorization patch as WIP
v2: fix C++ style comment

Rhys Perry (41):
  radv: bitcast 16-bit outputs to integers
  radv: ensure export arguments are always float
  ac: add various helpers for float16/int16/int8
  ac/nir: implement 8-bit push constant, ssbo and ubo loads
  ac/nir: implement 8-bit ssbo stores
  ac/nir: fix 16-bit ssbo stores
  ac/nir: implement 8-bit nir_load_const_instr
  ac/nir: implement 8-bit conversions
  ac/nir: fix 64-bit nir_op_f2f16_rtz
  ac/nir: make ac_build_clamp work on all bit sizes
  ac/nir: make ac_build_fract work on all bit sizes
  ac/nir: make ac_build_isign work on all bit sizes
  ac/nir: make ac_build_fsign work on all bit sizes
  ac/nir: make ac_build_fdiv support 16-bit floats
  ac/nir: implement half-float nir_op_frcp
  ac/nir: implement half-float nir_op_frsq
  ac/nir: implement half-float nir_op_ldexp
  radv: lower 16-bit flrp
  ac/nir: support half floats in emit_b2f
  ac/nir: make emit_b2i work on all bit sizes
  ac/nir: implement 16-bit shifts
  compiler/nir: add lowering option for 16-bit ffma
  ac/nir: implement 16-bit ac_build_ddxy
  ac/nir: implement 8 and 16 bit ac_build_readlane
  nir: make bitfield_reverse and ifind_msb work with all integers
  ac/nir: make ac_find_lsb work on all bit sizes
  ac/nir: make ac_build_umsb work on all bit sizes
  ac/nir: implement 8 and 16 bit ac_build_imsb
  ac/nir: make ac_build_bit_count work on all bit sizes
  ac/nir: make ac_build_bitfield_reverse work on all bit sizes
  ac/nir: implement 16-bit pack/unpack opcodes
  ac/nir: add 8-bit types to glsl_base_to_llvm_type
  ac/nir,radv: create an array of varying output types
  ac/nir: store all outputs as f32
  radv: store all fragment shader inputs as f32
  radv: handle all fragment output types
  WIP: radv,ac: implement 16-bit interpolation
  WIP: ac,radv: run LLVM's SLP vectorizer
  ac/nir: generate better code for nir_op_f2f16_rtz
  ac/nir: have nir_op_f2f16 round to zero
  radv,docs: expose float16, int16 and int8 features and extensions

 docs/features.txt|   2 +-
 src/amd/common/ac_llvm_build.c   | 325 +++
 src/amd/common/ac_llvm_build.h   |  18 +-
 src/amd/common/ac_llvm_util.c|   8 +-
 src/amd/common/ac_nir_to_llvm.c  | 268 +++
 src/amd/common/ac_shader_abi.h   |   1 +
 src/amd/vulkan/radv_device.c |  17 ++
 src/amd/vulkan/radv_extensions.py|   4 +
 src/amd/vulkan/radv_nir_to_llvm.c| 123 +
 src/amd/vulkan/radv_pipeline.c   |  19 +-
 src/amd/vulkan/radv_shader.c |   4 +

[Mesa-dev] [PATCH v2 04/41] ac/nir: implement 8-bit push constant, ssbo and ubo loads

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bed52490bad..17d952d1ae8 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1399,7 +1399,30 @@ static LLVMValueRef visit_load_push_constant(struct 
ac_nir_context *ctx,
 
ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
 
-   if (instr->dest.ssa.bit_size == 16) {
+   if (instr->dest.ssa.bit_size == 8) {
+   unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 
1;
+   LLVMTypeRef vec_type = 
LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+   ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
+   LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+
+   LLVMValueRef params[3];
+   if (load_dwords > 1) {
+   LLVMValueRef res_vec = 
LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+   params[0] = LLVMBuildExtractElement(ctx->ac.builder, 
res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
+   params[1] = LLVMBuildExtractElement(ctx->ac.builder, 
res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
+   } else {
+   res = LLVMBuildBitCast(ctx->ac.builder, res, 
ctx->ac.i32, "");
+   params[0] = ctx->ac.i32_0;
+   params[1] = res;
+   }
+   params[2] = addr;
+   res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", 
ctx->ac.i32, params, 3, 0);
+
+   res = LLVMBuildTrunc(ctx->ac.builder, res, 
LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
+   if (instr->dest.ssa.num_components > 1)
+   res = LLVMBuildBitCast(ctx->ac.builder, res, 
LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 
instr->dest.ssa.num_components), "");
+   return res;
+   } else if (instr->dest.ssa.bit_size == 16) {
unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
LLVMTypeRef vec_type = 
LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
@@ -1676,7 +1699,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * 
elem_size_bytes, false);
 
LLVMValueRef ret;
-   if (load_bytes == 2) {
+   if (load_bytes <= 2) {
ret = ac_build_tbuffer_load_short_byte(&ctx->ac,
   rsrc,
   vindex,
@@ -1684,7 +1707,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
   ctx->ac.i32_0,
   immoffset,
   glc,
-  2);
+  load_bytes);
} else {
const char *load_name;
LLVMTypeRef data_type;
@@ -1700,6 +1723,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
data_type = ctx->ac.v2f32;
break;
case 4:
+   case 3:
load_name = "llvm.amdgcn.buffer.load.f32";
data_type = ctx->ac.f32;
break;
@@ -1746,7 +1770,8 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
ac_nir_context *ctx,
if (instr->dest.ssa.bit_size == 64)
num_components *= 2;
 
-   if (instr->dest.ssa.bit_size == 16) {
+   if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) {
+   unsigned size = instr->dest.ssa.bit_size / 8;
LLVMValueRef results[num_components];
for (unsigned i = 0; i < num_components; ++i) {
results[i] = ac_build_tbuffer_load_short_byte(&ctx->ac,
@@ -1754,9 +1779,9 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
ac_nir_context *ctx,
  
ctx->ac.i32_0,
   

[Mesa-dev] [PATCH v2 02/41] radv: ensure export arguments are always float

2019-02-15 Thread Rhys Perry
So that the signature is correct and consistent, the inputs to an export
intrinsic should always be 32-bit floats.

This and the previous commit fix a large number of crashes from
dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_*
tests

Fixes: b722b29f10d ('radv: add support for 16bit input/output')
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index a8268c44ecf..d3795eec403 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2429,12 +2429,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
} else
memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 
-   for (unsigned i = 0; i < 4; ++i) {
-   if (!(args->enabled_channels & (1 << i)))
-   continue;
-
+   for (unsigned i = 0; i < 4; ++i)
args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
-   }
 }
 
 static void
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 07/41] ac/nir: implement 8-bit nir_load_const_instr

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index b260142c177..f39232b91a1 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1114,6 +1114,10 @@ static void visit_load_const(struct ac_nir_context *ctx,
 
for (unsigned i = 0; i < instr->def.num_components; ++i) {
switch (instr->def.bit_size) {
+   case 8:
+   values[i] = LLVMConstInt(element_type,
+instr->value.u8[i], false);
+   break;
case 16:
values[i] = LLVMConstInt(element_type,
 instr->value.u16[i], false);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 03/41] ac: add various helpers for float16/int16/int8

2019-02-15 Thread Rhys Perry
v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
v2: remove ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c  | 55 ++---
 src/amd/common/ac_llvm_build.h  | 15 +++--
 src/amd/common/ac_nir_to_llvm.c | 30 +-
 3 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 9395bd1bbda..b53d9c7ff8c 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
+   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
+   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
+   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
+   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
@@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type)
 
 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, 
LLVMTypeRef t)
 {
-   if (t == ctx->f16 || t == ctx->i16)
+   if (t == ctx->i8)
+   return ctx->i8;
+   else if (t == ctx->f16 || t == ctx->i16)
return ctx->i16;
else if (t == ctx->f32 || t == ctx->i32)
return ctx->i32;
@@ -281,6 +287,42 @@ ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), 
"");
 }
 
+LLVMTypeRef ac_float_of_size(struct ac_llvm_context *ctx, unsigned bit_size)
+{
+   switch (bit_size) {
+   case 16:
+   return ctx->f16;
+   case 32:
+   return ctx->f32;
+   case 64:
+   return ctx->f64;
+   default:
+   unreachable("Unhandled bit size");
+   }
+}
+
+LLVMValueRef ac_build_ui_cast(struct ac_llvm_context *ctx, LLVMValueRef v, 
LLVMTypeRef t)
+{
+   unsigned new_bit_size = ac_get_elem_bits(ctx, t);
+   unsigned old_bit_size = ac_get_elem_bits(ctx, LLVMTypeOf(v));
+   if (new_bit_size > old_bit_size)
+   return LLVMBuildZExt(ctx->builder, v, t, "");
+   else if (new_bit_size < old_bit_size)
+   return LLVMBuildTrunc(ctx->builder, v, t, "");
+   else
+   return v;
+}
+
+LLVMValueRef ac_build_reinterpret(struct ac_llvm_context *ctx, LLVMValueRef v, 
LLVMTypeRef t)
+{
+   if (LLVMTypeOf(v) == t)
+   return v;
+
+   v = ac_to_integer(ctx, v);
+   v = ac_build_ui_cast(ctx, v, ac_to_integer_type(ctx, t));
+   return LLVMBuildBitCast(ctx->builder, v, t, "");
+}
+
 
 LLVMValueRef
 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
@@ -1338,15 +1380,18 @@ LLVMValueRef 
ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
 }
 
 LLVMValueRef
-ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+ac_build_tbuffer_load_short_byte(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
LLVMValueRef vindex,
LLVMValueRef voffset,
LLVMValueRef soffset,
LLVMValueRef immoffset,
-   LLVMValueRef glc)
+   LLVMValueRef glc,
+   unsigned size)
 {
+   assert(size == 1 || size == 2);
const char *name = "llvm.amdgcn.tbuffer.load.i32";
+   int data_format = size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : 
V_008F0C_BUF_DATA_FORMAT_16;
LLVMTypeRef type = ctx->i32;
LLVMValueRef params[] = {
rsrc,
@@ -1354,13 +1399,13 @@ ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
voffset,
soffset,
immoffset,
-   LLVMConstInt(ctx->i32, 
V_008F0C_BUF_DATA_FORMAT_16, false),
+   LLVMConstInt(ctx->i32, data_format, false),
LLVMConstInt(ctx->i32, 
V_008F0C_BUF_NUM_FORMAT_UINT, false),
glc,
ctx->i1false,
};
LLVMValueRef res = ac_build

[Mesa-dev] [PATCH v2 09/41] ac/nir: fix 64-bit nir_op_f2f16_rtz

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 691d444db05..741059b5f1a 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -886,6 +886,8 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_f2f16_rtz:
src[0] = ac_to_float(&ctx->ac, src[0]);
+   if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+   src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
result = LLVMBuildExtractElement(ctx->ac.builder, result, 
ctx->ac.i32_0, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 05/41] ac/nir: implement 8-bit ssbo stores

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 17d952d1ae8..89a78b43c6f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1524,7 +1524,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
 
LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
get_src(ctx, instr->src[1]), true);
-   LLVMValueRef base_data = ac_to_float(&ctx->ac, src_data);
+   LLVMValueRef base_data = src_data;
base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
 
@@ -1565,7 +1565,25 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
  LLVMConstInt(ctx->ac.i32, start * 
elem_size_bytes, false), "");
}
-   if (num_bytes == 2) {
+   if (num_bytes == 1) {
+   store_name = "llvm.amdgcn.tbuffer.store.i32";
+   data_type = ctx->ac.i32;
+   data = LLVMBuildZExt(ctx->ac.builder, data, data_type, 
"");
+   LLVMValueRef tbuffer_params[] = {
+   data,
+   rsrc,
+   ctx->ac.i32_0, /* vindex */
+   offset,/* voffset */
+   ctx->ac.i32_0,
+   ctx->ac.i32_0,
+   LLVMConstInt(ctx->ac.i32, 1, false), // dfmt (= 
8bit)
+   LLVMConstInt(ctx->ac.i32, 4, false), // nfmt (= 
uint)
+   glc,
+   ctx->ac.i1false,
+   };
+   ac_build_intrinsic(&ctx->ac, store_name,
+  ctx->ac.voidt, tbuffer_params, 10, 
0);
+   } else if (num_bytes == 2) {
store_name = "llvm.amdgcn.tbuffer.store.i32";
data_type = ctx->ac.i32;
LLVMValueRef tbuffer_params[] = {
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 13/41] ac/nir: make ac_build_fsign work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_zerof() and ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 16 
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 3b2257e8bf0..23e454385d7 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2079,19 +2079,11 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context 
*ctx, LLVMValueRef src0,
 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMValueRef cmp, val, zero, one;
-   LLVMTypeRef type;
-
-   if (bitsize == 32) {
-   type = ctx->f32;
-   zero = ctx->f32_0;
-   one = ctx->f32_1;
-   } else {
-   type = ctx->f64;
-   zero = ctx->f64_0;
-   one = ctx->f64_1;
-   }
+   LLVMTypeRef type = ac_float_of_size(ctx, bitsize);
+   LLVMValueRef zero = LLVMConstReal(type, 0.0);
+   LLVMValueRef one = LLVMConstReal(type, 1.0);
 
+   LLVMValueRef cmp, val;
cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 20/41] ac/nir: make emit_b2i work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index e459001c1cf..75bb19031bf 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -347,11 +347,7 @@ static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx,
 unsigned bitsize)
 {
LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
-
-   if (bitsize == 32)
-   return result;
-
-   return LLVMBuildZExt(ctx->builder, result, ctx->i64, "");
+   return ac_build_ui_cast(ctx, result, LLVMIntTypeInContext(ctx->context, 
bitsize));
 }
 
 static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 08/41] ac/nir: implement 8-bit conversions

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f39232b91a1..691d444db05 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -858,12 +858,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
src[i] = ac_to_integer(&ctx->ac, src[i]);
result = ac_build_gather_values(&ctx->ac, src, num_components);
break;
+   case nir_op_f2i8:
case nir_op_f2i16:
case nir_op_f2i32:
case nir_op_f2i64:
src[0] = ac_to_float(&ctx->ac, src[0]);
result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
break;
+   case nir_op_f2u8:
case nir_op_f2u16:
case nir_op_f2u32:
case nir_op_f2u64:
@@ -898,15 +900,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
else
result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
break;
+   case nir_op_u2u8:
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_u2u64:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < 
ac_get_elem_bits(&ctx->ac, def_type))
-   result = LLVMBuildZExt(ctx->ac.builder, src[0], 
def_type, "");
-   else
-   result = LLVMBuildTrunc(ctx->ac.builder, src[0], 
def_type, "");
+   result = ac_build_ui_cast(&ctx->ac, src[0], def_type);
break;
+   case nir_op_i2i8:
case nir_op_i2i16:
case nir_op_i2i32:
case nir_op_i2i64:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 29/41] ac/nir: make ac_build_bit_count work on all bit sizes

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 33 +++--
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index c986f800fa4..46738faea9d 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2085,35 +2085,16 @@ LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
 
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
 {
-   LLVMValueRef result;
-   unsigned bitsize;
+   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
 
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+   char name[64];
+   snprintf(name, sizeof(name), "llvm.ctpop.i%d", bitsize);
 
-   switch (bitsize) {
-   case 64:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-
-   result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
-   break;
-   case 32:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   case 16:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMValueRef result = ac_build_intrinsic(ctx, name, LLVMTypeOf(src0),
+(LLVMValueRef []) { src0 }, 1,
+AC_FUNC_ATTR_READNONE);
 
-   return result;
+   return ac_build_ui_cast(ctx, result, ctx->i32);
 }
 
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 36/41] radv: handle all fragment output types

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 55 ---
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 01b8b097ea1..c46eabf3656 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2297,9 +2297,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (!values)
return;
 
-   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
if (ctx->stage == MESA_SHADER_FRAGMENT) {
-   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
unsigned index = target - V_008DFC_SQ_EXP_MRT;
unsigned col_format = (ctx->options->key.fs.col_format >> (4 * 
index)) & 0xf;
bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
@@ -2310,6 +2308,28 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef 
args[2],
  unsigned bits, bool hi) = NULL;
 
+   if (LLVMTypeOf(values[0]) == ctx->ac.f16 &&
+   col_format != V_028714_SPI_SHADER_FP16_ABGR) {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = LLVMBuildFPExt(ctx->ac.builder,
+ values[chan],
+ ctx->ac.f32, "");
+   }
+
+   if (LLVMTypeOf(values[0]) == ctx->ac.i16 || 
LLVMTypeOf(values[0]) == ctx->ac.i8) {
+   if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = 
LLVMBuildSExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   } else {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   }
+   }
+
switch(col_format) {
case V_028714_SPI_SHADER_ZERO:
args->enabled_channels = 0; /* writemask */
@@ -2335,12 +2355,16 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
 
case V_028714_SPI_SHADER_FP16_ABGR:
args->enabled_channels = 0x5;
-   packf = ac_build_cvt_pkrtz_f16;
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++)
-   values[chan] = 
LLVMBuildFPExt(ctx->ac.builder,
- 
values[chan],
- 
ctx->ac.f32, "");
+   if (LLVMTypeOf(values[0]) == ctx->ac.f16) {
+   packi = ac_build_cvt_pk_u16;
+   for (unsigned chan = 0; chan < 4; chan++) {
+   values[chan] = ac_to_integer(&ctx->ac, 
values[chan]);
+   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   }
+   } else {
+   packf = ac_build_cvt_pkrtz_f16;
}
break;
 
@@ -2357,23 +2381,11 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
case V_028714_SPI_SHADER_UINT16_ABGR:
args->enabled_channels = 0x5;
packi = ac_build_cvt_pk_u16;
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++)
-   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
- 
ac_to_integer(&ctx->ac, values[chan]),
- 
ctx->ac.i32, "");
-   }
break;
 
case V_028714_SPI_SHADER_SIN

[Mesa-dev] [PATCH v2 22/41] compiler/nir: add lowering option for 16-bit ffma

2019-02-15 Thread Rhys Perry
The lowering needs to be disabled for sufficient precision to pass
deqp-vk's 16-bit fma test on radv.

Signed-off-by: Rhys Perry 
---
 src/broadcom/compiler/nir_to_vir.c| 1 +
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 4 +++-
 src/gallium/drivers/radeonsi/si_get.c | 1 +
 src/gallium/drivers/vc4/vc4_program.c | 1 +
 5 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index d983f91e718..6c0a623096a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2471,6 +2471,7 @@ const nir_shader_compiler_options v3d_nir_options = {
 .lower_fdiv = true,
 .lower_find_lsb = true,
 .lower_ffma = true,
+.lower_ffma16 = true,
 .lower_flrp32 = true,
 .lower_fpow = true,
 .lower_fsat = true,
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 740c64d2a94..8df275f4aa3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2111,6 +2111,7 @@ typedef struct nir_function {
 
 typedef struct nir_shader_compiler_options {
bool lower_fdiv;
+   bool lower_ffma16;
bool lower_ffma;
bool fuse_ffma;
bool lower_flrp16;
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 71c626e1b3f..63dff878d35 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -136,7 +136,9 @@ optimizations = [
(('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a, 
('bcsel', c, b, a), 'options->lower_flrp32'),
(('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', 
a, b, c), '!options->lower_flrp32'),
(('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', 
a, b, c), '!options->lower_flrp64'),
-   (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
+   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 
'options->lower_ffma16'),
+   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
+   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
 
(('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d)),
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index f8ca02d4fcf..5bf107ef6fe 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -491,6 +491,7 @@ static const struct nir_shader_compiler_options nir_options = {
.lower_fdiv = true,
.lower_sub = true,
.lower_ffma = true,
+   .lower_ffma16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_snorm_4x8 = true,
.lower_pack_unorm_2x16 = true,
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 2d0a52bb5fb..8be258cbba4 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2234,6 +2234,7 @@ static const nir_shader_compiler_options nir_options = {
 .lower_extract_word = true,
 .lower_fdiv = true,
 .lower_ffma = true,
+.lower_ffma16 = true,
 .lower_flrp32 = true,
 .lower_fpow = true,
 .lower_fsat = true,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 27/41] ac/nir: make ac_build_umsb work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_zero() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 38 +++---
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 61085db9320..ec87a7b9343 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1555,36 +1555,12 @@ ac_build_umsb(struct ac_llvm_context *ctx,
  LLVMValueRef arg,
  LLVMTypeRef dst_type)
 {
-   const char *intrin_name;
-   LLVMTypeRef type;
-   LLVMValueRef highest_bit;
-   LLVMValueRef zero;
-   unsigned bitsize;
-
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
-   switch (bitsize) {
-   case 64:
-   intrin_name = "llvm.ctlz.i64";
-   type = ctx->i64;
-   highest_bit = LLVMConstInt(ctx->i64, 63, false);
-   zero = ctx->i64_0;
-   break;
-   case 32:
-   intrin_name = "llvm.ctlz.i32";
-   type = ctx->i32;
-   highest_bit = LLVMConstInt(ctx->i32, 31, false);
-   zero = ctx->i32_0;
-   break;
-   case 16:
-   intrin_name = "llvm.ctlz.i16";
-   type = ctx->i16;
-   highest_bit = LLVMConstInt(ctx->i16, 15, false);
-   zero = ctx->i16_0;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMTypeRef type = LLVMTypeOf(arg);
+   unsigned bitsize = ac_get_elem_bits(ctx, type);
+   LLVMValueRef highest_bit = LLVMConstInt(type, bitsize - 1, false);
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   char intrin_name[64];
+   snprintf(intrin_name, sizeof(intrin_name), "llvm.ctlz.i%d", bitsize);
 
LLVMValueRef params[2] = {
arg,
@@ -1598,7 +1574,7 @@ ac_build_umsb(struct ac_llvm_context *ctx,
/* The HW returns the last bit index from MSB, but TGSI/NIR wants
 * the index from LSB. Invert it by doing "31 - msb". */
msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
-   msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");
+   msb = ac_build_ui_cast(ctx, msb, dst_type);
 
/* check for zero */
return LLVMBuildSelect(ctx->builder,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 24/41] ac/nir: implement 8 and 16 bit ac_build_readlane

2019-02-15 Thread Rhys Perry
v2: don't use ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 71eaac4b7bd..aa92c55c822 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2868,9 +2868,15 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la
 {
LLVMTypeRef src_type = LLVMTypeOf(src);
src = ac_to_integer(ctx, src);
-   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+   unsigned src_bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+   unsigned bits = src_bits;
LLVMValueRef ret;
 
+   if (bits < 32) {
+   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
+   bits = 32;
+   }
+
if (bits == 32) {
ret = _ac_build_readlane(ctx, src, lane);
} else {
@@ -2887,6 +2893,10 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la
LLVMConstInt(ctx->i32, i, 0), 
"");
}
}
+
+   if (src_bits < 32)
+   ret = LLVMBuildTrunc(ctx->builder, ret, 
LLVMIntTypeInContext(ctx->context, src_bits), "");
+
return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
 }
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 30/41] ac/nir: make ac_build_bitfield_reverse work on all bit sizes

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 46738faea9d..dff369aae7f 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2100,28 +2100,14 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
   LLVMValueRef src0)
 {
-   LLVMValueRef result;
-   unsigned bitsize;
-
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
 
-   switch (bitsize) {
-   case 32:
-   result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", 
ctx->i32,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   case 16:
-   result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", 
ctx->i16,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   char name[64];
+   snprintf(name, sizeof(name), "llvm.bitreverse.i%d", bitsize);
 
-   return result;
+   return ac_build_intrinsic(ctx, name, LLVMTypeOf(src0),
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
 }
 
 #define AC_EXP_TARGET  0
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 23/41] ac/nir: implement 16-bit ac_build_ddxy

2019-02-15 Thread Rhys Perry
v2: rebase

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index fb871a47400..71eaac4b7bd 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1481,6 +1481,11 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
LLVMValueRef tl, trbl;
LLVMValueRef result;
 
+   int size = ac_get_type_size(LLVMTypeOf(val));
+
+   if (size == 2)
+   val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
+
for (unsigned i = 0; i < 4; ++i) {
tl_lanes[i] = i & mask;
trbl_lanes[i] = (i & mask) + idx;
@@ -1493,12 +1498,19 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 trbl_lanes[0], trbl_lanes[1],
 trbl_lanes[2], trbl_lanes[3]);
 
-   tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
-   trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
+   if (size == 2) {
+   tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
+   trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
+   }
+
+   LLVMTypeRef type = ac_float_of_size(ctx, size * 8);
+   tl = LLVMBuildBitCast(ctx->builder, tl, type, "");
+   trbl = LLVMBuildBitCast(ctx->builder, trbl, type, "");
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
 
-   result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
-   &result, 1, 0);
+   result = ac_build_intrinsic(ctx,
+   LLVMTypeOf(val) == ctx->f32 ? "llvm.amdgcn.wqm.f32" : 
"llvm.amdgcn.wqm.f16", type,
+   &result, 1, 0);
 
return result;
 }
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 17/41] ac/nir: implement half-float nir_op_ldexp

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 8b0e07d2930..0e5946dfdb3 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -829,8 +829,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
break;
case nir_op_ldexp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-   if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 32)
+   if (ac_get_elem_bits(&ctx->ac, def_type) == 32)
result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE);
+   else if (ac_get_elem_bits(&ctx->ac, def_type) == 16)
+   result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE);
else
result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE);
break;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 26/41] ac/nir: make ac_find_lsb work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_zero() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 33 ++---
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index aa92c55c822..61085db9320 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2474,30 +2474,11 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
 LLVMTypeRef dst_type,
 LLVMValueRef src0)
 {
-   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
-   const char *intrin_name;
-   LLVMTypeRef type;
-   LLVMValueRef zero;
-
-   switch (src0_bitsize) {
-   case 64:
-   intrin_name = "llvm.cttz.i64";
-   type = ctx->i64;
-   zero = ctx->i64_0;
-   break;
-   case 32:
-   intrin_name = "llvm.cttz.i32";
-   type = ctx->i32;
-   zero = ctx->i32_0;
-   break;
-   case 16:
-   intrin_name = "llvm.cttz.i16";
-   type = ctx->i16;
-   zero = ctx->i16_0;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   }
+   LLVMTypeRef type = LLVMTypeOf(src0);
+   unsigned src0_bitsize = ac_get_elem_bits(ctx, type);
+   char intrin_name[64];
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", 
src0_bitsize);
 
LLVMValueRef params[2] = {
src0,
@@ -2518,9 +2499,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
  params, 2,
  AC_FUNC_ATTR_READNONE);
 
-   if (src0_bitsize == 64) {
-   lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
-   }
+   lsb = ac_build_ui_cast(ctx, lsb, ctx->i32);
 
/* TODO: We need an intrinsic to skip this conditional. */
/* Check for zero: */
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 28/41] ac/nir: implement 8 and 16 bit ac_build_imsb

2019-02-15 Thread Rhys Perry
v2: fix C++ style comment

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index ec87a7b9343..c986f800fa4 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1531,6 +1531,10 @@ ac_build_imsb(struct ac_llvm_context *ctx,
  LLVMValueRef arg,
  LLVMTypeRef dst_type)
 {
+   /* TODO: support 64-bit integers */
+   if (LLVMTypeOf(arg) != ctx->i32)
+   arg = LLVMBuildSExt(ctx->builder, arg, ctx->i32, "");
+
LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
  dst_type, &arg, 1,
  AC_FUNC_ATTR_READNONE);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 10/41] ac/nir: make ac_build_clamp work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_zerof() and ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index b53d9c7ff8c..667f9700764 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1597,16 +1597,20 @@ ac_build_umsb(struct ac_llvm_context *ctx,
 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
   LLVMValueRef b)
 {
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.minnum.f%d", ac_get_elem_bits(ctx, 
LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
-   return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
+   return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2,
  AC_FUNC_ATTR_READNONE);
 }
 
 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
   LLVMValueRef b)
 {
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, 
LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
-   return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
+   return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2,
  AC_FUNC_ATTR_READNONE);
 }
 
@@ -1633,8 +1637,9 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
 
 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-ctx->f32_1);
+   LLVMTypeRef t = LLVMTypeOf(value);
+   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 
0.0)),
+LLVMConstReal(t, 1.0));
 }
 
 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 14/41] ac/nir: make ac_build_fdiv support 16-bit floats

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 23e454385d7..fb871a47400 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -661,7 +661,7 @@ ac_build_fdiv(struct ac_llvm_context *ctx,
 * If we do (num * (1 / den)), LLVM does:
 *return num * v_rcp_f32(den);
 */
-   LLVMValueRef one = LLVMTypeOf(num) == ctx->f64 ? ctx->f64_1 : 
ctx->f32_1;
+   LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 21/41] ac/nir: implement 16-bit shifts

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 75bb19031bf..bad1c2a990e 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -672,20 +672,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
break;
case nir_op_ishl:
result = LLVMBuildShl(ctx->ac.builder, src[0],
- LLVMBuildZExt(ctx->ac.builder, src[1],
-   LLVMTypeOf(src[0]), ""),
+ ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
  "");
break;
case nir_op_ishr:
result = LLVMBuildAShr(ctx->ac.builder, src[0],
-  LLVMBuildZExt(ctx->ac.builder, src[1],
-LLVMTypeOf(src[0]), ""),
+  ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
   "");
break;
case nir_op_ushr:
result = LLVMBuildLShr(ctx->ac.builder, src[0],
-  LLVMBuildZExt(ctx->ac.builder, src[1],
-LLVMTypeOf(src[0]), ""),
+  ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
   "");
break;
case nir_op_ilt32:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 11/41] ac/nir: make ac_build_fract work on all bit sizes

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 667f9700764..db937eb66fb 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2049,16 +2049,9 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMTypeRef type;
-   char *intr;
-
-   if (bitsize == 32) {
-   intr = "llvm.floor.f32";
-   type = ctx->f32;
-   } else {
-   intr = "llvm.floor.f64";
-   type = ctx->f64;
-   }
+   LLVMTypeRef type = ac_float_of_size(ctx, bitsize);
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.floor.f%d", bitsize);
 
LLVMValueRef params[] = {
src0,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 16/41] ac/nir: implement half-float nir_op_frsq

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index cba0cec3e8f..8b0e07d2930 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -788,8 +788,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
case nir_op_frsq:
result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
  ac_to_float_type(&ctx->ac, 
def_type), src[0]);
-   result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size 
== 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
-  result);
+   result = ac_build_fdiv(&ctx->ac, 
LLVMConstReal(LLVMTypeOf(result), 1.0), result);
break;
case nir_op_frexp_exp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 15/41] ac/nir: implement half-float nir_op_frcp

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 741059b5f1a..cba0cec3e8f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -657,8 +657,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
break;
case nir_op_frcp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-   result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size 
== 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
-  src[0]);
+   result = ac_build_fdiv(&ctx->ac, 
LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
break;
case nir_op_iand:
result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 18/41] radv: lower 16-bit flrp

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_shader.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 1dcb0606246..adba730ad8b 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -53,6 +53,7 @@
 static const struct nir_shader_compiler_options nir_options = {
.vertex_id_zero_based = true,
.lower_scmp = true,
+   .lower_flrp16 = true,
.lower_flrp32 = true,
.lower_flrp64 = true,
.lower_device_index_to_zero = true,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 12/41] ac/nir: make ac_build_isign work on all bit sizes

2019-02-15 Thread Rhys Perry
v2: don't use ac_get_zero(), ac_get_one() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 27 ---
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index db937eb66fb..3b2257e8bf0 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2064,30 +2064,11 @@ LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMValueRef cmp, val, zero, one;
-   LLVMTypeRef type;
-
-   switch (bitsize) {
-   case 64:
-   type = ctx->i64;
-   zero = ctx->i64_0;
-   one = ctx->i64_1;
-   break;
-   case 32:
-   type = ctx->i32;
-   zero = ctx->i32_0;
-   one = ctx->i32_1;
-   break;
-   case 16:
-   type = ctx->i16;
-   zero = ctx->i16_0;
-   one = ctx->i16_1;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   LLVMValueRef one = LLVMConstInt(type, 1, false);
 
+   LLVMValueRef cmp, val;
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 19/41] ac/nir: support half floats in emit_b2f

2019-02-15 Thread Rhys Perry
This seems to generate fine code, even though the IR is a bit ugly.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0e5946dfdb3..e459001c1cf 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -316,14 +316,20 @@ static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx,
 unsigned bitsize)
 {
LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0,
-  LLVMBuildBitCast(ctx->builder, 
LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""),
+  LLVMBuildBitCast(ctx->builder, 
ctx->f32_1, ctx->i32, ""),
   "");
result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, "");
 
-   if (bitsize == 32)
+   switch (bitsize) {
+   case 16:
+   return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, "");
+   case 32:
return result;
-
-   return LLVMBuildFPExt(ctx->builder, result, ctx->f64, "");
+   case 64:
+   return LLVMBuildFPExt(ctx->builder, result, ctx->f64, "");
+   default:
+   unreachable("Unsupported bit size.");
+   }
 }
 
 static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 40/41] ac/nir: have nir_op_f2f16 round to zero

2019-02-15 Thread Rhys Perry
Make nir_op_f2f16 round toward zero (the same path as nir_op_f2f16_rtz), in
the hope that one day LLVM will be able to generate code with vectorized
v_cvt_pkrtz_f16_f32 instructions.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 8bfc63958ca..7a5e95506f2 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildUIToFP(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
break;
case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
src[0] = ac_to_float(&ctx->ac, src[0]);
if (LLVMTypeOf(src[0]) == ctx->ac.f64)
src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
@@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, 
"");
break;
case nir_op_f2f16_rtne:
-   case nir_op_f2f16:
case nir_op_f2f32:
case nir_op_f2f64:
src[0] = ac_to_float(&ctx->ac, src[0]);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry
v2: add to patch series

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c   | 33 +---
 src/amd/common/ac_llvm_build.h   |  3 ++-
 src/amd/common/ac_nir_to_llvm.c  | 14 +++---
 src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
 src/amd/vulkan/radv_pipeline.c   | 19 --
 src/amd/vulkan/radv_shader.h |  1 +
 src/gallium/drivers/radeonsi/si_shader.c |  2 +-
 7 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index dff369aae7f..be2c2251a21 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j)
+  LLVMValueRef j,
+  int word)
 {
-   LLVMValueRef args[5];
+   LLVMValueRef args[6];
LLVMValueRef p1;
 
args[0] = i;
args[1] = llvm_chan;
args[2] = attr_number;
-   args[3] = params;
-
-   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
-   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[3] = LLVMConstInt(ctx->i1, word, false);
+   args[4] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+   ctx->f16, args, 5, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[3] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
+   ctx->f32, args, 4, 
AC_FUNC_ATTR_READNONE);
+   }
 
args[0] = p1;
args[1] = j;
args[2] = llvm_chan;
args[3] = attr_number;
-   args[4] = params;
-
-   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
- ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[4] = LLVMConstInt(ctx->i1, word, false);
+   args[5] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[4] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
+ ctx->f32, args, 5, 
AC_FUNC_ATTR_READNONE);
+   }
 }
 
 LLVMValueRef
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 61c9b5e4b6c..655427567c4 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j);
+  LLVMValueRef j,
+  int word);
 
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bf7024c68e4..939b8eb13de 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
LLVMValueRef j = LLVMBuildExtractElement(
ctx->ac.builder, interp_param, 
ctx->ac.i32_1, "");
 
+   /* This fp16 handling isn't technically correct
+* but should be correct for the attributes we
+* are actually going to use. */
+   bool fp16 = instr->dest.ssa.bit_size == 16;
+   int word = fp16 ? 0 : -1;
v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
attr_number,
-  ctx->abi->prim_mask, i, 
j);
+  ctx->abi->prim_mask, i, 
j, word);
+   if (fp16)
+   v = ac_build_reinterpret(&ctx->ac, v, 
ctx->ac.f32);
} else {
v = ac_build_fs_interp_mov(&ctx->ac, 
LLVMConstInt(ctx->ac.i32, 2, false),
   llvm_chan, 
attr_number, ctx->abi->prim_mask);
@@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, 
attrib_idx, "");
 
}
-   retu

[Mesa-dev] [PATCH v2 38/41] WIP: ac, radv: run LLVM's SLP vectorizer

2019-02-15 Thread Rhys Perry
v2: rebase
v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
v2: run unconditionally on GFX9 and later
v2: mark as WIP because it can make 32-bit code much worse

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_util.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 69446863b95..8d78b5a850b 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <llvm-c/Transforms/Vectorize.h>
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
 #include "util/u_math.h"
@@ -175,7 +176,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum 
radeon_family family,
 }
 
 static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef 
target_library_info,
-   bool check_ir)
+   bool check_ir, enum radeon_family 
family)
 {
LLVMPassManagerRef passmgr = LLVMCreatePassManager();
if (!passmgr)
@@ -203,6 +204,9 @@ static LLVMPassManagerRef 
ac_create_passmgr(LLVMTargetLibraryInfoRef target_libr
LLVMAddCFGSimplificationPass(passmgr);
/* This is recommended by the instruction combining pass. */
LLVMAddEarlyCSEMemSSAPass(passmgr);
+   /* vectorization is disabled on pre-GFX9 because it's not very useful 
there */
+   if (family >= CHIP_VEGA10)
+   LLVMAddSLPVectorizePass(passmgr);
LLVMAddInstructionCombiningPass(passmgr);
return passmgr;
 }
@@ -327,7 +331,7 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
goto fail;
 
compiler->passmgr = ac_create_passmgr(compiler->target_library_info,
- tm_options & AC_TM_CHECK_IR);
+ tm_options & AC_TM_CHECK_IR, 
family);
if (!compiler->passmgr)
goto fail;
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 39/41] ac/nir: generate better code for nir_op_f2f16_rtz

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 939b8eb13de..8bfc63958ca 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -889,7 +889,9 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-   result = LLVMBuildExtractElement(ctx->ac.builder, result, 
ctx->ac.i32_0, "");
+   // generates better code than an extractelement with slp 
vectorization
+   result = LLVMBuildBitCast(ctx->ac.builder, result, ctx->ac.i32, 
"");
+   result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, 
"");
break;
case nir_op_f2f16_rtne:
case nir_op_f2f16:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 41/41] radv, docs: expose float16, int16 and int8 features and extensions

2019-02-15 Thread Rhys Perry
v2: rebase
v2: mark VK_KHR_8bit_storage as DONE in features.txt

Signed-off-by: Rhys Perry 
---
 docs/features.txt |  2 +-
 src/amd/vulkan/radv_device.c  | 17 +
 src/amd/vulkan/radv_extensions.py |  4 
 src/amd/vulkan/radv_shader.c  |  3 +++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/docs/features.txt b/docs/features.txt
index 6c2b6d59377..ded753b0182 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -439,7 +439,7 @@ Vulkan 1.1 -- all DONE: anv, radv
   VK_KHR_variable_pointers  DONE (anv, radv)
 
 Khronos extensions that are not part of any Vulkan version:
-  VK_KHR_8bit_storage   DONE (anv)
+  VK_KHR_8bit_storage   DONE (anv, radv)
   VK_KHR_android_surfacenot started
   VK_KHR_create_renderpass2 DONE (anv, radv)
   VK_KHR_displayDONE (anv, radv)
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 0fef92773e1..4137b778466 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -877,6 +877,23 @@ void radv_GetPhysicalDeviceFeatures2(
features->bufferDeviceAddressMultiDevice = false;
break;
}
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+   VkPhysicalDeviceFloat16Int8FeaturesKHR *features =
+   (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->shaderFloat16 = enabled && HAVE_LLVM >= 
0x0800;
+   features->shaderInt8 = enabled;
+   break;
+   }
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
+   VkPhysicalDevice8BitStorageFeaturesKHR *features =
+   (VkPhysicalDevice8BitStorageFeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->storageBuffer8BitAccess = enabled;
+   features->uniformAndStorageBuffer8BitAccess = enabled;
+   features->storagePushConstant8 = enabled;
+   break;
+   }
default:
break;
}
diff --git a/src/amd/vulkan/radv_extensions.py 
b/src/amd/vulkan/radv_extensions.py
index f218598f123..e38cfcfdcbe 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -91,6 +91,8 @@ EXTENSIONS = [
 Extension('VK_KHR_xlib_surface',  6, 
'VK_USE_PLATFORM_XLIB_KHR'),
 Extension('VK_KHR_multiview', 1, True),
 Extension('VK_KHR_display',  23, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
+Extension('VK_KHR_shader_float16_int8',   1, 
'device->rad_info.chip_class >= VI'),
+Extension('VK_KHR_8bit_storage',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_EXT_direct_mode_display',   1, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
 Extension('VK_EXT_acquire_xlib_display',  1, 
'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
 Extension('VK_EXT_buffer_device_address', 1, True),
@@ -121,6 +123,8 @@ EXTENSIONS = [
 Extension('VK_AMD_shader_core_properties',1, True),
 Extension('VK_AMD_shader_info',   1, True),
 Extension('VK_AMD_shader_trinary_minmax', 1, True),
+Extension('VK_AMD_gpu_shader_half_float', 1, 
'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'),
+Extension('VK_AMD_gpu_shader_int16',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_GOOGLE_decorate_string',1, True),
 Extension('VK_GOOGLE_hlsl_functionality1',1, True),
 ]
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index adba730ad8b..44dea8e7203 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -249,6 +249,9 @@ radv_shader_compile_to_nir(struct radv_device *device,
.transform_feedback = true,
.trinary_minmax = true,
.variable_pointers = true,
+   .float16 = true,
+   .storage_8bit = true,
+   .int8 = 

[Mesa-dev] [PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry
v2: add to patch series

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c   | 33 +---
 src/amd/common/ac_llvm_build.h   |  3 ++-
 src/amd/common/ac_nir_to_llvm.c  | 14 +++---
 src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
 src/amd/vulkan/radv_pipeline.c   | 19 --
 src/amd/vulkan/radv_shader.h |  1 +
 src/gallium/drivers/radeonsi/si_shader.c |  2 +-
 7 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index dff369aae7f..be2c2251a21 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j)
+  LLVMValueRef j,
+  int word)
 {
-   LLVMValueRef args[5];
+   LLVMValueRef args[6];
LLVMValueRef p1;
 
args[0] = i;
args[1] = llvm_chan;
args[2] = attr_number;
-   args[3] = params;
-
-   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
-   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[3] = LLVMConstInt(ctx->i1, word, false);
+   args[4] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+   ctx->f16, args, 5, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[3] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
+   ctx->f32, args, 4, 
AC_FUNC_ATTR_READNONE);
+   }
 
args[0] = p1;
args[1] = j;
args[2] = llvm_chan;
args[3] = attr_number;
-   args[4] = params;
-
-   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
- ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[4] = LLVMConstInt(ctx->i1, word, false);
+   args[5] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[4] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
+ ctx->f32, args, 5, 
AC_FUNC_ATTR_READNONE);
+   }
 }
 
 LLVMValueRef
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 61c9b5e4b6c..655427567c4 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j);
+  LLVMValueRef j,
+  int word);
 
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bf7024c68e4..939b8eb13de 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
LLVMValueRef j = LLVMBuildExtractElement(
ctx->ac.builder, interp_param, 
ctx->ac.i32_1, "");
 
+   /* This fp16 handling isn't technically correct
+* but should be correct for the attributes we
+* are actually going to use. */
+   bool fp16 = instr->dest.ssa.bit_size == 16;
+   int word = fp16 ? 0 : -1;
v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
attr_number,
-  ctx->abi->prim_mask, i, 
j);
+  ctx->abi->prim_mask, i, 
j, word);
+   if (fp16)
+   v = ac_build_reinterpret(&ctx->ac, v, 
ctx->ac.f32);
} else {
v = ac_build_fs_interp_mov(&ctx->ac, 
LLVMConstInt(ctx->ac.i32, 2, false),
   llvm_chan, 
attr_number, ctx->abi->prim_mask);
@@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, 
attrib_idx, "");
 
}
-   retu

[Mesa-dev] [PATCH v2 34/41] ac/nir: store all outputs as f32

2019-02-15 Thread Rhys Perry
v2: rebase
v2: fix 64-bit visit_load_var()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c   | 14 ++
 src/amd/vulkan/radv_nir_to_llvm.c | 22 +-
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 5821c18aeb1..bf7024c68e4 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2114,7 +2114,10 @@ static LLVMValueRef visit_load_var(struct ac_nir_context 
*ctx,
unreachable("unhandle variable mode");
}
ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
-   return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, 
&instr->dest.ssa), "");
+   if (instr->dest.ssa.bit_size == 16)
+   return ac_build_reinterpret(&ctx->ac, ret, get_def_type(ctx, 
&instr->dest.ssa));
+   else
+   return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, 
&instr->dest.ssa), "");
 }
 
 static void
@@ -2152,6 +2155,11 @@ visit_store_var(struct ac_nir_context *ctx,
 
writemask = writemask << comp;
 
+   LLVMTypeRef type = ctx->ac.f32;
+   if (LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMVectorTypeKind)
+   type = LLVMVectorType(ctx->ac.f32, 
LLVMGetVectorSize(LLVMTypeOf(src)));
+   src = ac_build_reinterpret(&ctx->ac, src, type);
+
switch (deref->mode) {
case nir_var_shader_out:
 
@@ -4329,12 +4337,10 @@ ac_handle_shader_output_decl(struct ac_llvm_context 
*ctx,
}
}
 
-   bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
-   LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
for (unsigned i = 0; i < attrib_count; ++i) {
for (unsigned chan = 0; chan < 4; chan++) {
abi->outputs[ac_llvm_reg_index_soa(output_loc + i, 
chan)] =
-  ac_build_alloca_undef(ctx, type, "");
+  ac_build_alloca_undef(ctx, ctx->f32, "");
}
}
 
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 8fdaee72036..2002a744545 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2305,6 +2305,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
 
bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
if (ctx->stage == MESA_SHADER_FRAGMENT) {
+   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
unsigned index = target - V_008DFC_SQ_EXP_MRT;
unsigned col_format = (ctx->options->key.fs.col_format >> (4 * 
index)) & 0xf;
bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
@@ -2421,16 +2422,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
return;
}
 
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++) {
-   values[chan] = LLVMBuildBitCast(ctx->ac.builder, 
values[chan], ctx->ac.i16, "");
-   args->out[chan] = LLVMBuildZExt(ctx->ac.builder, 
values[chan], ctx->ac.i32, "");
-   }
-   } else
-   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
-
-   for (unsigned i = 0; i < 4; ++i)
-   args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
+   for (unsigned chan = 0; chan < 4; chan++)
+   args->out[chan] = ac_build_reinterpret(&ctx->ac, values[chan], 
ctx->ac.f32);
 }
 
 static void
@@ -3137,9 +3130,12 @@ handle_fs_outputs_post(struct radv_shader_context *ctx)
if (i < FRAG_RESULT_DATA0)
continue;
 
-   for (unsigned j = 0; j < 4; j++)
-   values[j] = ac_to_float(&ctx->ac,
-   radv_load_output(ctx, i, j));
+   for (unsigned j = 0; j < 4; j++) {
+   values[j] = radv_load_output(ctx, i, j);
+   unsigned index = ac_llvm_reg_index_soa(i, 0);
+   LLVMTypeRef new_type = ctx->abi.output_types[index];
+   values[j] = ac_build_reinterpret(&ctx->ac, values[j], 
new_type);
+   }
 
bool ret = si_export_mrt_color(ctx, values,
   i - FRAG_RESULT_DATA0,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry
This patch can be ignored. I forgot to delete it and it ended up getting sent.
"[PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation" is
the correct one.

On Sat, 16 Feb 2019 at 00:23, Rhys Perry  wrote:
>
> v2: add to patch series
>
> Signed-off-by: Rhys Perry 
> ---
>  src/amd/common/ac_llvm_build.c   | 33 +---
>  src/amd/common/ac_llvm_build.h   |  3 ++-
>  src/amd/common/ac_nir_to_llvm.c  | 14 +++---
>  src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
>  src/amd/vulkan/radv_pipeline.c   | 19 --
>  src/amd/vulkan/radv_shader.h |  1 +
>  src/gallium/drivers/radeonsi/si_shader.c |  2 +-
>  7 files changed, 69 insertions(+), 30 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index dff369aae7f..be2c2251a21 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
>LLVMValueRef attr_number,
>LLVMValueRef params,
>LLVMValueRef i,
> -  LLVMValueRef j)
> +  LLVMValueRef j,
> +  int word)
>  {
> -   LLVMValueRef args[5];
> +   LLVMValueRef args[6];
> LLVMValueRef p1;
>
> args[0] = i;
> args[1] = llvm_chan;
> args[2] = attr_number;
> -   args[3] = params;
> -
> -   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
> -   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
> +   if (word >= 0) {
> +   args[3] = LLVMConstInt(ctx->i1, word, false);
> +   args[4] = params;
> +   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
> +   ctx->f16, args, 5, 
> AC_FUNC_ATTR_READNONE);
> +   } else {
> +   args[3] = params;
> +   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
> +   ctx->f32, args, 4, 
> AC_FUNC_ATTR_READNONE);
> +   }
>
> args[0] = p1;
> args[1] = j;
> args[2] = llvm_chan;
> args[3] = attr_number;
> -   args[4] = params;
> -
> -   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
> - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
> +   if (word >= 0) {
> +   args[4] = LLVMConstInt(ctx->i1, word, false);
> +   args[5] = params;
> +   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
> + ctx->f16, args, 6, 
> AC_FUNC_ATTR_READNONE);
> +   } else {
> +   args[4] = params;
> +   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
> + ctx->f32, args, 5, 
> AC_FUNC_ATTR_READNONE);
> +   }
>  }
>
>  LLVMValueRef
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index 61c9b5e4b6c..655427567c4 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
>LLVMValueRef attr_number,
>LLVMValueRef params,
>LLVMValueRef i,
> -  LLVMValueRef j);
> +  LLVMValueRef j,
> +  int word);
>
>  LLVMValueRef
>  ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index bf7024c68e4..939b8eb13de 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
> *ctx,
> LLVMValueRef j = LLVMBuildExtractElement(
> ctx->ac.builder, interp_param, 
> ctx->ac.i32_1, "");
>
> +   /* This fp16 handling isn't technically 
> correct
> +* but should be correct for the attributes we
> +* are actually going to use. */
> +   bool fp16 = instr->dest.ssa.bit_size == 16;
> +   int word = fp16 ? 0 : -1;
> v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
> attr_number,
> -  

[Mesa-dev] [PATCH v2 35/41] radv: store all fragment shader inputs as f32

2019-02-15 Thread Rhys Perry
v2: rebase

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 2002a744545..01b8b097ea1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2056,7 +2056,6 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
LLVMValueRef attr_number;
unsigned chan;
LLVMValueRef i, j;
-   bool interp = !LLVMIsUndef(interp_param);
 
attr_number = LLVMConstInt(ctx->ac.i32, attr, false);
 
@@ -2070,7 +2069,7 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
 * fs.interp cannot be used on integers, because they can be equal
 * to NaN.
 */
-   if (interp) {
+   if (interp_param) {
interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
ctx->ac.v2f32, "");
 
@@ -2083,7 +2082,7 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
for (chan = 0; chan < 4; chan++) {
LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
 
-   if (interp) {
+   if (interp_param) {
result[chan] = ac_build_fs_interp(&ctx->ac,
  llvm_chan,
  attr_number,
@@ -2095,7 +2094,6 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
  attr_number,
  prim_mask);
result[chan] = LLVMBuildBitCast(ctx->ac.builder, 
result[chan], ctx->ac.i32, "");
-   result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, 
result[chan], LLVMTypeOf(interp_param), "");
}
}
 }
@@ -2123,10 +2121,6 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
 
interp = lookup_interp_param(&ctx->abi, 
variable->data.interpolation, interp_type);
}
-   bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
-   LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
-   if (interp == NULL)
-   interp = LLVMGetUndef(type);
 
for (unsigned i = 0; i < attrib_count; ++i)
ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp;
@@ -2187,7 +2181,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
if (ctx->shader_info->info.ps.uses_input_attachments ||
ctx->shader_info->info.needs_multiview_view_index) {
ctx->input_mask |= 1ull << VARYING_SLOT_LAYER;
-   ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = 
LLVMGetUndef(ctx->ac.i32);
+   ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = 
NULL;
}
 
for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
@@ -2203,7 +2197,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
interp_fs_input(ctx, index, interp_param, 
ctx->abi.prim_mask,
inputs);
 
-   if (LLVMIsUndef(interp_param))
+   if (!interp_param)
ctx->shader_info->fs.flat_shaded_mask |= 1u << 
index;
if (i >= VARYING_SLOT_VAR0)
ctx->abi.fs_input_attr_indices[i - 
VARYING_SLOT_VAR0] = index;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 33/41] ac/nir, radv: create an array of varying output types

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c   | 68 +++
 src/amd/common/ac_shader_abi.h|  1 +
 src/amd/vulkan/radv_nir_to_llvm.c |  3 ++
 3 files changed, 72 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index defbfdf4297..5821c18aeb1 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -4238,6 +4238,68 @@ static void visit_cf_list(struct ac_nir_context *ctx,
}
 }
 
+static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool 
vs_in,
+struct nir_variable *var, unsigned 
cur_offset,
+const struct glsl_type *cur_type,
+void (*cb)(struct ac_llvm_context 
*, unsigned, enum glsl_base_type, void *),
+void *cbdata)
+{
+   if (glsl_type_is_struct(cur_type)) {
+   for (unsigned i = 0; i < glsl_get_length(cur_type); i++) {
+   const struct glsl_type *ft = 
glsl_get_struct_field(cur_type, i);
+   cur_offset = traverse_var_component_slots(ctx, vs_in, 
var, cur_offset, ft, cb, cbdata);
+   }
+   return (cur_offset + 3) / 4 * 4;
+   }
+
+   enum glsl_base_type base_type = 
glsl_get_base_type(glsl_without_array_or_matrix(cur_type));
+
+   unsigned stride = 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type));
+   if (!var->data.compact)
+   stride = (stride + 3) / 4 * 4;
+   unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1);
+   if (glsl_type_is_array(cur_type))
+   arr_len *= glsl_get_aoa_size(cur_type);
+   for (unsigned i = 0; i < arr_len; i++) {
+   for (unsigned j = 0; j < 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) {
+   cb(ctx, cur_offset + var->data.location_frac + j, 
base_type, cbdata);
+   }
+   cur_offset += stride;
+   }
+   return cur_offset;
+}
+
+static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, 
enum glsl_base_type base, void *output_types)
+{
+   LLVMTypeRef type;
+   switch (base) {
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   type = ctx->i8;
+   break;
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_UINT16:
+   type = ctx->i16;
+   break;
+   case GLSL_TYPE_FLOAT16:
+   type = ctx->f16;
+   break;
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_INT64:
+   case GLSL_TYPE_UINT64:
+   type = ctx->i32;
+   break;
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   default:
+   type = ctx->f32;
+   break;
+   }
+   ((LLVMTypeRef*)output_types)[index] = type;
+}
+
 void
 ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
 struct ac_shader_abi *abi,
@@ -4275,6 +4337,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
   ac_build_alloca_undef(ctx, type, "");
}
}
+
+   traverse_var_component_slots(ctx, false, variable, output_loc * 4,
+variable->type, &setup_output_type, 
abi->output_types);
 }
 
 static void
@@ -4328,6 +4393,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct 
ac_shader_abi *abi,
 
ctx.main_function = 
LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
+   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++)
+   ctx.abi->output_types[i] = ac->i32;
+
nir_foreach_variable(variable, &nir->outputs)
ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
 ctx.stage);
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index ee18e6c1923..274deeb13a4 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -69,6 +69,7 @@ struct ac_shader_abi {
LLVMValueRef view_index;
 
LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
+   LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4];
 
/* For VS and PS: pre-loaded shader inputs.
 *
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index d3795eec403..8fdaee72036 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3910,6 +3910,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler 
*ac_llvm,
ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
ac_setup_rings(&ctx);
 
+   for (unsigned i = 0; i < AC

[Mesa-dev] [PATCH v2 25/41] nir: make bitfield_reverse and ifind_msb work with all integers

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/compiler/nir/nir_opcodes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index dc4cd9ac63d..0f40bd6c548 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -350,7 +350,7 @@ unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, 
"src0 >> 32")
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tuint32, """
+unop("bitfield_reverse", tuint, """
 /* we're not winning any awards for speed here, but that's ok */
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++)
@@ -374,7 +374,7 @@ for (int bit = bit_size - 1; bit >= 0; bit--) {
 }
 """)
 
-unop("ifind_msb", tint32, """
+unop_convert("ifind_msb", tint32, tint, """
 dst = -1;
 for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 32/41] ac/nir: add 8-bit types to glsl_base_to_llvm_type

2019-02-15 Thread Rhys Perry
v2: remove 16-bit additions and rebase

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f6ad1aa7e77..defbfdf4297 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3969,6 +3969,9 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac,
case GLSL_TYPE_BOOL:
case GLSL_TYPE_SUBROUTINE:
return ac->i32;
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   return ac->i8;
case GLSL_TYPE_INT16:
case GLSL_TYPE_UINT16:
return ac->i16;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 31/41] ac/nir: implement 16-bit pack/unpack opcodes

2019-02-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bad1c2a990e..f6ad1aa7e77 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1015,6 +1015,30 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
}
 
+   case nir_op_pack_32_2x16_split: {
+   LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
+   result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, 
"");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_x: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_0, "");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_y: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_1, "");
+   break;
+   }
+
case nir_op_cube_face_coord: {
src[0] = ac_to_float(&ctx->ac, src[0]);
LLVMValueRef results[2];
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores

2019-02-18 Thread Rhys Perry
I don't see a 16-bit version of tbuffer.store in IntrinsicsAMDGPU.td
and simply changing "llvm.amdgcn.tbuffer.store.i32" to
"llvm.amdgcn.tbuffer.store.i16" and removing the zext doesn't seem to
work.

On Mon, 18 Feb 2019 at 08:55, Samuel Pitoiset  wrote:
>
> Does this fix anything know? There is a 16-bit version of tbuffer.store,
> maybe we should use it?
>
> On 2/16/19 1:21 AM, Rhys Perry wrote:
> > Signed-off-by: Rhys Perry 
> > ---
> >   src/amd/common/ac_nir_to_llvm.c | 2 ++
> >   1 file changed, 2 insertions(+)
> >
> > diff --git a/src/amd/common/ac_nir_to_llvm.c 
> > b/src/amd/common/ac_nir_to_llvm.c
> > index 89a78b43c6f..b260142c177 100644
> > --- a/src/amd/common/ac_nir_to_llvm.c
> > +++ b/src/amd/common/ac_nir_to_llvm.c
> > @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context 
> > *ctx,
> >   } else if (num_bytes == 2) {
> >   store_name = "llvm.amdgcn.tbuffer.store.i32";
> >   data_type = ctx->ac.i32;
> > + data = LLVMBuildBitCast(ctx->ac.builder, data, 
> > ctx->ac.i16, "");
> > + data = LLVMBuildZExt(ctx->ac.builder, data, 
> > data_type, "");
> >   LLVMValueRef tbuffer_params[] = {
> >   data,
> >   rsrc,
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-18 Thread Rhys Perry
The CTS is buggy because the input_output_float_64_to_16 tests are run
even though they shouldn't be run because they try to use an
unadvertised (and unimplemented) optional feature.
Some of them crash for unrelated reasons though: load_tess_varyings()
from ac_nir_to_llvm.c doesn't handle 64-bit varyings. So not all of
them would work even if VK_FORMAT_R64_SFLOAT was an implemented vertex
format.

On Mon, 18 Feb 2019 at 08:53, Samuel Pitoiset  wrote:
>
>
> On 2/16/19 1:21 AM, Rhys Perry wrote:
> > This series add support for:
> > - VK_KHR_shader_float16_int8
> > - VK_AMD_gpu_shader_half_float
> > - VK_AMD_gpu_shader_int16
> > - VK_KHR_8bit_storage
> > on VI+. Half floats are disabled on LLVM 7 because of a bug causing large
> > memory usage and long (or unbounded) compilation times with some CTS
> > tests.
> >
> > It is written against the following patch series:
> > - https://patchwork.freedesktop.org/series/53454/ (v4)
> > - https://patchwork.freedesktop.org/series/53660/ (v1)
> >
> > With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for
> > dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.*
> > which fails or crashes because of unrelated radv bugs with 64-bit varyings
> > and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even
> > though radv does not support it.
>
> test bug?
>
> The two NIR related patches (22 and 25) should be sent separately,
> otherwise people working on NIR might miss them.
>
> >
> > With LLVM 9, there are no reproducible piglit regressions except for
> > glsl-array-bounds-12.shader_test because of a LLVM bug when
> > SLP vectorization is enabled.
> >
> > With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for those with LLVM 9 and a couple of tests because of a
> > LLVM bug after the SLP vectorizer and with the current lack of fallback
> > for 16-bit interpolation on LLVM versions before LLVM 9.
> >
> > With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for those with LLVM 9 and a couple of tests because of a
> > LLVM bug after the SLP vectorizer.
> >
> > The SLP vectorization patch is marked as WIP because it exposes LLVM bugs
> > with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and
> > some shader-db test for a game I can't remember. It also over-vectorizes
> > 32-bit code which can cause significant worsening in generated code
> > quality.
> >
> > The 16-bit interpolation patch is marked as WIP because it currently
> > requires intrinsics only available in LLVM 9 and does not have a fallback.
> >
> > A branch on Github containing this series can be found at:
> > https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2
> >
> > v2: rebase
> > v2: implement 16-bit interpolation
> > v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
> > v2: run vectorization unconditionally on GFX9 and later
> > v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
> > v2: remove ac_int_of_size()
> > v2: fix 64-bit visit_load_var()
> > v2: mark VK_KHR_8bit_storage as DONE in features.txt
> > v2: mark SLP vectorization patch as WIP
> > v2: fix C++ style comment
> >
> > Rhys Perry (41):
> >radv: bitcast 16-bit outputs to integers
> >radv: ensure export arguments are always float
> >ac: add various helpers for float16/int16/int8
> >ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >ac/nir: implement 8-bit ssbo stores
> >ac/nir: fix 16-bit ssbo stores
> >ac/nir: implement 8-bit nir_load_const_instr
> >ac/nir: implement 8-bit conversions
> >ac/nir: fix 64-bit nir_op_f2f16_rtz
> >ac/nir: make ac_build_clamp work on all bit sizes
> >ac/nir: make ac_build_fract work on all bit sizes
> >ac/nir: make ac_build_isign work on all bit sizes
> >ac/nir: make ac_build_fsign work on all bit sizes
> >ac/nir: make ac_build_fdiv support 16-bit floats
> >ac/nir: implement half-float nir_op_frcp
> >ac/nir: implement half-float nir_op_frsq
> >ac/nir: implement half-float nir_op_ldexp
> >radv: lower 16-bit flrp
> >ac/nir: support half floats in emit_b2f
> >ac/nir: make emit_b2i work on all bit sizes
> >ac/nir: implement 16-bit shifts
> >compiler/nir: add lowering option for 16-bit ffma
> >ac/nir: implement 16-bit ac_build_ddxy
> >ac/nir: implement 8 and 16 bit ac

[Mesa-dev] [PATCH] nv50/ir, nvc0: add debug options for shader replacement

2018-05-29 Thread Rhys Perry
Changes in v4:
- Move code to nv50_ir_dump.cpp
- Dump headers of nvc0 programs
- Use CRC-32 instead of a truncated SHA1
- Set prog->maxGPR to targ->getFileSize() - 1 and set prog->tlsSize
- Don't compile the program if a replacement is offered
This has the consequence that a program is not dumped when it's replaced
Changes in v3:
- Fixed messed up patch description and diff
- Use the checksum of the TGSI instead of the binary if possible
Changes in v2:
- move "#ifdef DEBUG" from above dumpProgram to above createDumpFilename

The NV50_PROG_DUMP environment variable specifies an (already created)
directory to dump shader binaries, headers and TGSI code. The
NV50_PROG_REPLACE environment variable specifies an (already created)
directory that is searched to find replacement binaries and headers. This
is all much like MESA_SHADER_DUMP_PATH and MESA_SHADER_READ_PATH except
using CRC-32 checksums instead of program IDs and chip-specific binaries
instead of GLSL.

Signed-off-by: Rhys Perry 
---
 src/gallium/auxiliary/tgsi/tgsi_util.h |   1 +
 src/gallium/drivers/nouveau/Makefile.sources   |   2 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  40 +++--
 .../drivers/nouveau/codegen/nv50_ir_driver.h   |   1 +
 .../drivers/nouveau/codegen/nv50_ir_dump.cpp   | 171 +
 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h |  70 +
 src/gallium/drivers/nouveau/meson.build|   2 +
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 138 +++--
 8 files changed, 360 insertions(+), 65 deletions(-)
 create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.cpp
 create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h

diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h 
b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 686b90f467..81cf955d8f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_UTIL_H
 #define TGSI_UTIL_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
diff --git a/src/gallium/drivers/nouveau/Makefile.sources 
b/src/gallium/drivers/nouveau/Makefile.sources
index 65f08c7d8d..e867221818 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -114,6 +114,8 @@ NV50_CODEGEN_SOURCES := \
codegen/nv50_ir_build_util.cpp \
codegen/nv50_ir_build_util.h \
codegen/nv50_ir_driver.h \
+   codegen/nv50_ir_dump.cpp \
+   codegen/nv50_ir_dump.h \
codegen/nv50_ir_emit_nv50.cpp \
codegen/nv50_ir_from_tgsi.cpp \
codegen/nv50_ir_graph.cpp \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index c987da9908..b1782bb4f2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -23,6 +23,7 @@
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_target.h"
 #include "codegen/nv50_ir_driver.h"
+#include "codegen/nv50_ir_dump.h"
 
 extern "C" {
 #include "nouveau_debug.h"
@@ -1244,30 +1245,35 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
   prog->print();
 
targ->parseDriverInfo(info);
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
 
-   prog->convertToSSA();
+   if (!nv50_ir::replaceProgramCode(prog)) {
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
 
-   if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
-  prog->print();
+  prog->convertToSSA();
 
-   prog->optimizeSSA(info->optLevel);
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);
+  if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
+ prog->print();
 
-   if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
-  prog->print();
+  prog->optimizeSSA(info->optLevel);
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);
 
-   if (!prog->registerAllocation()) {
-  ret = -4;
-  goto out;
-   }
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);
+  if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
+ prog->print();
 
-   prog->optimizePostRA(info->optLevel);
+  if (!prog->registerAllocation()) {
+ ret = -4;
+ goto out;
+  }
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);
 
-   if (!prog->emitBinary(info)) {
-  ret = -5;
-  goto out;
+  prog->optimizePostRA(info->optLevel);
+
+  if (!prog->emitBinary(info)) {
+ ret = -5;
+ goto out;
+  }
+
+  nv50_ir::dumpProgramCodeAndIR(prog);
}
 
 out:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h 

Re: [Mesa-dev] [PATCH v2 4/5] nvc0: add support for programmable sample locations

2018-05-29 Thread Rhys Perry
EvaluateDepthValuesARB()/ResolveDepthValuesNV() is a hint for the
driver to decompress the depth buffer if needed. This can be needed
because the decompressed result can depend on the current sample
locations.

Fiddling around with the current state of the patches, I could not
find a case where it seemed that compressed depth values depended
on the sample locations. I figured the depth values in the test were
rather compressible, but I don't know any details about Nvidia's
depth compression.

I wouldn't mind running a trace of the blob and seeing if it does
anything though, if you want to be more sure.

As for the MS=1 thing, it's for the unlikely case that someone wants
to create a single sample texture through some other API than OpenGL
or just direct gallium and wants to program the sample locations.
It doesn't matter much, though I think it's pretty harmless.

On Mon, May 28, 2018 at 9:05 PM, Ilia Mirkin  wrote:
> ARB_sample_locations has all this stuff about a resolve of some sort
> when you switch around the locations. I don't see anything here about
> that. Thoughts?
>
> Also some more specific comments inline:
>
> On Thu, May 10, 2018 at 12:28 PM, Rhys Perry  wrote:
>> Signed-off-by: Rhys Perry 
>> ---
>>  .../drivers/nouveau/codegen/nv50_ir_driver.h   |   2 +
>>  .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |   7 +
>>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  |  91 +--
>>  .../nouveau/codegen/nv50_ir_lowering_nvc0.h|   2 +
>>  src/gallium/drivers/nouveau/nv50/nv50_miptree.c|   1 +
>>  src/gallium/drivers/nouveau/nv50/nv50_resource.h   |   1 +
>>  src/gallium/drivers/nouveau/nvc0/nvc0_context.h|  15 +-
>>  src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c|   1 +
>>  src/gallium/drivers/nouveau/nvc0/nvc0_program.c|   3 +
>>  src/gallium/drivers/nouveau/nvc0/nvc0_screen.c |  33 +++-
>>  src/gallium/drivers/nouveau/nvc0/nvc0_state.c  |  17 +-
>>  .../drivers/nouveau/nvc0/nvc0_state_validate.c | 174 
>> +
>>  12 files changed, 301 insertions(+), 46 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h 
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> index 3d0782f86b..7c835ceab8 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
>> @@ -73,6 +73,7 @@ struct nv50_ir_prog_symbol
>>  #define NVISA_GK104_CHIPSET0xe0
>>  #define NVISA_GK20A_CHIPSET0xea
>>  #define NVISA_GM107_CHIPSET0x110
>> +#define NVISA_GM200_CHIPSET0x120
>>
>>  struct nv50_ir_prog_info
>>  {
>> @@ -145,6 +146,7 @@ struct nv50_ir_prog_info
>>   bool persampleInvocation;
>>   bool usesSampleMaskIn;
>>   bool readsFramebuffer;
>> + bool readsSampleLocations;
>>} fp;
>>struct {
>>   uint32_t inputOffset; /* base address for user args */
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> index 3c5bad05fe..d7844d7381 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> @@ -1520,6 +1520,10 @@ void Source::scanInstructionSrc(const Instruction& 
>> insn,
>>   info->out[src.getIndex(0)].oread = 1;
>>}
>> }
>> +   if (src.getFile() == TGSI_FILE_SYSTEM_VALUE) {
>> +  if (info->sv[src.getIndex(0)].sn == TGSI_SEMANTIC_SAMPLEPOS)
>> + info->prop.fp.readsSampleLocations = true;
>> +   }
>> if (src.getFile() != TGSI_FILE_INPUT)
>>return;
>>
>> @@ -1560,6 +1564,9 @@ bool Source::scanInstruction(const struct 
>> tgsi_full_instruction *inst)
>> if (insn.getOpcode() == TGSI_OPCODE_FBFETCH)
>>info->prop.fp.readsFramebuffer = true;
>>
>> +   if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE)
>> +  info->prop.fp.readsSampleLocations = true;
>> +
>> if (insn.dstCount()) {
>>Instruction::DstRegister dst = insn.getDst(0);
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 
>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index 29f674b451..5f5298777e 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -2662,17 +2662,33 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
>>ld-

[Mesa-dev] [PATCH v5] nv50/ir, nvc0: add debug options for shader replacement

2018-05-30 Thread Rhys Perry
Changes in v5:
- Add a forgotten change to fix memory leaks of fname
Changes in v4:
- Move code to nv50_ir_dump.cpp
- Dump headers of nvc0 programs
- Use CRC-32 instead of a truncated SHA1
- Set prog->maxGPR to targ->getFileSize() - 1 and set prog->tlsSize
- Don't compile the program if a replacement is offered
This has the consequence that a program is not dumped when it's replaced
Changes in v3:
- Fixed messed up patch description and diff
- Use the checksum of the TGSI instead of the binary if possible
Changes in v2:
- move "#ifdef DEBUG" from above dumpProgram to above createDumpFilename

The NV50_PROG_DUMP environment variable specifies an (already created)
directory to dump shader binaries, headers and TGSI code. The
NV50_PROG_REPLACE environment variable specifies an (already created)
directory that is searched to find replacement binaries and headers. This
is all much like MESA_SHADER_DUMP_PATH and MESA_SHADER_READ_PATH except
using CRC-32 checksums instead of program IDs and chip-specific binaries
instead of GLSL.

Signed-off-by: Rhys Perry 
---
 src/gallium/auxiliary/tgsi/tgsi_util.h |   1 +
 src/gallium/drivers/nouveau/Makefile.sources   |   2 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  40 +++--
 .../drivers/nouveau/codegen/nv50_ir_driver.h   |   1 +
 .../drivers/nouveau/codegen/nv50_ir_dump.cpp   | 174 +
 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h |  70 +
 src/gallium/drivers/nouveau/meson.build|   2 +
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c| 138 ++--
 8 files changed, 363 insertions(+), 65 deletions(-)
 create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.cpp
 create mode 100644 src/gallium/drivers/nouveau/codegen/nv50_ir_dump.h

diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h 
b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 686b90f467..81cf955d8f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_UTIL_H
 #define TGSI_UTIL_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
diff --git a/src/gallium/drivers/nouveau/Makefile.sources 
b/src/gallium/drivers/nouveau/Makefile.sources
index 65f08c7d8d..e867221818 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -114,6 +114,8 @@ NV50_CODEGEN_SOURCES := \
codegen/nv50_ir_build_util.cpp \
codegen/nv50_ir_build_util.h \
codegen/nv50_ir_driver.h \
+   codegen/nv50_ir_dump.cpp \
+   codegen/nv50_ir_dump.h \
codegen/nv50_ir_emit_nv50.cpp \
codegen/nv50_ir_from_tgsi.cpp \
codegen/nv50_ir_graph.cpp \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index c987da9908..b1782bb4f2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -23,6 +23,7 @@
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_target.h"
 #include "codegen/nv50_ir_driver.h"
+#include "codegen/nv50_ir_dump.h"
 
 extern "C" {
 #include "nouveau_debug.h"
@@ -1244,30 +1245,35 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
   prog->print();
 
targ->parseDriverInfo(info);
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
 
-   prog->convertToSSA();
+   if (!nv50_ir::replaceProgramCode(prog)) {
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
 
-   if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
-  prog->print();
+  prog->convertToSSA();
 
-   prog->optimizeSSA(info->optLevel);
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);
+  if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
+ prog->print();
 
-   if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
-  prog->print();
+  prog->optimizeSSA(info->optLevel);
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);
 
-   if (!prog->registerAllocation()) {
-  ret = -4;
-  goto out;
-   }
-   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);
+  if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
+ prog->print();
 
-   prog->optimizePostRA(info->optLevel);
+  if (!prog->registerAllocation()) {
+ ret = -4;
+ goto out;
+  }
+  prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);
 
-   if (!prog->emitBinary(info)) {
-  ret = -5;
-  goto out;
+  prog->optimizePostRA(info->optLevel);
+
+  if (!prog->emitBinary(info)) {
+ ret = -5;
+ goto out;
+  }
+
+  nv50_ir::dumpProgramCodeAndIR(prog);
}
 
 out:
diff --gi

[Mesa-dev] [PATCH v3 0/5] Implement ARB_sample_locations for nvc0

2018-06-01 Thread Rhys Perry
This patch set adds support for GL_ARB_sample_locations in mesa core, gallium,
the mesa OpenGL state tracker and the nvc0 driver.

Changes in v3:
- Fix non-alphabetical order of new extensions in extensions_table.h
- Implement glEvaluateDepthValuesARB()/glResolveDepthValuesNV()
- Stylistic changes and addition of comments in the nvc0 code
- Renamed patch 5 and added GL_*_sample_locations to the release notes
Changes in v2:
- various minor changes/cleanups (mostly formatting and style changes)
- improve error handling
- don't expose the ARB_* variant on ES
- expose NV_sample_locations so the feature is available on ES
- decouple framebuffer and sample location state in the state tracker and nvc0
- rebase to upstream master

Rhys Perry (5):
  mesa: add support for ARB_sample_locations
  gallium: add support for programmable sample locations
  st/mesa: add support for ARB_sample_locations
  nvc0: add support for programmable sample locations
  docs: document addition of GL_ARB_sample_locations for nvc0

 docs/features.txt  |   2 +-
 docs/relnotes/18.2.0.html  |   2 +-
 src/gallium/auxiliary/util/u_framebuffer.c |  30 +++
 src/gallium/auxiliary/util/u_framebuffer.h |   5 +
 src/gallium/docs/source/context.rst|  14 ++
 src/gallium/docs/source/screen.rst |   3 +
 src/gallium/drivers/etnaviv/etnaviv_screen.c   |   1 +
 src/gallium/drivers/freedreno/freedreno_screen.c   |   1 +
 src/gallium/drivers/i915/i915_screen.c |   1 +
 src/gallium/drivers/llvmpipe/lp_screen.c   |   1 +
 .../drivers/nouveau/codegen/nv50_ir_driver.h   |   2 +
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |   7 +
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 102 +++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h|   2 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c |   1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c |   1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_context.h|  15 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c|   3 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c |  32 +++
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c  |  17 +-
 .../drivers/nouveau/nvc0/nvc0_state_validate.c | 152 +---
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c|  12 +
 src/gallium/drivers/r300/r300_screen.c |   1 +
 src/gallium/drivers/r600/r600_pipe.c   |   1 +
 src/gallium/drivers/radeonsi/si_get.c  |   1 +
 src/gallium/drivers/softpipe/sp_screen.c   |   1 +
 src/gallium/drivers/svga/svga_screen.c |   1 +
 src/gallium/drivers/swr/swr_screen.cpp |   1 +
 src/gallium/drivers/v3d/v3d_screen.c   |   1 +
 src/gallium/drivers/vc4/vc4_screen.c   |   1 +
 src/gallium/drivers/virgl/virgl_screen.c   |   1 +
 src/gallium/include/pipe/p_context.h   |  41 +++-
 src/gallium/include/pipe/p_defines.h   |   1 +
 src/gallium/include/pipe/p_screen.h|  11 +
 src/gallium/include/pipe/p_state.h |   1 +
 src/mapi/glapi/gen/gl_API.xml  | 104 +
 src/mesa/main/config.h |   9 +
 src/mesa/main/dd.h |   8 +
 src/mesa/main/extensions_table.h   |   2 +
 src/mesa/main/fbobject.c   | 256 ++---
 src/mesa/main/fbobject.h   |  20 ++
 src/mesa/main/framebuffer.c|  10 +
 src/mesa/main/get.c|  31 +++
 src/mesa/main/get_hash_params.py   |   6 +
 src/mesa/main/mtypes.h |   9 +
 src/mesa/main/multisample.c|  18 ++
 src/mesa/main/tests/dispatch_sanity.cpp|  10 +
 src/mesa/state_tracker/st_atom.h   |   2 +-
 src/mesa/state_tracker/st_atom_list.h  |   2 +-
 src/mesa/state_tracker/st_atom_msaa.c  |  77 ++-
 src/mesa/state_tracker/st_cb_fbo.c |  14 ++
 src/mesa/state_tracker/st_cb_msaa.c|  27 +++
 src/mesa/state_tracker/st_context.c|   7 +-
 src/mesa/state_tracker/st_context.h|   6 +
 src/mesa/state_tracker/st_extensions.c |   1 +
 55 files changed, 1004 insertions(+), 84 deletions(-)

-- 
2.14.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 2/5] gallium: add support for programmable sample locations

2018-06-01 Thread Rhys Perry
Signed-off-by: Rhys Perry 
Reviewed-by: Brian Paul  (v2)
Reviewed-by: Marek Olšák  (v2)
---
 src/gallium/auxiliary/util/u_framebuffer.c   | 30 +
 src/gallium/auxiliary/util/u_framebuffer.h   |  5 +++
 src/gallium/docs/source/context.rst  | 14 
 src/gallium/docs/source/screen.rst   |  3 ++
 src/gallium/drivers/etnaviv/etnaviv_screen.c |  1 +
 src/gallium/drivers/freedreno/freedreno_screen.c |  1 +
 src/gallium/drivers/i915/i915_screen.c   |  1 +
 src/gallium/drivers/llvmpipe/lp_screen.c |  1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   |  1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   |  1 +
 src/gallium/drivers/r300/r300_screen.c   |  1 +
 src/gallium/drivers/r600/r600_pipe.c |  1 +
 src/gallium/drivers/radeonsi/si_get.c|  1 +
 src/gallium/drivers/softpipe/sp_screen.c |  1 +
 src/gallium/drivers/svga/svga_screen.c   |  1 +
 src/gallium/drivers/swr/swr_screen.cpp   |  1 +
 src/gallium/drivers/v3d/v3d_screen.c |  1 +
 src/gallium/drivers/vc4/vc4_screen.c |  1 +
 src/gallium/drivers/virgl/virgl_screen.c |  1 +
 src/gallium/include/pipe/p_context.h | 41 ++--
 src/gallium/include/pipe/p_defines.h |  1 +
 src/gallium/include/pipe/p_screen.h  | 11 +++
 src/gallium/include/pipe/p_state.h   |  1 +
 24 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_framebuffer.c 
b/src/gallium/auxiliary/util/u_framebuffer.c
index c2948a5cfb..5bafddc726 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -240,3 +240,33 @@ util_framebuffer_get_num_samples(const struct 
pipe_framebuffer_state *fb)
 
return 1;
 }
+
+
+/**
+ * Flip the sample location state along the Y axis.
+ */
+void
+util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height,
+ unsigned samples, uint8_t *locations)
+{
+   unsigned row, i, shift, grid_width, grid_height;
+   uint8_t new_locations[
+  PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE *
+  PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32];
+
+   screen->get_sample_pixel_grid(screen, samples, &grid_width, &grid_height);
+
+   shift = fb_height % grid_height;
+
+   for (row = 0; row < grid_height; row++) {
+  unsigned row_size = grid_width * samples;
+  for (i = 0; i < row_size; i++) {
+ unsigned dest_row = grid_height - row - 1;
+ /* this relies on unsigned integer wraparound behaviour */
+ dest_row = (dest_row - shift) % grid_height;
+ new_locations[dest_row * row_size + i] = locations[row * row_size + 
i];
+  }
+   }
+
+   memcpy(locations, new_locations, grid_width * grid_height * samples);
+}
diff --git a/src/gallium/auxiliary/util/u_framebuffer.h 
b/src/gallium/auxiliary/util/u_framebuffer.h
index c73942c9c1..877e6e393f 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.h
+++ b/src/gallium/auxiliary/util/u_framebuffer.h
@@ -64,6 +64,11 @@ extern unsigned
 util_framebuffer_get_num_samples(const struct pipe_framebuffer_state *fb);
 
 
+extern void
+util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height,
+ unsigned samples, uint8_t *locations);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/docs/source/context.rst 
b/src/gallium/docs/source/context.rst
index e8e80dcbc3..20d0df7931 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -68,6 +68,9 @@ objects. They all follow simple, one-method binding calls, 
e.g.
   that this takes effect even if multisampling is not explicitly enabled if
   the frambuffer surface(s) are multisampled.  Also, this mask is AND-ed
   with the optional fragment shader sample mask output (when emitted).
+* ``set_sample_locations`` sets the sample locations used for rasterization.
+  ``get_sample_position`` still returns the default locations. When NULL,
+  the default locations are used.
 * ``set_min_samples`` sets the minimum number of samples that must be run.
 * ``set_clip_state``
 * ``set_polygon_stipple``
@@ -270,6 +273,17 @@ format.
 multi-byte element value starting at offset bytes from resource start, going
 for size bytes. It is guaranteed that size % clear_value_size == 0.
 
+Evaluating Depth Buffers
+
+
+``evaluate_depth_buffer`` is a hint to decompress the current depth buffer
+assuming the current sample locations to avoid problems that could arise when
+using programmable sample locations.
+
+If a depth buffer is rendered with different sample location state than
+what is current at the time of reading the depth buffer, the values may differ
+because depth buffer compression can depend on the sample locations.
+
 
 Uploading
 ^
diff --

[Mesa-dev] [PATCH v3 1/5] mesa: add support for ARB_sample_locations

2018-06-01 Thread Rhys Perry
Signed-off-by: Rhys Perry 
Reviewed-by: Brian Paul  (v2)
Reviewed-by: Marek Olšák  (v2)
---
 src/mapi/glapi/gen/gl_API.xml   | 104 +
 src/mesa/main/config.h  |   9 ++
 src/mesa/main/dd.h  |   8 +
 src/mesa/main/extensions_table.h|   2 +
 src/mesa/main/fbobject.c| 256 
 src/mesa/main/fbobject.h|  20 +++
 src/mesa/main/framebuffer.c |  10 ++
 src/mesa/main/get.c |  31 
 src/mesa/main/get_hash_params.py|   6 +
 src/mesa/main/mtypes.h  |   9 ++
 src/mesa/main/multisample.c |  18 +++
 src/mesa/main/tests/dispatch_sanity.cpp |  10 ++
 12 files changed, 455 insertions(+), 28 deletions(-)

diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 8ad45970c9..49807e1ea5 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -10881,6 +10881,110 @@
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 
 
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 81573bfbf2..6a2f766222 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -315,4 +315,13 @@
 #define MAX_CLIPPED_VERTICES ((2 * (6 + MAX_CLIP_PLANES))+1)
 
 
+/** For GL_ARB_sample_locations - maximum of SAMPLE_LOCATION_PIXEL_GRID_*_ARB 
*/
+#define MAX_SAMPLE_LOCATION_GRID_SIZE 4
+
+/* It is theoretically possible for Consts.MaxSamples to be >32 but
+ * other code seems to assume that is not the case.
+ */
+#define MAX_SAMPLE_LOCATION_TABLE_SIZE \
+   (MAX_SAMPLE_LOCATION_GRID_SIZE * MAX_SAMPLE_LOCATION_GRID_SIZE * 32)
+
 #endif /* MESA_CONFIG_H_INCLUDED */
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 9f9606ac6b..1b048d3ff8 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -787,6 +787,14 @@ struct dd_function_table {
   GLenum target, GLsizei numAttachments,
   const GLenum *attachments);
 
+   /**
+* \name Functions for GL_ARB_sample_locations
+*/
+   void (*GetProgrammableSampleCaps)(struct gl_context *ctx,
+ const struct gl_framebuffer *fb,
+ GLuint *bits, GLuint *width, GLuint 
*height);
+   void (*EvaluateDepthValues)(struct gl_context *ctx);
+
/**
 * \name Query objects
 */
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 9207e3f8c6..ab1fd170bd 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -103,6 +103,7 @@ EXT(ARB_provoking_vertex, 
EXT_provoking_vertex
 EXT(ARB_query_buffer_object , ARB_query_buffer_object  
  , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_robust_buffer_access_behavior   , 
ARB_robust_buffer_access_behavior  , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_robustness  , dummy_true   
  , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_sample_locations, ARB_sample_locations 
  , GLL, GLC,  x ,  x , 2015)
 EXT(ARB_sample_shading  , ARB_sample_shading   
  , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_sampler_objects , dummy_true   
  , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_seamless_cube_map   , ARB_seamless_cube_map
  , GLL, GLC,  x ,  x , 2009)
@@ -350,6 +351,7 @@ EXT(NV_read_buffer  , dummy_true
 EXT(NV_read_depth   , dummy_true   
  ,  x ,  x ,  x , ES2, 2011)
 EXT(NV_read_depth_stencil   , dummy_true   
  ,  x ,  x ,  x , ES2, 2011)
 EXT(NV_read_stencil , dummy_true   
  ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_sample_locations , ARB_sample_locations 
  , GLL, GLC,  x , ES2, 2015)
 EXT(NV_texgen_reflection, dummy_true   
  , GLL,  x ,  x ,  x , 1999)
 EXT(NV_texture_barrier  , NV_texture_barrier   
  , GLL, GLC,  x ,  x , 2009)
 EXT(NV_texture_env_combine4 , NV_texture_env_combine4  
  , GLL,  x ,  x ,

[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for ARB_sample_locations

2018-06-01 Thread Rhys Perry
Signed-off-by: Rhys Perry 
Reviewed-by: Brian Paul  (v2)
Reviewed-by: Marek Olšák  (v2)
---
 src/mesa/state_tracker/st_atom.h   |  2 +-
 src/mesa/state_tracker/st_atom_list.h  |  2 +-
 src/mesa/state_tracker/st_atom_msaa.c  | 77 +-
 src/mesa/state_tracker/st_cb_fbo.c | 14 +++
 src/mesa/state_tracker/st_cb_msaa.c| 27 
 src/mesa/state_tracker/st_context.c|  7 ++--
 src/mesa/state_tracker/st_context.h|  6 +++
 src/mesa/state_tracker/st_extensions.c |  1 +
 8 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 2567ad30df..96e128d38c 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -86,7 +86,7 @@ enum {
  ST_NEW_CS_SAMPLERS)
 
 #define ST_NEW_FRAMEBUFFER  (ST_NEW_FB_STATE | \
- ST_NEW_SAMPLE_MASK | \
+ ST_NEW_SAMPLE_STATE | \
  ST_NEW_SAMPLE_SHADING)
 
 #define ST_NEW_VERTEX_PROGRAM(st, p) (p->affected_states | \
diff --git a/src/mesa/state_tracker/st_atom_list.h 
b/src/mesa/state_tracker/st_atom_list.h
index 5391d4710c..e1aebc91e7 100644
--- a/src/mesa/state_tracker/st_atom_list.h
+++ b/src/mesa/state_tracker/st_atom_list.h
@@ -34,7 +34,7 @@ ST_STATE(ST_NEW_FS_IMAGES, st_bind_fs_images)
 ST_STATE(ST_NEW_FB_STATE, st_update_framebuffer_state) /* depends on 
update_*_texture and bind_*_images */
 ST_STATE(ST_NEW_BLEND, st_update_blend) /* depends on update_framebuffer_state 
*/
 ST_STATE(ST_NEW_RASTERIZER, st_update_rasterizer) /* depends on 
update_framebuffer_state */
-ST_STATE(ST_NEW_SAMPLE_MASK, st_update_sample_mask) /* depends on 
update_framebuffer_state */
+ST_STATE(ST_NEW_SAMPLE_STATE, st_update_sample_state) /* depends on 
update_framebuffer_state */
 ST_STATE(ST_NEW_SAMPLE_SHADING, st_update_sample_shading)
 ST_STATE(ST_NEW_SCISSOR, st_update_scissor) /* depends on 
update_framebuffer_state */
 ST_STATE(ST_NEW_VIEWPORT, st_update_viewport) /* depends on 
update_framebuffer_state */
diff --git a/src/mesa/state_tracker/st_atom_msaa.c 
b/src/mesa/state_tracker/st_atom_msaa.c
index 556c7c5889..c6affec552 100644
--- a/src/mesa/state_tracker/st_atom_msaa.c
+++ b/src/mesa/state_tracker/st_atom_msaa.c
@@ -33,13 +33,84 @@
 #include "st_program.h"
 
 #include "cso_cache/cso_context.h"
+#include "util/u_framebuffer.h"
 #include "main/framebuffer.h"
 
 
-/* Update the sample mask for MSAA.
+/**
+ * Update the sample locations
+ */
+static void
+update_sample_locations(struct st_context *st)
+{
+   struct gl_framebuffer *fb = st->ctx->DrawBuffer;
+
+   if (!st->ctx->Extensions.ARB_sample_locations)
+  return;
+
+   if (fb->ProgrammableSampleLocations) {
+  unsigned grid_width, grid_height, size, pixel, sample_index;
+  unsigned samples = st->state.fb_num_samples;
+  bool sample_location_pixel_grid = fb->SampleLocationPixelGrid;
+  uint8_t locations[
+ PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE *
+ PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32];
+
+  st->pipe->screen->get_sample_pixel_grid(
+ st->pipe->screen, samples, &grid_width, &grid_height);
+  size = grid_width * grid_height * samples;
+
+  /**
+   * when a dimension is greater than MAX_SAMPLE_LOCATION_GRID_SIZE,
+   * st->ctx->Driver.GetSamplePixelGrid() returns 1 for both dimensions.
+   */
+  if (grid_width > MAX_SAMPLE_LOCATION_GRID_SIZE ||
+  grid_height > MAX_SAMPLE_LOCATION_GRID_SIZE)
+ sample_location_pixel_grid = false;
+
+  for (pixel = 0; pixel < grid_width * grid_height; pixel++) {
+ for (sample_index = 0; sample_index < samples; sample_index++) {
+int table_index = sample_index;
+float x = 0.5f, y = 0.5f;
+uint8_t loc;
+if (sample_location_pixel_grid)
+   table_index = pixel * samples + sample_index;
+if (fb->SampleLocationTable) {
+   x = fb->SampleLocationTable[table_index*2];
+   y = fb->SampleLocationTable[table_index*2+1];
+}
+if (st->state.fb_orientation == Y_0_BOTTOM)
+   y = 1.0 - y;
+
+loc = roundf(CLAMP(x * 16.0f, 0.0f, 15.0f));
+loc |= (int)roundf(CLAMP(y * 16.0f, 0.0f, 15.0f)) << 4;
+locations[pixel * samples + sample_index] = loc;
+ }
+  }
+
+  util_sample_locations_flip_y(
+ st->pipe->screen, st->state.fb_height, samples, locations);
+
+  if (!st->state.enable_sample_locations ||
+  st->state.sample_locations_samples != samples ||
+  memcmp(locations, st->state.sample_locations, size) != 0) {
+ st->pipe->set_sample_locations( st->pipe, size, locations);
+ 

[Mesa-dev] [PATCH v3 4/5] nvc0: add support for programmable sample locations

2018-06-01 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_driver.h   |   2 +
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |   7 +
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 102 --
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h|   2 +
 src/gallium/drivers/nouveau/nvc0/nvc0_context.h|  15 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c|   3 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c |  33 -
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c  |  17 ++-
 .../drivers/nouveau/nvc0/nvc0_state_validate.c | 152 +
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c|  12 ++
 10 files changed, 299 insertions(+), 46 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 3d0782f86b..7c835ceab8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -73,6 +73,7 @@ struct nv50_ir_prog_symbol
 #define NVISA_GK104_CHIPSET0xe0
 #define NVISA_GK20A_CHIPSET0xea
 #define NVISA_GM107_CHIPSET0x110
+#define NVISA_GM200_CHIPSET0x120
 
 struct nv50_ir_prog_info
 {
@@ -145,6 +146,7 @@ struct nv50_ir_prog_info
  bool persampleInvocation;
  bool usesSampleMaskIn;
  bool readsFramebuffer;
+ bool readsSampleLocations;
   } fp;
   struct {
  uint32_t inputOffset; /* base address for user args */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 3c5bad05fe..d7844d7381 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1520,6 +1520,10 @@ void Source::scanInstructionSrc(const Instruction& insn,
  info->out[src.getIndex(0)].oread = 1;
   }
}
+   if (src.getFile() == TGSI_FILE_SYSTEM_VALUE) {
+  if (info->sv[src.getIndex(0)].sn == TGSI_SEMANTIC_SAMPLEPOS)
+ info->prop.fp.readsSampleLocations = true;
+   }
if (src.getFile() != TGSI_FILE_INPUT)
   return;
 
@@ -1560,6 +1564,9 @@ bool Source::scanInstruction(const struct 
tgsi_full_instruction *inst)
if (insn.getOpcode() == TGSI_OPCODE_FBFETCH)
   info->prop.fp.readsFramebuffer = true;
 
+   if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE)
+  info->prop.fp.readsSampleLocations = true;
+
if (insn.dstCount()) {
   Instruction::DstRegister dst = insn.getDst(0);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 29f674b451..5723847234 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2662,17 +2662,33 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
   ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
   break;
case SV_SAMPLE_POS: {
-  Value *off = new_LValue(func, FILE_GPR);
-  ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
+  Value *sampleID = bld.getScratch();
+  ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
   ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
-  bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
-  bld.mkLoad(TYPE_F32,
- i->getDef(0),
- bld.mkSymbol(
-   FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
-   TYPE_U32, prog->driver->io.sampleInfoBase +
-   4 * sym->reg.data.sv.index),
- off);
+  Value *offset = calculateSampleOffset(sampleID);
+
+  assert(prog->driver->prop.fp.readsSampleLocations);
+
+  if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
+ bld.mkLoad(TYPE_F32,
+i->getDef(0),
+bld.mkSymbol(
+  FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+  TYPE_U32, prog->driver->io.sampleInfoBase),
+offset);
+ bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
+   bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
+ bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
+ bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), 
bld.mkImm(1.0f / 16.0f));
+  } else {
+ bld.mkLoad(TYPE_F32,
+i->getDef(0),
+bld.mkSymbol(
+  FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+  TYPE_U32, prog->driver->io.sampleInfoBase +
+  4 * sym->reg.data.sv.index),
+offset);
+  }
   break;
}
case SV_SAMPLE_MASK: {
@@ -2832,6 +2848,69 @@ NVC0Lo

[Mesa-dev] [PATCH v3 5/5] docs: document addition of GL_ARB_sample_locations for nvc0

2018-06-01 Thread Rhys Perry
Signed-off-by: Rhys Perry 
Reviewed-by: Brian Paul  (v2)
---
 docs/features.txt | 2 +-
 docs/relnotes/18.2.0.html | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index e786bbecf4..2eac14fb32 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -305,7 +305,7 @@ Khronos, ARB, and OES extensions that are not part of any 
OpenGL or OpenGL ES ve
   GL_ARB_parallel_shader_compilenot started, but 
Chia-I Wu did some related work in 2014
   GL_ARB_post_depth_coverageDONE (i965, nvc0)
   GL_ARB_robustness_isolation   not started
-  GL_ARB_sample_locations   not started
+  GL_ARB_sample_locations   DONE (nvc0)
   GL_ARB_seamless_cubemap_per_texture   DONE (i965, nvc0, 
radeonsi, r600, softpipe, swr)
   GL_ARB_shader_ballot  DONE (i965/gen8+, 
nvc0, radeonsi)
   GL_ARB_shader_clock   DONE (i965/gen7+, 
nv50, nvc0, r600, radeonsi)
diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
index f3bdb6605c..1e24d9c9de 100644
--- a/docs/relnotes/18.2.0.html
+++ b/docs/relnotes/18.2.0.html
@@ -44,7 +44,7 @@ Note: some of the new features are only available with 
certain drivers.
 
 
 
-TBD
+GL_ARB_sample_locations and GL_NV_sample_locations on nvc0 (GM200+)
 
 
 Bug fixes
-- 
2.14.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nv50/ir: fix image stores with indirect handles

2018-06-05 Thread Rhys Perry
Having this if statement here prevented the next if statement from being
reached in the case of image stores, which is needed for instructions with
indirect bindless handles like "STORE TEMP[ADDR[2].x+1](1) ...".

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 3c5bad05fe..7712963c53 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1563,6 +1563,11 @@ bool Source::scanInstruction(const struct 
tgsi_full_instruction *inst)
if (insn.dstCount()) {
   Instruction::DstRegister dst = insn.getDst(0);
 
+  if (insn.getOpcode() == TGSI_OPCODE_STORE &&
+  dst.getFile() != TGSI_FILE_MEMORY) {
+ info->io.globalAccess |= 0x2;
+  }
+
   if (dst.getFile() == TGSI_FILE_OUTPUT) {
  if (dst.isIndirect(0))
 for (unsigned i = 0; i < info->numOutputs; ++i)
@@ -1580,10 +1585,6 @@ bool Source::scanInstruction(const struct 
tgsi_full_instruction *inst)
  if (isEdgeFlagPassthrough(insn))
 info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
   } else
-  if (dst.getFile() != TGSI_FILE_MEMORY &&
-  insn.getOpcode() == TGSI_OPCODE_STORE) {
- info->io.globalAccess |= 0x2;
-  } else
   if (dst.getFile() == TGSI_FILE_TEMPORARY) {
  if (dst.isIndirect(0))
 indirectTempArrays.insert(dst.getArrayId());
 
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless

2018-06-06 Thread Rhys Perry
Previously, there were some errors in the compiler's implementation of
ARB_bindless_texture, mostly related to usage of bound image or sampler
handles allowed by ARB_bindless_texture, resulting in assertions or
compilation errors. This series fixes the following issues found in mesa:
- Assertions when casting bound handles to uvec2
- Compilation errors when using the ?: operator with bound handles
- Assertions creating a constant image/sampler handle
   - For example: image2D(uvec2(5, 6))
- Inlining of function calls with rvalues other than dereferences to
  handle uniforms passed into them creates assertion failures
- Usage of bound handles as l-values

In order to create bindless handles from bound images or samplers, two new
TGSI opcodes needed to be added: SAMP2HND and IMG2HND. These are used when
casting bound handles or when using them as l-values (e.g. using them with
the ?: operator).

This series has the following limitations because I don't have the
hardware needed to test the needed changes:
- radeonsi and gallivm do not handle SAMP2HND and IMG2HND
- similar instructions/intrinsics for nir have not been added
- the tgsi to nir conversion code does not handle SAMP2HND and IMG2HND
- IMG2HND with Kepler is not implemented
Usage of bound handles as l-values and casting them is handled better than
before though.

Some tests for these changes have been posted on the piglit mailing list.

Rhys Perry (6):
  gallium: add new SAMP2HND and IMG2HND opcodes
  nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+
  glsl_to_tgsi: allow bound samplers and images to be used as l-values
  glsl: allow ?: operator with images and samplers when bindless is enabled
  glsl,glsl_to_tgsi: fix sampler/image constants
  glsl: fix function inlining with opaque parameters

 src/compiler/glsl/ast_to_hir.cpp   |  8 ++-
 src/compiler/glsl/ir.cpp   | 32 +-
 src/compiler/glsl/opt_function_inlining.cpp| 52 +---
 src/gallium/auxiliary/tgsi/tgsi_info.c |  2 +
 src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h |  4 +-
 src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h   |  3 +
 src/gallium/docs/source/tgsi.rst   | 25 
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  2 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.h  |  2 +
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 22 +++
 .../drivers/nouveau/codegen/nv50_ir_inlines.h  |  4 +-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 25 
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h|  1 +
 .../drivers/nouveau/codegen/nv50_ir_print.cpp  |  2 +
 .../drivers/nouveau/codegen/nv50_ir_target.cpp |  7 ++-
 src/gallium/include/pipe/p_shader_tokens.h |  2 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 69 --
 src/mesa/state_tracker/st_glsl_to_tgsi_private.h   |  1 +
 18 files changed, 239 insertions(+), 24 deletions(-)

-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/6] glsl: fix function inlining with opaque parameters

2018-06-06 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/compiler/glsl/opt_function_inlining.cpp | 52 -
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/compiler/glsl/opt_function_inlining.cpp 
b/src/compiler/glsl/opt_function_inlining.cpp
index 04690b6cf4..52f57da936 100644
--- a/src/compiler/glsl/opt_function_inlining.cpp
+++ b/src/compiler/glsl/opt_function_inlining.cpp
@@ -131,6 +131,18 @@ ir_save_lvalue_visitor::visit_enter(ir_dereference_array 
*deref)
return visit_stop;
 }
 
+static bool
+should_replace_variable(ir_variable *sig_param, ir_rvalue *param) {
+   /* For opaque types, we want the inlined variable references
+* referencing the passed in variable, since that will have
+* the location information, which an assignment of an opaque
+* variable wouldn't.
+*/
+   return sig_param->type->contains_opaque() &&
+  param->is_dereference() &&
+  sig_param->data.mode == ir_var_function_in;
+}
+
 void
 ir_call::generate_inline(ir_instruction *next_ir)
 {
@@ -155,12 +167,8 @@ ir_call::generate_inline(ir_instruction *next_ir)
   ir_rvalue *param = (ir_rvalue *) actual_node;
 
   /* Generate a new variable for the parameter. */
-  if (sig_param->type->contains_opaque()) {
-/* For opaque types, we want the inlined variable references
- * referencing the passed in variable, since that will have
- * the location information, which an assignment of an opaque
- * variable wouldn't.  Fix it up below.
- */
+  if (should_replace_variable(sig_param, param)) {
+ /* Actual replacement happens below */
 parameters[i] = NULL;
   } else {
 parameters[i] = sig_param->clone(ctx, ht);
@@ -242,10 +250,9 @@ ir_call::generate_inline(ir_instruction *next_ir)
   ir_rvalue *const param = (ir_rvalue *) actual_node;
   ir_variable *sig_param = (ir_variable *) formal_node;
 
-  if (sig_param->type->contains_opaque()) {
+  if (should_replace_variable(sig_param, param)) {
 ir_dereference *deref = param->as_dereference();
 
-assert(deref);
 do_variable_replacement(&new_instructions, sig_param, deref);
   }
}
@@ -351,6 +358,9 @@ public:
virtual ir_visitor_status visit_leave(ir_dereference_array *);
virtual ir_visitor_status visit_leave(ir_dereference_record *);
virtual ir_visitor_status visit_leave(ir_texture *);
+   virtual ir_visitor_status visit_leave(ir_assignment *);
+   virtual ir_visitor_status visit_leave(ir_expression *);
+   virtual ir_visitor_status visit_leave(ir_return *);
 
void replace_deref(ir_dereference **deref);
void replace_rvalue(ir_rvalue **rvalue);
@@ -391,6 +401,32 @@ ir_variable_replacement_visitor::visit_leave(ir_texture 
*ir)
return visit_continue;
 }
 
+ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_assignment *ir)
+{
+   replace_deref(&ir->lhs);
+   replace_rvalue(&ir->rhs);
+
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_expression *ir)
+{
+   for (uint8_t i = 0; i < ir->num_operands; i++)
+  replace_rvalue(&ir->operands[i]);
+
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_return *ir)
+{
+   replace_rvalue(&ir->value);
+
+   return visit_continue;
+}
+
 ir_visitor_status
 ir_variable_replacement_visitor::visit_leave(ir_dereference_array *ir)
 {
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/6] glsl: allow ?: operator with images and samplers when bindless is enabled

2018-06-06 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/compiler/glsl/ast_to_hir.cpp | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 3bf581571e..8a7dd62506 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1850,9 +1850,11 @@ ast_expression::do_hir(exec_list *instructions,
*   expressions; such use results in a compile-time error."
*/
   if (type->contains_opaque()) {
- _mesa_glsl_error(&loc, state, "opaque variables cannot be operands "
-  "of the ?: operator");
- error_emitted = true;
+ if (!(state->has_bindless() && (type->is_image() || 
type->is_sampler()))) {
+_mesa_glsl_error(&loc, state, "variables of type %s cannot be "
+ "operands of the ?: operator", type->name);
+error_emitted = true;
+ }
   }
 
   ir_constant *cond_val = op[0]->constant_expression_value(ctx);
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/6] nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+

2018-06-06 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  2 ++
 src/gallium/drivers/nouveau/codegen/nv50_ir.h  |  2 ++
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 22 +++
 .../drivers/nouveau/codegen/nv50_ir_inlines.h  |  4 ++--
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 25 ++
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h|  1 +
 .../drivers/nouveau/codegen/nv50_ir_print.cpp  |  2 ++
 .../drivers/nouveau/codegen/nv50_ir_target.cpp |  7 +++---
 8 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index c987da9908..7c1c76a912 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -903,6 +903,8 @@ TexInstruction::TexInstruction(Function *fn, operation op)
 
if (op == OP_TXF)
   sType = TYPE_U32;
+   if (op == OP_SAMP2HND || op == OP_IMG2HND)
+  setType(TYPE_U32);
 }
 
 TexInstruction::~TexInstruction()
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f4f3c70888..97aa8d1109 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -134,6 +134,8 @@ enum operation
OP_SUCLAMP, // clamp surface coordinates
OP_SUEAU,   // surface effective address
OP_SUQ, // surface query
+   OP_SAMP2HND, // convert bound texture to bindless handle
+   OP_IMG2HND, // convert bound image to bindless handle
OP_MADSP,   // special integer multiply-add
OP_TEXBAR, // texture dependency barrier
OP_DFDX,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 3c5bad05fe..8149c72dd1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -3570,6 +3570,28 @@ Converter::handleInstruction(const struct 
tgsi_full_instruction *insn)
   handleTXQ(dst0, TXQ_TYPE, 0);
   std::swap(dst0[0], dst0[2]);
   break;
+   case TGSI_OPCODE_IMG2HND:
+   case TGSI_OPCODE_SAMP2HND:
+  if (!tgsi.getDst(0).isMasked(1))
+ mkOp1(OP_MOV, TYPE_U32, dst0[1], mkImm(0));
+
+  if (!tgsi.getDst(0).isMasked(0)) {
+ bool is_image = tgsi.getOpcode() == TGSI_OPCODE_IMG2HND;
+
+ TexInstruction *texi = new_TexInstruction(
+func, is_image ? OP_IMG2HND : OP_SAMP2HND);
+ texi->setDef(0, dst0[0]);
+ if (is_image)
+texi->tex.target = tgsi.getImageTarget();
+ else
+texi->tex.target = tgsi.getTexture(code, 0);
+ texi->tex.r = tgsi.getSrc(0).getIndex(0);
+ if (tgsi.getSrc(0).isIndirect(0))
+texi->setIndirectR(fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 
NULL));
+
+ bb->insertTail(texi);
+  }
+  break;
case TGSI_OPCODE_FBFETCH:
   handleFBFETCH(dst0);
   break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
index 4cb53ab42e..0262ae9d1f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -311,14 +311,14 @@ const FlowInstruction *Instruction::asFlow() const
 
 TexInstruction *Instruction::asTex()
 {
-   if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+   if ((op >= OP_TEX && op <= OP_SULEA) || (op >= OP_SUQ && op <= OP_IMG2HND))
   return static_cast<TexInstruction *>(this);
return NULL;
 }
 
 const TexInstruction *Instruction::asTex() const
 {
-   if ((op >= OP_TEX && op <= OP_SULEA) || op == OP_SUQ)
+   if ((op >= OP_TEX && op <= OP_SULEA) || (op >= OP_SUQ && op <= OP_IMG2HND))
   return static_cast<const TexInstruction *>(this);
return NULL;
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 29f674b451..c2cc120147 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1347,6 +1347,27 @@ NVC0LoweringPass::handleBUFQ(Instruction *bufq)
return true;
 }
 
+bool
+NVC0LoweringPass::handle2HND(TexInstruction *i)
+{
+   assert(targ->getChipset() >= NVISA_GK104_CHIPSET);
+   assert(!i->tex.bindless);
+   bool is_sampler = i->op == OP_SAMP2HND;
+
+   if (is_sampler || targ->getChipset() >= NVISA_GM107_CHIPSET) {
+  //Sampler or image on GM107+
+  uint16_t slot = (is_sampler ? 0 : 32) + i->tex.r;
+  Value *hnd = loadTexHandle(i->getIndirectR(), slot);
+  bld.mkOp1(OP_MOV, TYPE_U32, i->getDef(0), hnd);
+   } else {
+  //Image on NVE4/GK104
+  assert(!"not implemented&

[Mesa-dev] [PATCH 5/6] glsl, glsl_to_tgsi: fix sampler/image constants

2018-06-06 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/compiler/glsl/ir.cpp   | 32 --
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 14 ++---
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp
index e3134eaa1c..1d1a56ae9a 100644
--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@@ -820,6 +820,10 @@ ir_constant::ir_constant(const struct glsl_type *type, 
exec_list *value_list)
for (unsigned i = 0; i < type->components(); i++)
   this->value.b[i] = value->value.b[0];
break;
+case GLSL_TYPE_SAMPLER:
+case GLSL_TYPE_IMAGE:
+   this->value.u64[0] = value->value.u64[0];
+   break;
 default:
assert(!"Should not get here.");
break;
@@ -939,6 +943,8 @@ ir_constant::get_bool_component(unsigned i) const
case GLSL_TYPE_FLOAT: return ((int)this->value.f[i]) != 0;
case GLSL_TYPE_BOOL:  return this->value.b[i];
case GLSL_TYPE_DOUBLE: return this->value.d[i] != 0.0;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return this->value.u64[i] != 0;
case GLSL_TYPE_INT64:  return this->value.i64[i] != 0;
default:  assert(!"Should not get here."); break;
@@ -959,6 +965,8 @@ ir_constant::get_float_component(unsigned i) const
case GLSL_TYPE_FLOAT: return this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1.0f : 0.0f;
case GLSL_TYPE_DOUBLE: return (float) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return (float) this->value.u64[i];
case GLSL_TYPE_INT64:  return (float) this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -979,6 +987,8 @@ ir_constant::get_double_component(unsigned i) const
case GLSL_TYPE_FLOAT: return (double) this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1.0 : 0.0;
case GLSL_TYPE_DOUBLE: return this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return (double) this->value.u64[i];
case GLSL_TYPE_INT64:  return (double) this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -999,6 +1009,8 @@ ir_constant::get_int_component(unsigned i) const
case GLSL_TYPE_FLOAT: return (int) this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
case GLSL_TYPE_DOUBLE: return (int) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return (int) this->value.u64[i];
case GLSL_TYPE_INT64:  return (int) this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -1019,6 +1031,8 @@ ir_constant::get_uint_component(unsigned i) const
case GLSL_TYPE_FLOAT: return (unsigned) this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
case GLSL_TYPE_DOUBLE: return (unsigned) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return (unsigned) this->value.u64[i];
case GLSL_TYPE_INT64:  return (unsigned) this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -1039,6 +1053,8 @@ ir_constant::get_int64_component(unsigned i) const
case GLSL_TYPE_FLOAT: return (int64_t) this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
case GLSL_TYPE_DOUBLE: return (int64_t) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return (int64_t) this->value.u64[i];
case GLSL_TYPE_INT64:  return this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -1059,6 +1075,8 @@ ir_constant::get_uint64_component(unsigned i) const
case GLSL_TYPE_FLOAT: return (uint64_t) this->value.f[i];
case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
case GLSL_TYPE_DOUBLE: return (uint64_t) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64: return this->value.u64[i];
case GLSL_TYPE_INT64:  return (uint64_t) this->value.i64[i];
default:  assert(!"Should not get here."); break;
@@ -1110,6 +1128,8 @@ ir_constant::copy_offset(ir_constant *src, int offset)
case GLSL_TYPE_INT:
case GLSL_TYPE_FLOAT:
case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_INT64:
case GLSL_TYPE_BOOL: {
@@ -1132,7 +1152,9 @@ ir_constant::copy_offset(ir_constant *src, int offset)
 case GLSL_TYPE_DOUBLE:
value.d[i+offset] = src->get_double_component(i);
break;
- case GLSL

[Mesa-dev] [PATCH 3/6] glsl_to_tgsi: allow bound samplers and images to be used as l-values

2018-06-06 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp   | 55 +++-
 src/mesa/state_tracker/st_glsl_to_tgsi_private.h |  1 +
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b321112cf8..7938753453 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -316,6 +316,7 @@ public:
   st_src_reg *indirect,
   unsigned *location);
st_src_reg canonicalize_gather_offset(st_src_reg offset);
+   bool handle_bound_deref(ir_dereference *ir);
 
bool try_emit_mad(ir_expression *ir,
   int mul_operand);
@@ -2439,10 +2440,15 @@ st_translate_interp_loc(ir_variable *var)
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 {
-   variable_storage *entry = find_variable_storage(ir->var);
+   variable_storage *entry;
ir_variable *var = ir->var;
bool remove_array;
 
+   if (handle_bound_deref(ir->as_dereference()))
+  return;
+
+   entry = find_variable_storage(ir->var);
+
if (!entry) {
   switch (var->data.mode) {
   case ir_var_uniform:
@@ -2669,6 +2675,9 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
bool is_2D = false;
ir_variable *var = ir->variable_referenced();
 
+   if (handle_bound_deref(ir->as_dereference()))
+  return;
+
/* We only need the logic provided by st_glsl_storage_type_size()
 * for arrays of structs. Indirect sampler and image indexing is handled
 * elsewhere.
@@ -2768,6 +2777,9 @@ glsl_to_tgsi_visitor::visit(ir_dereference_record *ir)
ir_variable *var = ir->record->variable_referenced();
int offset = 0;
 
+   if (handle_bound_deref(ir->as_dereference()))
+  return;
+
ir->record->accept(this);
 
assert(ir->field_idx >= 0);
@@ -4110,6 +4122,45 @@ 
glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
 
return offset;
 }
+ 
+bool
+glsl_to_tgsi_visitor::handle_bound_deref(ir_dereference *ir)
+{
+   ir_variable *var = ir->variable_referenced();
+
+   if (!var || var->data.mode != ir_var_uniform || var->data.bindless ||
+   !(ir->type->is_image() || ir->type->is_sampler()))
+  return false;
+
+   //Convert from bound sampler/image to bindless handle
+   bool is_image = ir->type->is_image();
+   st_src_reg resource(is_image ? PROGRAM_IMAGE : PROGRAM_SAMPLER, 0, 
GLSL_TYPE_UINT);
+   uint16_t index = 0;
+   unsigned array_size = 1, base = 0;
+   st_src_reg reladdr;
+   get_deref_offsets(ir, &array_size, &base, &index, &reladdr, true);
+
+   resource.index = index;
+   if (reladdr.file != PROGRAM_UNDEFINED) {
+  resource.reladdr = ralloc(mem_ctx, st_src_reg);
+  *resource.reladdr = reladdr;
+  emit_arl(ir, sampler_reladdr, reladdr);
+   }
+
+   this->result = get_temp(glsl_type::uvec2_type);
+   st_dst_reg dst(this->result);
+   dst.writemask = WRITEMASK_XY;
+
+   glsl_to_tgsi_instruction *inst = emit_asm(
+  ir, is_image ? TGSI_OPCODE_IMG2HND : TGSI_OPCODE_SAMP2HND, dst);
+
+   inst->tex_target = ir->type->sampler_index();
+   inst->resource = resource;
+   inst->sampler_array_size = array_size;
+   inst->sampler_base = base;
+
+   return true;
+}
 
 void
 glsl_to_tgsi_visitor::visit(ir_texture *ir)
@@ -5904,6 +5955,7 @@ compile_tgsi_instruction(struct st_translate *t,
case TGSI_OPCODE_TXL2:
case TGSI_OPCODE_TG4:
case TGSI_OPCODE_LODQ:
+   case TGSI_OPCODE_SAMP2HND:
   if (inst->resource.file == PROGRAM_SAMPLER) {
  src[num_src] = t->samplers[inst->resource.index];
   } else {
@@ -5942,6 +5994,7 @@ compile_tgsi_instruction(struct st_translate *t,
case TGSI_OPCODE_ATOMUMAX:
case TGSI_OPCODE_ATOMIMIN:
case TGSI_OPCODE_ATOMIMAX:
+   case TGSI_OPCODE_IMG2HND:
   for (i = num_src - 1; i >= 0; i--)
  src[i + 1] = src[i];
   num_src++;
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h 
b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
index c482828edd..fccb7041cf 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
@@ -179,6 +179,7 @@ is_resource_instruction(unsigned opcode)
case TGSI_OPCODE_ATOMUMAX:
case TGSI_OPCODE_ATOMIMIN:
case TGSI_OPCODE_ATOMIMAX:
+   case TGSI_OPCODE_IMG2HND:
   return true;
default:
   return false;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] gallium: add new SAMP2HND and IMG2HND opcodes

2018-06-06 Thread Rhys Perry
This commit does not add support for the opcodes in gallivm or tgsi_to_nir.c

Signed-off-by: Rhys Perry 
---
 src/gallium/auxiliary/tgsi/tgsi_info.c |  2 ++
 src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h |  4 ++--
 src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h   |  3 +++
 src/gallium/docs/source/tgsi.rst   | 25 +
 src/gallium/include/pipe/p_shader_tokens.h |  2 ++
 5 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c 
b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 4aa658785c..bbe1a21e43 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -153,6 +153,8 @@ tgsi_opcode_infer_type(enum tgsi_opcode opcode)
case TGSI_OPCODE_POPC:
case TGSI_OPCODE_LSB:
case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_IMG2HND:
+   case TGSI_OPCODE_SAMP2HND:
   return TGSI_TYPE_UNSIGNED;
case TGSI_OPCODE_ARL:
case TGSI_OPCODE_ARR:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h 
b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
index 1b2803cf3f..c3787c2fbb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
@@ -162,8 +162,8 @@ OPCODE(1, 1, COMP, IABS)
 OPCODE(1, 1, COMP, ISSG)
 OPCODE(1, 2, OTHR, LOAD)
 OPCODE(1, 2, OTHR, STORE, .is_store = 1)
-OPCODE_GAP(163) /* removed */
-OPCODE_GAP(164) /* removed */
+OPCODE(1, 1, OTHR, IMG2HND)
+OPCODE(1, 1, OTHR, SAMP2HND, .is_tex = 1)
 OPCODE_GAP(165) /* removed */
 OPCODE(0, 0, OTHR, BARRIER)
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h 
b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index 9a13fa6684..54a1ee15b6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -160,6 +160,9 @@ OP13(UCMP)
 OP11(IABS)
 OP11(ISSG)
 
+OP11(IMG2HND)
+OP11(SAMP2HND)
+
 OP12(IMUL_HI)
 OP12(UMUL_HI)
 
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 9e956586c4..a4a78e6267 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2592,6 +2592,31 @@ For these opcodes, the resource can be a BUFFER, IMAGE, 
or MEMORY.
   barrier in between.
 
 
+.. _bindlessopcodes:
+
+Bindless Opcodes
+^^^^^^^^^^^^^^^^
+
+These opcodes are for working with bindless sampler or image handles and
+require PIPE_CAP_BINDLESS_TEXTURE.
+
+.. opcode:: IMG2HND - Get a bindless handle for an image
+
+  Syntax: ``IMG2HND dst, image``
+
+  Example: ``IMG2HND TEMP[0], IMAGE[0]``
+
+  Sets 'dst' to a bindless handle for 'image'.
+
+.. opcode:: SAMP2HND - Get a bindless handle for a sampler view
+
+  Syntax: ``SAMP2HND dst, sampler``
+
+  Example: ``SAMP2HND TEMP[0], SVIEW[0]``
+
+  Sets 'dst' to a bindless handle for 'sampler'.
+
+
 .. _threadsyncopcodes:
 
 Inter-thread synchronization opcodes
diff --git a/src/gallium/include/pipe/p_shader_tokens.h 
b/src/gallium/include/pipe/p_shader_tokens.h
index f4e45c2560..08ed08156e 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -506,6 +506,8 @@ enum tgsi_opcode {
 
TGSI_OPCODE_LOAD   = 161,
TGSI_OPCODE_STORE  = 162,
+   TGSI_OPCODE_IMG2HND= 163,
+   TGSI_OPCODE_SAMP2HND   = 164,
/* gap */
TGSI_OPCODE_BARRIER= 166,
 
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless

2018-06-06 Thread Rhys Perry
Oops, I meant r-values, not l-values.
Seems the meaning of the word in my head changed at some point.

On Wed, Jun 6, 2018 at 8:55 PM, Rhys Perry  wrote:
> Previously, there were some errors in the compiler's implementation of
> ARB_bindless_texture, mostly related to usage of bound image or sampler
> handles allowed by ARB_bindless_texture, resulting in assertions or
> compilation errors. This series fixes the following issues found in mesa:
> - Assertions when casting bound handles to uvec2
> - Compilation errors when using the ?: operator with bound handles
> - Assertions creating a constant image/sampler handle
>- For example: image2D(uvec2(5, 6))
> - Inlining of function calls with rvalues other than dereferences to
>   handle uniforms passed into them creates assertion failures
> - Usage of bound handles as l-values
>
> In order to create bindless handles from bound images or samplers, two new
> TGSI opcodes needed to be added: SAMP2HND and IMG2HND. These are used when
> casting bound handles or when using them as l-values (e.g. using them with
> the ?: operator).
>
> This series has the following limitations because I don't have the
> hardware needed to test the needed changes:
> - radeonsi and gallivm do not handle SAMP2HND and IMG2HND
> - similar instructions/intrinsics for nir have not been added
> - the tgsi to nir conversion code does not handle SAMP2HND and IMG2HND
> - IMG2HND with Kepler is not implemented
> Usage of bound handles as l-values and casting them is handled better than
> before though.
>
> Some tests for these changes have been posted on the piglit mailing list.
>
> Rhys Perry (6):
>   gallium: add new SAMP2HND and IMG2HND opcodes
>   nv50/ir: add support for SAMP2HND on gk104+ and IMG2HND on gm107+
>   glsl_to_tgsi: allow bound samplers and images to be used as l-values
>   glsl: allow ?: operator with images and samplers when bindless is enabled
>   glsl,glsl_to_tgsi: fix sampler/image constants
>   glsl: fix function inlining with opaque parameters
>
>  src/compiler/glsl/ast_to_hir.cpp   |  8 ++-
>  src/compiler/glsl/ir.cpp   | 32 +-
>  src/compiler/glsl/opt_function_inlining.cpp| 52 +---
>  src/gallium/auxiliary/tgsi/tgsi_info.c |  2 +
>  src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h |  4 +-
>  src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h   |  3 +
>  src/gallium/docs/source/tgsi.rst   | 25 
>  src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  2 +
>  src/gallium/drivers/nouveau/codegen/nv50_ir.h  |  2 +
>  .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 22 +++
>  .../drivers/nouveau/codegen/nv50_ir_inlines.h  |  4 +-
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 25 
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.h|  1 +
>  .../drivers/nouveau/codegen/nv50_ir_print.cpp  |  2 +
>  .../drivers/nouveau/codegen/nv50_ir_target.cpp |  7 ++-
>  src/gallium/include/pipe/p_shader_tokens.h |  2 +
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 69 
> --
>  src/mesa/state_tracker/st_glsl_to_tgsi_private.h   |  1 +
>  18 files changed, 239 insertions(+), 24 deletions(-)
>
> --
> 2.14.4
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/16] Move the Mesa Website to Sphinx

2018-06-08 Thread Rhys Perry
Might be good to do something like this: https://codepen.io/anon/pen/ERNdYJ
So that those with NoScript or something won't have gears constantly
rotating on their screen.

On Fri, Jun 8, 2018 at 2:25 PM, Erik Faye-Lund  wrote:
> On Fri, Jun 8, 2018 at 2:06 PM, Rob Clark  wrote:
>> On Fri, Jun 8, 2018 at 3:02 AM, Jordan Justen  wrote:
>>> On Thu, Jun 7, 2018 at 2:56 AM Eero Tamminen  
>>> wrote:
 On 07.06.2018 12:01, Erik Faye-Lund wrote:
 > Just as a fun toy, I decided to give an animated SVG "variation" of
 > this a go myself:
 >
 > https://codepen.io/kusma/pen/vrXppL
 >
 > The actual SVG can be found here:
 >
 > https://gitlab.freedesktop.org/snippets/492
 >
 > The gears were generated by this python script, based on the glxgears
 > source code:
 >
 > https://gitlab.freedesktop.org/snippets/491
 >
 > Now, dropping this onto the black background doesn't work that well,
 > as it gets a bit bland, so it's probably better to add back the colors
 > then.
 >
 > Also, I'm not really sure if animation is a good idea or not.

 Maybe it could be a link target for the static logo?

 (Kind of website "easter egg").

 > But I definitely think logos should be vector rather than raster ;)

 For Mesa, WebGL would be more fitting implementation than SVG though...
>>>
>>> https://github.com/gears3d/gears3d.github.io/blob/master/webgl10.js
>>>
>>> One comment I would have for any animation on the main pages (as
>>> opposed to a separate 'easter egg' page), it probably should be
>>> significantly slower moving than the traditional 70 degrees / second.
>>> The faster animation would be distracting on the main pages.
>>>
>>
>> so one idea, which I think isn't too over the top, is to have the
>> static mesa-gears logo in top corner, but clicking on it starts/stops
>> the animation (just toggle between static and animated svg, I guess?)
>
> Good idea. I updated the codepen to do a variation of that; it rotates
> as long as the mouse hovers it.
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nv50/ir: Improve performance of signed division by powers of two

2018-06-08 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 29 +++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 39177bd044..7a18a5fe73 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1095,10 +1095,35 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
&imm0, int s)
  i->op = OP_MOV;
  i->setSrc(1, NULL);
   } else
+  if (imm0.reg.data.s32 == -1) {
+ i->op = OP_NEG;
+ i->setSrc(1, NULL);
+  } else
   if (i->dType == TYPE_U32 && imm0.isPow2()) {
  i->op = OP_SHR;
  i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
   } else
+  if (i->dType == TYPE_S32 && 
util_is_power_of_two_or_zero(abs(imm0.reg.data.s32))) {
+ Value *a = i->getSrc(0);
+ int32_t b = imm0.reg.data.s32;
+
+ if (b < 0) {
+a = bld.getSSA();
+bld.mkOp1(OP_NEG, TYPE_S32, a, i->getSrc(0));
+b = -b;
+ }
+
+ Value *sign = bld.getSSA();
+ Value *tmp0 = bld.getSSA();
+ Value *tmp1 = bld.getSSA();
+ bld.mkOp2(OP_SHR, TYPE_U32, sign, a, bld.mkImm(31));
+ bld.mkOp2(OP_ADD, TYPE_U32, tmp0, a, bld.mkImm(b - 1));
+ bld.mkOp3(OP_SELP, TYPE_U32, tmp1, tmp0, a, sign);
+
+ i->op = OP_SHR;
+ i->setSrc(0, tmp1);
+ i->setSrc(1, bld.mkImm(util_logbase2(b)));
+  } else
   if (i->dType == TYPE_U32) {
  Instruction *mul;
  Value *tA, *tB;
@@ -1129,10 +1154,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
&imm0, int s)
 bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
 
  delete_Instruction(prog, i);
-  } else
-  if (imm0.reg.data.s32 == -1) {
- i->op = OP_NEG;
- i->setSrc(1, NULL);
   } else {
  LValue *tA, *tB;
  LValue *tD;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2] nv50/ir: improve performance of signed division by powers of two

2018-06-09 Thread Rhys Perry
Changes in v2:
- Stylistic changes
- Use OP_SLCT instead of OP_SELP which only worked by luck
- Fix issues in edge cases

Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 30 +++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 39177bd044..d636eb130a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1095,10 +1095,36 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
&imm0, int s)
  i->op = OP_MOV;
  i->setSrc(1, NULL);
   } else
+  if (imm0.reg.data.s32 == -1) {
+ i->op = OP_NEG;
+ i->setSrc(1, NULL);
+  } else
   if (i->dType == TYPE_U32 && imm0.isPow2()) {
  i->op = OP_SHR;
  i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
   } else
+  if (i->dType == TYPE_S32 && 
util_is_power_of_two_or_zero(llabs(imm0.reg.data.s32))) {
+ Value *a = i->getSrc(0);
+ int64_t absb = llabs(imm0.reg.data.s32);
+
+ Value *sign = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), a, 
bld.mkImm(31));
+ Value *adjusted = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), a,
+  bld.loadImm(bld.getSSA(), 
(uint32_t)(absb - 1)));
+
+ Value *selected = bld.getSSA();
+ bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, selected, TYPE_U32, adjusted, a, 
sign);
+
+ if (imm0.reg.data.s32 < 0) {
+i->op = OP_NEG;
+i->setSrc(0, bld.mkOp2v(
+   OP_SHR, TYPE_S32, bld.getSSA(), selected, 
bld.mkImm(util_logbase2(absb))));
+i->setSrc(1, NULL);
+ } else {
+i->op = OP_SHR;
+i->setSrc(0, selected);
+i->setSrc(1, bld.mkImm(util_logbase2(absb)));
+ }
+  } else
   if (i->dType == TYPE_U32) {
  Instruction *mul;
  Value *tA, *tB;
@@ -1129,10 +1155,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
&imm0, int s)
 bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
 
  delete_Instruction(prog, i);
-  } else
-  if (imm0.reg.data.s32 == -1) {
- i->op = OP_NEG;
- i->setSrc(1, NULL);
   } else {
  LValue *tA, *tB;
  LValue *tD;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nv50/ir: fix TargetNVC0::insnCanLoadOffset()

2018-06-11 Thread Rhys Perry
Previously, TargetNVC0::insnCanLoadOffset() returned whether the offset
could be set to a specific value. The IndirectPropagation pass expected
it to return whether the offset could be increased.

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 954aec0a2f..8938d19f6c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -415,6 +415,7 @@ bool
 TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const
 {
const ValueRef& ref = insn->src(s);
+   offset += insn->src(s).get()->reg.data.offset;
if (ref.getFile() == FILE_MEMORY_CONST &&
(insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS))
   return offset >= -0x8000 && offset < 0x8000;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/6] Fix Various Compilation Issues With Bindless

2018-06-11 Thread Rhys Perry
Ping to those who seem appropriate for this patch in case it was forgotten
or missed.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nv50/ir: handle SHLADD in IndirectPropagation

2018-06-11 Thread Rhys Perry
An alternative solution to the problem fixed in
0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end"). Should be
useful in the future and seems to make dolphin ubershaders a bit smaller.

total instructions in shared programs : 226722 -> 226464 (-0.11%)
total gprs used in shared programs: 19378 -> 19378 (0.00%)
total shared used in shared programs  : 0 -> 0 (0.00%)
total local used in shared programs   : 0 -> 0 (0.00%)

           local  shared     gpr    inst   bytes
helped         0       0       0      51      51
  hurt         0       0       0       0       0

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 39177bd044..4d0589214d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -283,6 +283,8 @@ class IndirectPropagation : public Pass
 {
 private:
virtual bool visit(BasicBlock *);
+
+   BuildUtil bld;
 };
 
 bool
@@ -294,6 +296,8 @@ IndirectPropagation::visit(BasicBlock *bb)
for (Instruction *i = bb->getEntry(); i; i = next) {
   next = i->next;
 
+  bld.setPosition(i, false);
+
   for (int s = 0; i->srcExists(s); ++s) {
  Instruction *insn;
  ImmediateValue imm;
@@ -325,6 +329,14 @@ IndirectPropagation::visit(BasicBlock *bb)
 i->setIndirect(s, 0, NULL);
 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ } else if (insn->op == OP_SHLADD) {
+if (!insn->src(2).getImmediate(imm) ||
+!targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+   continue;
+i->setIndirect(s, 0, bld.mkOp2v(
+   OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), 
insn->getSrc(1)));
+i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+i->src(s).get()->reg.data.offset += imm.reg.data.u32;
  }
   }
}
@@ -3797,11 +3809,11 @@ Program::optimizeSSA(int level)
RUN_PASS(2, AlgebraicOpt, run);
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
+   RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(1, Split64BitOpPreRA, run);
RUN_PASS(1, LoadPropagation, run);
RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
-   RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(2, LocalCSE, run);
RUN_PASS(0, DeadCodeElim, buryAll);
 
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 0/2] nv50/ir: SHLADD related improvements

2018-06-12 Thread Rhys Perry
This series implements an alternative solution to the problem fixed in
0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end"). Overall, it
slightly helps various shaders while slightly hurting a few others.

Effects of both patches:

total instructions in shared programs : 5265148 -> 5256901 (-0.16%)
total gprs used in shared programs: 624346 -> 624328 (-0.00%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 20952 (0.00%)

           local  shared     gpr    inst   bytes
helped         0       0      71    2016    2016
  hurt         0       0      52      19      19

Rhys Perry (2):
  nv50/ir: handle SHLADD in IndirectPropagation
  nv50/ir: move LateAlgebraicOpt back to right after ConstantFolding

 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 2/2] nv50/ir: move LateAlgebraicOpt back to right after ConstantFolding

2018-06-12 Thread Rhys Perry
Reverts 3072bbe ("nv50/ir: move LateAlgebraicOpt to the very end") since
SHLADD is now handled in IndirectPropagation.

total instructions in shared programs : 5264804 -> 5256901 (-0.15%)
total gprs used in shared programs: 624341 -> 624328 (-0.00%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 20952 (0.00%)

           local  shared     gpr    inst   bytes
helped         0       0      69    1993    1993
  hurt         0       0      52      32      32

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 83fb15ca34..4d0589214d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3809,11 +3809,11 @@ Program::optimizeSSA(int level)
RUN_PASS(2, AlgebraicOpt, run);
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
+   RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(1, Split64BitOpPreRA, run);
RUN_PASS(1, LoadPropagation, run);
RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
-   RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(2, LocalCSE, run);
RUN_PASS(0, DeadCodeElim, buryAll);
 
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 1/2] nv50/ir: handle SHLADD in IndirectPropagation

2018-06-12 Thread Rhys Perry
An alternative solution to the problem fixed in
0bd83d0 ("nv50/ir: move LateAlgebraicOpt to the very end").

total instructions in shared programs : 5265148 -> 5264804 (-0.01%)
total gprs used in shared programs: 624346 -> 624341 (-0.00%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 20952 (0.00%)

           local  shared     gpr    inst   bytes
helped         0       0       2      31      31
  hurt         0       0       0       0       0

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 12 
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 39177bd044..83fb15ca34 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -283,6 +283,8 @@ class IndirectPropagation : public Pass
 {
 private:
virtual bool visit(BasicBlock *);
+
+   BuildUtil bld;
 };
 
 bool
@@ -294,6 +296,8 @@ IndirectPropagation::visit(BasicBlock *bb)
for (Instruction *i = bb->getEntry(); i; i = next) {
   next = i->next;
 
+  bld.setPosition(i, false);
+
   for (int s = 0; i->srcExists(s); ++s) {
  Instruction *insn;
  ImmediateValue imm;
@@ -325,6 +329,14 @@ IndirectPropagation::visit(BasicBlock *bb)
 i->setIndirect(s, 0, NULL);
 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ } else if (insn->op == OP_SHLADD) {
+if (!insn->src(2).getImmediate(imm) ||
+!targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+   continue;
+i->setIndirect(s, 0, bld.mkOp2v(
+   OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), 
insn->getSrc(1)));
+i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+i->src(s).get()->reg.data.offset += imm.reg.data.u32;
  }
   }
}
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] nv50/ir: add preliminary support for OP_XMAD

2018-06-13 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|  3 ++-
 src/gallium/drivers/nouveau/codegen/nv50_ir.h  | 14 
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 12 +--
 .../drivers/nouveau/codegen/nv50_ir_print.cpp  | 20 +
 .../drivers/nouveau/codegen/nv50_ir_target.cpp |  7 +++---
 .../nouveau/codegen/nv50_ir_target_gm107.cpp   |  1 +
 .../nouveau/codegen/nv50_ir_target_nv50.cpp|  5 +++--
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp| 25 --
 8 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 49425b98b9..99bf8de370 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -53,7 +53,8 @@ Modifier Modifier::operator*(const Modifier m) const
   b &= ~NV50_IR_MOD_NEG;
 
a = (this->bits ^ b)  & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG);
-   c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT);
+   c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT |
+NV50_IR_MOD_H1 | NV50_IR_MOD_SEXT);
 
return Modifier(a | c);
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f4f3c70888..4deaf09989 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -58,6 +58,7 @@ enum operation
OP_FMA,
OP_SAD, // abs(src0 - src1) + src2
OP_SHLADD,
+   OP_XMAD, // extended multiply-add (GM107+), does a lot of things
OP_ABS,
OP_NEG,
OP_NOT,
@@ -251,6 +252,13 @@ enum operation
 #define NV50_IR_SUBOP_VOTE_ALL 0
 #define NV50_IR_SUBOP_VOTE_ANY 1
 #define NV50_IR_SUBOP_VOTE_UNI 2
+#define NV50_IR_SUBOP_XMAD_PSL (1 << 0)
+#define NV50_IR_SUBOP_XMAD_MRG (1 << 1)
+#define NV50_IR_SUBOP_XMAD_CLO (1 << 2)
+#define NV50_IR_SUBOP_XMAD_CHI (2 << 2)
+#define NV50_IR_SUBOP_XMAD_CSFU (3 << 2)
+#define NV50_IR_SUBOP_XMAD_CBCC (4 << 2)
+#define NV50_IR_SUBOP_XMAD_CMODE_MASK (0x7 << 2)
 
 #define NV50_IR_SUBOP_MINMAX_LOW  1
 #define NV50_IR_SUBOP_MINMAX_MED  2
@@ -527,6 +535,9 @@ struct Storage
 #define NV50_IR_MOD_SAT (1 << 2)
 #define NV50_IR_MOD_NOT (1 << 3)
 #define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS)
+// modifiers only for XMAD
+#define NV50_IR_MOD_H1   (1 << 4)
+#define NV50_IR_MOD_SEXT (1 << 5)
 
 #define NV50_IR_INTERP_MODE_MASK   0x3
 #define NV50_IR_INTERP_LINEAR  (0 << 0)
@@ -556,11 +567,14 @@ public:
inline Modifier operator&(const Modifier m) const { return bits & m.bits; }
inline Modifier operator|(const Modifier m) const { return bits | m.bits; }
inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; }
+   inline Modifier operator~() const { return ~bits; }
 
operation getOp() const;
 
inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; }
inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; }
+   inline int h1() const { return (bits & NV50_IR_MOD_H1) ? 1 : 0; }
+   inline int sext() const { return (bits & NV50_IR_MOD_SEXT) ? 1 : 0; }
 
inline operator bool() const { return bits ? true : false; }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 4d0589214d..a43b481a01 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -191,9 +191,16 @@ void
 LoadPropagation::checkSwapSrc01(Instruction *insn)
 {
const Target *targ = prog->getTarget();
-   if (!targ->getOpInfo(insn).commutative)
-  if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
+   if (!targ->getOpInfo(insn).commutative) {
+  if (insn->op != OP_SET && insn->op != OP_SLCT &&
+  insn->op != OP_SUB && insn->op != OP_XMAD)
  return;
+  // XMAD is only commutative if both the CBCC and MRG flags are not set.
+  if (insn->op == OP_XMAD && (insn->subOp & 0x1c) == 
NV50_IR_SUBOP_XMAD_CBCC)
+ return;
+  if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
+ return;
+   }
if (insn->src(1).getFile() != FILE_GPR)
   return;
// This is the special OP_SET used for alphatesting, we can't reverse its
@@ -488,6 +495,7 @@ Modifier::applyTo(ImmediateValue& imm) const
  imm.reg.data.s32 = -imm.reg.data.s32;
   if (bits & NV50_IR_MOD_NOT)
  imm.reg.data.s32 = ~imm.reg.data.s32;
+  // NOTE: applying the h1 and sext modifiers is confusing and not very 
useful
   break;
 
c

[Mesa-dev] [PATCH 4/4] nv50/ir: further optimize multiplication by immediates

2018-06-13 Thread Rhys Perry
Strongly mitigates the harm from the previous commit, which made many
integer multiplications much heavier on register usage and instruction
count.

total instructions in shared programs : 5294693 -> 5268293 (-0.50%)
total gprs used in shared programs: 624962 -> 624196 (-0.12%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 21048 -> 20952 (-0.46%)

           local  shared     gpr    inst   bytes
helped         1       0     368    1772    1772
  hurt         0       0      74      23      23

Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 123 ++---
 src/util/bitscan.h |  26 +
 2 files changed, 135 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 84cb5eb04b..aaad4db479 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -371,6 +371,10 @@ private:
void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
 
CmpInstruction *findOriginForTestWithZero(Value *);
+ 
+   Value *createMulMethod1(Value *a, unsigned b, Value *c);
+   Value *createMulMethod2(Value *a, unsigned b, Value *c);
+   Value *createMul(Value *a, unsigned b, Value *c);
 
unsigned int foldCount;
 
@@ -946,6 +950,97 @@ ConstantFolding::opnd3(Instruction *i, ImmediateValue 
&imm2)
   return;
}
 }
+ 
+Value *
+ConstantFolding::createMulMethod1(Value *a, unsigned b, Value *c)
+{
+   if (b == 1)
+  return a;
+
+   // Basically constant folded shift and add multiplication.
+   Value *res = c ? c : bld.loadImm(NULL, 0u);
+   bool resZero = !c;
+   unsigned ashift = 0;
+   while (b) {
+  if ((b & 1) && ashift) {
+ if (resZero)
+res = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, 
bld.mkImm(ashift));
+ else
+res = bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, 
bld.mkImm(ashift), res);
+ resZero = false;
+  } else if (b & 1) {
+ if (resZero)
+res = a;
+ else
+res = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), res, a);
+ resZero = false;
+  }
+  b >>= 1;
+  ashift++;
+   }
+   return res;
+}
+
+Value *
+ConstantFolding::createMulMethod2(Value *a, unsigned b, Value *c)
+{
+   uint64_t b2 = u_next_power_of_two(b);
+   unsigned b2shift = ffsll(b2) - 1;
+   if (b2 != b) { // a * b2 - a * (b2 - b)
+  // mul1 = a * (b2 - b)
+  Value *mul1 = createMulMethod1(a, b2 - b, NULL);
+
+  if (b2shift < 32 && c) { // a * b2 - mul1 + c (implemented as a * b2 + c 
- mul1)
+ return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(),
+   bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(),
+  a, bld.mkImm(b2shift), c),
+   mul1);
+  } else
+  if (b2shift < 32) { // a * b2 - mul1
+ Value *res = bld.getSSA();
+ Instruction *i = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, 
bld.mkImm(b2shift), mul1);
+ if (bld.getProgram()->getTarget()->isModSupported(i, 2, 
NV50_IR_MOD_NEG))
+i->src(2).mod *= Modifier(NV50_IR_MOD_NEG);
+ else
+i->setSrc(2, bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1));
+ return res;
+  } else
+  if (c) { // - mul1 + c (implemented as c - mul1)
+ return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(), c, mul1);
+  } else { // - mul1
+ return bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1);
+  }
+   } else {
+  if (c) // a * b2 + c
+ return bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, 
bld.mkImm(b2shift), c);
+  else // a * b2
+ return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, 
bld.loadImm(NULL, b2shift));
+   }
+}
+
+Value *
+ConstantFolding::createMul(Value *a, unsigned b, Value *c)
+{
+   unsigned cost[2];
+
+   // Estimate cost for first method (a << i) + (b << j) + ...
+   cost[0] = u_bit_count64(b >> 1);
+
+   // Estimate cost for second method (a << i) - ((a << j) + (a << k) + ...)
+   uint64_t rounded_b = u_next_power_of_two(b);
+   cost[1] = rounded_b == b ? 1 : (u_bit_count64((rounded_b - b) >> 1) + 2);
+   if (c) cost[1]++;
+
+   // The general method, multiplication by XMADs, costs three instructions.
+   // So nothing larger than that or it could be making things worse.
+   if (cost[0] > 3 && cost[1] > 3)
+  return NULL;
+
+   if (cost[0] < cost[1])
+  return createMulMethod1(a, b, c);
+   else
+  return createMulMethod2(a, b, c);
+}
 
 void
 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
@@ -1034,13 +1129,13 @

[Mesa-dev] [PATCH 3/4] nv50/ir: optimize imul/imad to xmads

2018-06-13 Thread Rhys Perry
This hits the shader-db numbers a good bit, though a few xmads are way
faster than an imul or imad and the cost is mitigated by the next commit,
which optimizes many multiplications by immediates into shorter and less
register heavy instructions than the xmads.

total instructions in shared programs : 5256901 -> 5294693 (0.72%)
total gprs used in shared programs: 624328 -> 624962 (0.10%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 21048 (0.46%)

         local  shared    gpr   inst  bytes
helped       0       0     39      0      0
  hurt       1       0    334   2277   2277

Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 53 ++
 1 file changed, 53 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index a43b481a01..84cb5eb04b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2246,13 +2246,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
 // 
=
 
 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
 class LateAlgebraicOpt : public Pass
 {
 private:
virtual bool visit(Instruction *);
 
void handleADD(Instruction *);
+   void handleMULMAD(Instruction *);
bool tryADDToSHLADD(Instruction *);
+
+   BuildUtil bld;
 };
 
 void
@@ -2312,6 +2317,49 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
 
return true;
 }
+ 
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
+   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
+  return;
+   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
+  return;
+   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
+  return;
+
+   assert(!i->src(0).mod);
+   assert(!i->src(1).mod);
+   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
+
+   bld.setPosition(i, true);
+
+   Value *a = i->getSrc(0);
+   Value *b = i->getSrc(1);
+   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
+
+   Value *tmp0 = bld.getSSA();
+   Value *tmp1 = bld.getSSA();
+
+   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
+   insn->setPredicate(i->cc, i->getPredicate());
+
+   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
+   insn->setPredicate(i->cc, i->getPredicate());
+   insn->src(1).mod = NV50_IR_MOD_H1;
+   insn->subOp = NV50_IR_SUBOP_XMAD_MRG;
+
+   insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0);
+   insn->setPredicate(i->cc, i->getPredicate());
+   insn->src(0).mod = NV50_IR_MOD_H1;
+   insn->src(1).mod = NV50_IR_MOD_H1;
+   insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
+
+   delete_Instruction(prog, i);
+}
 
 bool
 LateAlgebraicOpt::visit(Instruction *i)
@@ -2320,6 +2368,11 @@ LateAlgebraicOpt::visit(Instruction *i)
case OP_ADD:
   handleADD(i);
   break;
+   case OP_MUL:
+   case OP_MAD:
+   case OP_FMA:
+  handleMULMAD(i);
+  break;
default:
   break;
}
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/4] nv50/ir: Improve Performance of Integer Multiplication

2018-06-13 Thread Rhys Perry
This series improves the performance of integer multiplication by removing
much usage of the very slow IMAD and IMUL. It depends on the
SHLADD/IndirectPropagation patches.

The first and second patches add support for the XMAD instruction in codegen.

The third patch replaces most IMADs and IMULs with a sequence of XMADs.
This is far faster but increases the total instructions in the shader-db
by 0.72%.

This number is significantly lowered with the next patch. It replaces many
multiplications with instructions that should be as fast or faster than
the XMAD approach. They are also typically smaller and less register
heavy, so they decrease the total instruction count by 0.50%.

This series gives about a ~50% speedup in fragment-heavy scenarios with
Dolphin 5.0. All timings were made with interesting looking fifos from
Dolphin's bugtracker:
 Wind Waker: 18 FPS -> 26 FPS at 3x internal resolution
 Wind Waker:  8 FPS -> 11 FPS at 5x internal resolution
   Paper Mario?: 26 FPS -> 42 FPS at 5x internal resolution
SpongeBob Movie: 19 FPS -> 30 FPS at 5x internal resolution

Unigine Heaven and Unigine Valley seem to run the same at low quality with
no anti-aliasing and no tessellation. SuperTuxKart and 0 A.D. also show no
change.

It's possible these patches may break something, especially the fourth
one. Piglit shows no functional regressions, though the patches should
probably be tested for improvements or breakage with actual applications.

These patches can also be found on my github:
https://github.com/pendingchaos/mesa/tree/nv-xmad-v1

The final changes in shader-db are as follows:

total instructions in shared programs : 5256901 -> 5268293 (0.22%)
total gprs used in shared programs: 624328 -> 624196 (-0.02%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 20952 (0.00%)

         local  shared    gpr   inst  bytes
helped       0       0    255    680    680
  hurt       0       0    128   1484   1484

Rhys Perry (4):
  nv50/ir: add preliminary support for OP_XMAD
  gm107/ir: add support for OP_XMAD on GM107+
  nv50/ir: optimize imul/imad to xmads
  nv50/ir: further optimize multiplication by immediates

 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp|   3 +-
 src/gallium/drivers/nouveau/codegen/nv50_ir.h  |  14 ++
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp |  61 +++
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 188 +++--
 .../drivers/nouveau/codegen/nv50_ir_print.cpp  |  20 +++
 .../drivers/nouveau/codegen/nv50_ir_target.cpp |   7 +-
 .../nouveau/codegen/nv50_ir_target_gm107.cpp   |   5 +
 .../nouveau/codegen/nv50_ir_target_nv50.cpp|   5 +-
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp|  26 ++-
 src/util/bitscan.h |  26 +++
 10 files changed, 331 insertions(+), 24 deletions(-)

-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] gm107/ir: add support for OP_XMAD on GM107+

2018-06-13 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 61 ++
 .../nouveau/codegen/nv50_ir_target_gm107.cpp   |  6 ++-
 .../nouveau/codegen/nv50_ir_target_nvc0.cpp|  1 +
 3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 26826d6360..8ace77aa59 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -155,6 +155,7 @@ private:
void emitIMUL();
void emitIMAD();
void emitISCADD();
+   void emitXMAD();
void emitIMNMX();
void emitICMP();
void emitISET();
@@ -1881,6 +1882,63 @@ CodeEmitterGM107::emitISCADD()
emitGPR (0x08, insn->src(0));
emitGPR (0x00, insn->def(0));
 }
+ 
+void
+CodeEmitterGM107::emitXMAD()
+{
+   assert(insn->src(0).getFile() == FILE_GPR);
+
+   bool constbuf = false;
+   bool psl_mrg = true;
+   bool immediate = false;
+   if (insn->src(2).getFile() == FILE_MEMORY_CONST) {
+  assert(insn->src(1).getFile() == FILE_GPR);
+  constbuf = true;
+  psl_mrg = false;
+  emitInsn(0x5100);
+  emitGPR(0x27, insn->src(1));
+  emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(2));
+   } else if (insn->src(1).getFile() == FILE_MEMORY_CONST) {
+  assert(insn->src(2).getFile() == FILE_GPR);
+  constbuf = true;
+  emitInsn(0x4e00);
+  emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1));
+  emitGPR(0x27, insn->src(2));
+   } else if (insn->src(1).getFile() == FILE_IMMEDIATE) {
+  assert(insn->src(2).getFile() == FILE_GPR);
+  assert(!insn->src(1).mod.h1());
+  immediate = false;
+  emitInsn(0x3600);
+  emitIMMD(0x14, 19, insn->src(1));
+  emitGPR(0x27, insn->src(2));
+   } else {
+  assert(insn->src(1).getFile() == FILE_GPR);
+  assert(insn->src(2).getFile() == FILE_GPR);
+  emitInsn(0x5b00);
+  emitGPR(0x14, insn->src(1));
+  emitGPR(0x27, insn->src(2));
+   }
+
+   if (insn->src(0).mod.sext())
+  emitField(0x30, 2, insn->src(1).mod.sext() ? 3 : 1);
+   else
+  emitField(0x30, 2, insn->src(1).mod.sext() ? 2 : 0);
+   emitField(0x35, 1, insn->src(0).mod.h1());
+   if (!immediate)
+  emitField(constbuf ? 0x34 : 0x23, 1, insn->src(1).mod.h1());
+
+   if (psl_mrg) {
+  emitField(constbuf ? 0x37 : 0x24, 1, insn->subOp & 
NV50_IR_SUBOP_XMAD_PSL ? 1 : 0);
+  emitField(constbuf ? 0x38 : 0x25, 1, insn->subOp & 
NV50_IR_SUBOP_XMAD_MRG ? 1 : 0);
+   }
+   emitField(0x32, constbuf ? 2 : 3, (insn->subOp >> 2) & 0x7);
+
+   emitX(constbuf ? 0x36 : 0x26);
+   emitCC(0x2f);
+
+   emitGPR(0x0, insn->def(0));
+   emitGPR(0x8, insn->src(0));
+}
 
 void
 CodeEmitterGM107::emitIMNMX()
@@ -3253,6 +3311,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
case OP_SHLADD:
   emitISCADD();
   break;
+   case OP_XMAD:
+  emitXMAD();
+  break;
case OP_MIN:
case OP_MAX:
   if (isFloatType(insn->dType)) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 24a1cbb8da..f918fbfdd3 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -60,8 +60,11 @@ TargetGM107::isOpSupported(operation op, DataType ty) const
case OP_SQRT:
case OP_DIV:
case OP_MOD:
-   case OP_XMAD:
   return false;
+   case OP_XMAD:
+  if (isFloatType(ty))
+ return false;
+  break;
default:
   break;
}
@@ -230,6 +233,7 @@ TargetGM107::getLatency(const Instruction *insn) const
case OP_SUB:
case OP_VOTE:
case OP_XOR:
+   case OP_XMAD:
   if (insn->dType != TYPE_F64)
  return 6;
   break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 66efa0135f..3b96c71f44 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -161,6 +161,7 @@ static const struct opProperties _initPropsGM107[] = {
{ OP_SUSTP,   0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
{ OP_SUREDB,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
{ OP_SUREDP,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
+   { OP_XMAD,0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
 };
 
 void TargetNVC0::initProps(const struct opProperties *props, int size)
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/4] nv50/ir: Improve Performance of Integer Multiplication

2018-06-13 Thread Rhys Perry
Forgot to CC you.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted

2018-06-15 Thread Rhys Perry
Signed-off-by: Rhys Perry 
---
 src/compiler/glsl/ast_to_hir.cpp | 5 +
 src/compiler/glsl/glsl_parser_extras.cpp | 1 +
 src/compiler/glsl/glsl_parser_extras.h   | 7 +++
 src/mesa/main/extensions_table.h | 1 +
 src/mesa/main/mtypes.h   | 1 +
 5 files changed, 15 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index dd60a2a87f..09ce5a44e6 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3461,6 +3461,11 @@ apply_image_qualifier_to_variable(const struct 
ast_type_qualifier *qual,
   }
 
   var->data.image_format = qual->image_format;
+   } else if (state->has_image_load_formatted()) {
+  if (var->data.mode == ir_var_uniform &&
+  state->EXT_shader_image_load_formatted_warn) {
+ _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used");
+  }
} else {
   if (var->data.mode == ir_var_uniform) {
  if (state->es_shader) {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp 
b/src/compiler/glsl/glsl_parser_extras.cpp
index 04eba980e0..187bc0f18e 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -714,6 +714,7 @@ static const _mesa_glsl_extension 
_mesa_glsl_supported_extensions[] = {
EXT(EXT_separate_shader_objects),
EXT(EXT_shader_framebuffer_fetch),
EXT(EXT_shader_framebuffer_fetch_non_coherent),
+   EXT(EXT_shader_image_load_formatted)
EXT(EXT_shader_integer_mix),
EXT_AEP(EXT_shader_io_blocks),
EXT(EXT_shader_samples_identical),
diff --git a/src/compiler/glsl/glsl_parser_extras.h 
b/src/compiler/glsl/glsl_parser_extras.h
index 59a173418b..2818cdbb07 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -343,6 +343,11 @@ struct _mesa_glsl_parse_state {
   return ARB_bindless_texture_enable;
}
 
+   bool has_image_load_formatted() const
+   {
+  return EXT_shader_image_load_formatted_enable;
+   }
+
void process_version_directive(YYLTYPE *locp, int version,
   const char *ident);
 
@@ -790,6 +795,8 @@ struct _mesa_glsl_parse_state {
bool EXT_shader_framebuffer_fetch_warn;
bool EXT_shader_framebuffer_fetch_non_coherent_enable;
bool EXT_shader_framebuffer_fetch_non_coherent_warn;
+   bool EXT_shader_image_load_formatted_enable;
+   bool EXT_shader_image_load_formatted_warn;
bool EXT_shader_integer_mix_enable;
bool EXT_shader_integer_mix_warn;
bool EXT_shader_io_blocks_enable;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 79ef228b69..ac6acbb5ad 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -254,6 +254,7 @@ EXT(EXT_separate_shader_objects , dummy_true
 EXT(EXT_separate_specular_color , dummy_true   
  , GLL,  x ,  x ,  x , 1997)
 EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch 
  , GLL, GLC,  x , ES2, 2013)
 EXT(EXT_shader_framebuffer_fetch_non_coherent, 
EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC,  x, ES2, 2018)
+EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted  
  , GLL, GLC,  x ,  x , 2014)
 EXT(EXT_shader_integer_mix  , EXT_shader_integer_mix   
  , GLL, GLC,  x ,  30, 2013)
 EXT(EXT_shader_io_blocks, dummy_true   
  ,  x ,  x ,  x ,  31, 2014)
 EXT(EXT_shader_samples_identical, EXT_shader_samples_identical 
  , GLL, GLC,  x ,  31, 2015)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 482c42a4b2..4d0fdfe8e7 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -4179,6 +4179,7 @@ struct gl_extensions
GLboolean EXT_provoking_vertex;
GLboolean EXT_semaphore;
GLboolean EXT_semaphore_fd;
+   GLboolean EXT_shader_image_load_formatted;
GLboolean EXT_shader_integer_mix;
GLboolean EXT_shader_samples_identical;
GLboolean EXT_stencil_two_side;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


  1   2   3   4   >