Re: [Mesa-dev] [PATCH 1/3] Mesa: Add support for GL_OES_texture_*float* extensions.

2015-01-27 Thread kalyan kondapally
Thanks Tapani. I was waiting to see if their are any more comments
before addressing Emil's review.

Br,
Kalyan

On Mon, Jan 26, 2015 at 11:30 PM, Tapani Pälli  wrote:
>
>
> On 01/08/2015 06:30 AM, Kalyan Kondapally wrote:
>>
>> This patch series adds support for following GLES2 Texture Float
>> extensions:
>> 1)GL_OES_texture_float,
>> 2)GL_OES_texture_half_float,
>> 3)GL_OES_texture_float_linear,
>> 4)GL_OES_texture_half_float_linear.
>>
>> This patch adds basic infrastructure and needed boolean flags to advertise
>> support for these extensions, by default the support is disabled. Next
>> patch
>> in the series introduces support for HALF_FLOAT_OES token.
>>
>> Signed-off-by: Kevin Rogovin 
>> Signed-off-by: Kalyan Kondapally 
>> ---
>>   src/mesa/main/extensions.c |  4 +++
>>   src/mesa/main/mtypes.h |  6 +
>>   src/mesa/main/teximage.c   | 64
>> ++
>>   src/mesa/main/texobj.c | 58
>> +
>>   4 files changed, 132 insertions(+)
>>
>> diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
>> index 0df04c2..6833fcf 100644
>> --- a/src/mesa/main/extensions.c
>> +++ b/src/mesa/main/extensions.c
>> @@ -314,6 +314,10 @@ static const struct extension extension_table[] = {
>>  { "GL_OES_texture_3D",  o(EXT_texture3D),
>> ES2, 2005 },
>>  { "GL_OES_texture_cube_map",
>> o(ARB_texture_cube_map), ES1,   2007 },
>>  { "GL_OES_texture_env_crossbar",
>> o(ARB_texture_env_crossbar), ES1,   2005 },
>> +   { "GL_OES_texture_float",   o(OES_texture_float),
>> ES2, 2005 },
>> +   { "GL_OES_texture_float_linear",
>> o(OES_texture_float_linear),   ES2, 2005 },
>> +   { "GL_OES_texture_half_float",
>> o(OES_texture_half_float), ES2, 2005 },
>> +   { "GL_OES_texture_half_float_linear",
>> o(OES_texture_half_float_linear),  ES2, 2005 },
>>  { "GL_OES_texture_mirrored_repeat", o(dummy_true),
>> ES1,   2005 },
>>  { "GL_OES_texture_npot",
>> o(ARB_texture_non_power_of_two), ES1 | ES2, 2005 },
>>  { "GL_OES_vertex_array_object", o(dummy_true),
>> ES1 | ES2, 2010 },
>> diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
>> index b95dfb9..8553a6c 100644
>> --- a/src/mesa/main/mtypes.h
>> +++ b/src/mesa/main/mtypes.h
>> @@ -1220,6 +1220,8 @@ struct gl_texture_object
>>  GLboolean Purgeable;/**< Is the buffer purgeable under memory
>>   pressure? */
>>  GLboolean Immutable;/**< GL_ARB_texture_storage */
>> +   GLboolean _IsFloat; /**< GL_OES_float_texture */
>> +   GLboolean _IsHalfFloat; /**< GL_OES_half_float_texture */
>>
>>  GLuint MinLevel;/**< GL_ARB_texture_view */
>>  GLuint MinLayer;/**< GL_ARB_texture_view */
>> @@ -3858,6 +3860,10 @@ struct gl_extensions
>>  GLboolean OES_draw_texture;
>>  GLboolean OES_depth_texture_cube_map;
>>  GLboolean OES_EGL_image_external;
>> +   GLboolean OES_texture_float;
>> +   GLboolean OES_texture_half_float;
>> +   GLboolean OES_texture_float_linear;
>> +   GLboolean OES_texture_half_float_linear;
>>  GLboolean OES_compressed_ETC1_RGB8_texture;
>>  GLboolean extension_sentinel;
>>  /** The extension string */
>> diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
>> index 916a389..45ffc95 100644
>> --- a/src/mesa/main/teximage.c
>> +++ b/src/mesa/main/teximage.c
>> @@ -62,7 +62,58 @@
>>*/
>>   #define NEW_COPY_TEX_STATE (_NEW_BUFFERS | _NEW_PIXEL)
>>
>> +/**
>> + * Returns a corresponding internal floating point format for a given
>> base
>> + * format as specifed by OES_texture_float. In case of GL_FLOAT, the
>> internal
>> + * format needs to be a 32 bit component and in case of GL_HALF_FLOAT_OES
>> it
>> + * needs to be a 16 bit component.
>> + *
>> + * For example, given base format GL_RGBA, type GL_Float return
>> GL_RGBA32F_ARB.
>> + */
>> +static GLenum
>> +adjust_for_oes_float_texture(GLenum format, GLenum type)
>> +{
>> +   switch (type) {
>> +   case GL_FLOAT:
>> +  switch (format) {
>> +  case GL_RGBA:
>> + return GL_RGBA32F;
>> +  case GL_RGB:
>> + return GL_RGB32F;
>> +  case GL_ALPHA:
>> + return GL_ALPHA32F_ARB;
>> +  case GL_LUMINANCE:
>> + return GL_LUMINANCE32F_ARB;
>> +  case GL_LUMINANCE_ALPHA:
>> + return GL_LUMINANCE_ALPHA32F_ARB;
>> +  default:
>> + break;
>> +  }
>> +  break;
>>
>> +   case GL_HALF_FLOAT_OES:
>> +  switch (format) {
>> +  case GL_RGBA:
>> + return GL_RGBA16F;
>> +  case GL_RGB:
>> + return GL_RGB16F;
>> +  case GL_ALPHA:
>> + return GL_ALPHA16F_ARB;
>> +  case GL_LUMINANCE:
>> + return GL_LUMINANCE16F_ARB;
>> +  case GL_

[Mesa-dev] [PATCH] r600g: add polygon stipple support

2015-01-27 Thread Dave Airlie
From: Dave Airlie 

This use the gallium poly stipple helper to modify
the fragment shader and sets up the appropriate state
around it.

This renders the polys test from mesa demos properly.
Fixes:
https://bugs.freedesktop.org/show_bug.cgi?id=25280

TODO:
should this be in radeon common code? (radeonsi)

Signed-off-by: Dave Airlie 
---
 src/gallium/drivers/r600/evergreen_state.c   |  5 -
 src/gallium/drivers/r600/r600_pipe.c |  6 ++
 src/gallium/drivers/r600/r600_pipe.h |  8 
 src/gallium/drivers/r600/r600_shader.c   | 11 +++
 src/gallium/drivers/r600/r600_shader.h   |  2 ++
 src/gallium/drivers/r600/r600_state_common.c | 27 +++
 6 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index ea58aea..7a0ba7e 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -482,7 +482,7 @@ static void *evergreen_create_rs_state(struct pipe_context 
*ctx,
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1) |
S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard);
rs->multisample_enable = state->multisample;
-
+   rs->poly_stipple_enable = state->poly_stipple_enable;
/* offset */
rs->offset_units = state->offset_units;
rs->offset_scale = state->offset_scale * 12.0f;
@@ -864,6 +864,9 @@ static void evergreen_emit_clip_state(struct r600_context 
*rctx, struct r600_ato
 static void evergreen_set_polygon_stipple(struct pipe_context *ctx,
 const struct pipe_poly_stipple *state)
 {
+   struct r600_context *rctx = (struct r600_context *)ctx;
+   rctx->poly_stipple = *state;
+   rctx->pstipple_update = true;
 }
 
 static void evergreen_get_scissor_rect(struct r600_context *rctx,
diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index b6f7859..20cfc25 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -35,6 +35,7 @@
 #include "util/u_simple_shaders.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_math.h"
+#include "util/u_pstipple.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "radeon/radeon_video.h"
@@ -100,6 +101,10 @@ static void r600_destroy_context(struct pipe_context 
*context)
u_suballocator_destroy(rctx->allocator_fetch_shader);
}
 
+   if (rctx->pstipple.sampler)
+   rctx->b.b.delete_sampler_state(&rctx->b.b, 
rctx->pstipple.sampler);
+   pipe_resource_reference(&rctx->pstipple.texture, NULL);
+   pipe_sampler_view_reference(&rctx->pstipple.sampler_view, NULL);
r600_release_command_buffer(&rctx->start_cs_cmd);
 
FREE(rctx->start_compute_cs_cmd.buf);
@@ -200,6 +205,7 @@ static struct pipe_context *r600_create_context(struct 
pipe_screen *screen, void
util_blitter_set_texture_multisample(rctx->blitter, rscreen->has_msaa);
rctx->blitter->draw_rectangle = r600_draw_rectangle;
 
+   rctx->pstipple.sampler = util_pstipple_create_sampler(&rctx->b.b);
r600_begin_new_cs(rctx);
r600_query_init_backend_mask(&rctx->b); /* this emits commands and must 
be last */
 
diff --git a/src/gallium/drivers/r600/r600_pipe.h 
b/src/gallium/drivers/r600/r600_pipe.h
index e110efe..cb910e9 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -264,6 +264,7 @@ struct r600_rasterizer_state {
booloffset_enable;
boolscissor_enable;
boolmultisample_enable;
+   boolpoly_stipple_enable;
 };
 
 struct r600_poly_offset_state {
@@ -485,6 +486,13 @@ struct r600_context {
 
void*sb_context;
struct r600_isa *isa;
+   struct pipe_poly_stipple poly_stipple;
+   bool pstipple_update;
+   struct {
+   struct pipe_resource *texture;
+   struct pipe_sampler_state *sampler;
+   struct pipe_sampler_view *sampler_view;
+   } pstipple;
 };
 
 static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 16e820e..0bbfe52 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -36,6 +36,7 @@
 #include "tgsi/tgsi_dump.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_pstipple.h"
 #include 
 #include 
 
@@ -1824,7 +1825,13 @@ static int r600_shader_from_tgsi(struct r600_context 
*rctx,
 
r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
   rscreen->has_compressed_msaa_texturing);
+
+   if (key.polygon_stipple)
+   to

[Mesa-dev] [PATCH v2] st/clover: Pass target instead of target.begin() to std::string()

2015-01-27 Thread Michel Dänzer
From: Michel Dänzer 

Fixes reading beyond allocated memory:

==1936== Invalid read of size 1
==1936==at 0x4C2C1B4: strlen (vg_replace_strmem.c:412)
==1936==by 0x9E00C30: std::basic_string, 
std::allocator >::basic_string(char const*, std::allocator const&) 
(in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.20)
==1936==by 0x5B44FAE: clover::compile_program_llvm(clover::compat::string 
const&, clover::compat::vector > const&, pipe_shader_ir, clover::compat::string 
const&, clover::compat::string const&, clover::compat::string&) 
(invocation.cpp:698)
==1936==by 0x5B39A20: 
clover::program::build(clover::ref_vector const&, char const*, 
clover::compat::vector > const&) (program.cpp:63)
==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
==1936==by 0x400F41: main (hello_world.c:109)
==1936==  Address 0x56fee1f is 0 bytes after a block of size 15 alloc'd
==1936==at 0x4C28C20: malloc (vg_replace_malloc.c:296)
==1936==by 0x5B398F0: alloc (compat.hpp:59)
==1936==by 0x5B398F0: vector > (compat.hpp:98)
==1936==by 0x5B398F0: string > (compat.hpp:327)
==1936==by 0x5B398F0: 
clover::program::build(clover::ref_vector const&, char const*, 
clover::compat::vector > const&) (program.cpp:63)
==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
==1936==by 0x400F41: main (hello_world.c:109)

Signed-off-by: Michel Dänzer 
---

v2: Just use target instead of target.begin(), as suggested by Francisco
Jerez

 src/gallium/state_trackers/clover/llvm/invocation.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 7a0be53..edeed56 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -699,9 +699,9 @@ clover::compile_program_llvm(const compat::string &source,
  debug_options, 0);
 
std::vector kernels;
-   size_t processor_str_len = std::string(target.begin()).find_first_of("-");
-   std::string processor(target.begin(), 0, processor_str_len);
-   std::string triple(target.begin(), processor_str_len + 1,
+   size_t processor_str_len = std::string(target).find_first_of("-");
+   std::string processor(target, 0, processor_str_len);
+   std::string triple(target, processor_str_len + 1,
   target.size() - processor_str_len - 1);
clang::LangAS::Map address_spaces;
llvm::LLVMContext llvm_ctx;
-- 
2.1.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] st/clover: Use std::string for target IR string parameter

2015-01-27 Thread Michel Dänzer
On 22.01.2015 19:10, Francisco Jerez wrote:
> Michel Dänzer  writes:
> 
>> From: Michel Dänzer 
>>
>> That's what device::ir_target() returns. Fixes reading beyond allocated
>> memory:
>>
>> ==1936== Invalid read of size 1
>> ==1936==at 0x4C2C1B4: strlen (vg_replace_strmem.c:412)
>> ==1936==by 0x9E00C30: std::basic_string, 
>> std::allocator >::basic_string(char const*, std::allocator 
>> const&) (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.20)
>> ==1936==by 0x5B44FAE: 
>> clover::compile_program_llvm(clover::compat::string const&, 
>> clover::compat::vector> clover::compat::string> > const&, pipe_shader_ir, clover::compat::string 
>> const&, clover::compat::string const&, clover::compat::string&) 
>> (invocation.cpp:698)
>> ==1936==by 0x5B39A20: 
>> clover::program::build(clover::ref_vector const&, char 
>> const*, clover::compat::vector> clover::compat::string> > const&) (program.cpp:63)
>> ==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
>> ==1936==by 0x400F41: main (hello_world.c:109)
>> ==1936==  Address 0x56fee1f is 0 bytes after a block of size 15 alloc'd
>> ==1936==at 0x4C28C20: malloc (vg_replace_malloc.c:296)
>> ==1936==by 0x5B398F0: alloc (compat.hpp:59)
>> ==1936==by 0x5B398F0: vector > (compat.hpp:98)
>> ==1936==by 0x5B398F0: string > (compat.hpp:327)
>> ==1936==by 0x5B398F0: 
>> clover::program::build(clover::ref_vector const&, char 
>> const*, clover::compat::vector> clover::compat::string> > const&) (program.cpp:63)
>> ==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
>> ==1936==by 0x400F41: main (hello_world.c:109)
>>
>> Signed-off-by: Michel Dänzer 
> 
> Hi Michel, apparently the problem is this line:
> 
> | size_t processor_str_len = std::string(target.begin()).find_first_of("-");
> 
> That assumes that compat::string::begin() is null-terminated, which is
> not necessarily the case.
> 
> Unfortunately I don't think we make that argument an std::string if we
> still want to support building with LLVM < 3.5, since it may lead to ABI
> issues.  Can you change the line to use the conversion operator instead
> for the time being, like:
> 
> | size_t processor_str_len = std::string(target).find_first_of("-");

Thanks for the suggestion, implemented in v2.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer



signature.asc
Description: OpenPGP digital signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

2015-01-27 Thread Michel Dänzer
On 24.01.2015 11:56, Tom Stellard wrote:
> On Thu, Jan 22, 2015 at 11:27:32AM +0900, Michel Dänzer wrote:
>> On 21.01.2015 21:12, Marek Olšák wrote:
>>> We also had a case when the CPU accidentally corrupted shaders,
>>> because the shaders were mapped after textures and a CPU texture
>>> upload overflowed and overwrote shaders. I suppose we should have
>>> unmapped the shaders.
>>
>> Sounds like a good idea.
>>
>>
>> Tom, for now I suggest this solution, summarized from Marek's previous
>> descriptions:
>>
>> (At least) for shaders which have relocations, keep a copy of the
>> machine code in malloced memory. When the relocated values change,
>> update them in the malloced memory, allocate a new BO, map it, copy the
>> machine code from the malloced memory to the BO, replace any existing
>> shader BO with the new one and invalidate the shader state.
>>
> 
> Hi,
> 
> Attached is a WIP patch attempting to implement it this way.
> Unfortunately, I was unable to get it working, so I wanted to
> submit it for review in case someone can spot what I'm doing wrong.
> 
> You can find the broken code wrapped in #if 0 in the
> si_update_scratch_buffer() function in si_state_shaders.c
> 
> Based on the dmesg output and other tests I've done, it appears
> that the GPU is still executing the shader code from the old bo
> which does not contain the relocations.
> 
> The code in the #else branch works fine, but it updates the existing
> bo in place rather than creating a new one.
> 
> Any idea what I've done wrong?

Some more comments in addition to what I mentioned before:


> @@ -221,6 +222,11 @@ struct si_context {
>   int last_prim;
>   int last_multi_vgt_param;
>   int last_rast_prim;
> +
> + /* Scratch buffer */
> + boolean emit_scratch_reloc ;
> + unsignedscratch_waves;
> + unsignedspi_tmpring_size;
>  };
>  
>  /* si_blit.c */

No space before the semicolon.

Also, there's a few extraneous blank lines being added, e.g. at the
beginning of blocks or between statements where there already is a blank
line.


> + /* The LLVM shader backend should be reporting aligned scratch_sizes. */
> + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
> + "scratch size should already be aligned correctly.");
> +
> + sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
> + S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);

I think these are only necessary in the (scratch_needed_size >
current_scratch_buffer_size) case?

The (scratch_needed_size > current_scratch_buffer_size) case also needs
to set sctx->emit_scratch_reloc = true.


Other than that, the change looks good to me.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] r600g: add polygon stipple support

2015-01-27 Thread Dave Airlie
On 27 January 2015 at 18:04, Dave Airlie  wrote:
> From: Dave Airlie 
>
> This use the gallium poly stipple helper to modify
> the fragment shader and sets up the appropriate state
> around it.
>
> This renders the polys test from mesa demos properly.
> Fixes:
> https://bugs.freedesktop.org/show_bug.cgi?id=25280
>
> TODO:
> should this be in radeon common code? (radeonsi)

My attempt to use set_sampler_views runs afowl of r600 not being able
to handle start != 0 so I'll probably have to attempt to fix that.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] st/clover: Pass target instead of target.begin() to std::string()

2015-01-27 Thread Francisco Jerez
Michel Dänzer  writes:

> From: Michel Dänzer 
>
> Fixes reading beyond allocated memory:
>
> ==1936== Invalid read of size 1
> ==1936==at 0x4C2C1B4: strlen (vg_replace_strmem.c:412)
> ==1936==by 0x9E00C30: std::basic_string, 
> std::allocator >::basic_string(char const*, std::allocator 
> const&) (in /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.20)
> ==1936==by 0x5B44FAE: clover::compile_program_llvm(clover::compat::string 
> const&, clover::compat::vector clover::compat::string> > const&, pipe_shader_ir, clover::compat::string 
> const&, clover::compat::string const&, clover::compat::string&) 
> (invocation.cpp:698)
> ==1936==by 0x5B39A20: 
> clover::program::build(clover::ref_vector const&, char 
> const*, clover::compat::vector clover::compat::string> > const&) (program.cpp:63)
> ==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
> ==1936==by 0x400F41: main (hello_world.c:109)
> ==1936==  Address 0x56fee1f is 0 bytes after a block of size 15 alloc'd
> ==1936==at 0x4C28C20: malloc (vg_replace_malloc.c:296)
> ==1936==by 0x5B398F0: alloc (compat.hpp:59)
> ==1936==by 0x5B398F0: vector > (compat.hpp:98)
> ==1936==by 0x5B398F0: string > (compat.hpp:327)
> ==1936==by 0x5B398F0: 
> clover::program::build(clover::ref_vector const&, char 
> const*, clover::compat::vector clover::compat::string> > const&) (program.cpp:63)
> ==1936==by 0x5B20152: clBuildProgram (program.cpp:182)
> ==1936==by 0x400F41: main (hello_world.c:109)
>
> Signed-off-by: Michel Dänzer 

Looks good,
Reviewed-by: Francisco Jerez 

> ---
>
> v2: Just use target instead of target.begin(), as suggested by Francisco
> Jerez
>
>  src/gallium/state_trackers/clover/llvm/invocation.cpp | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
> b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> index 7a0be53..edeed56 100644
> --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
> +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> @@ -699,9 +699,9 @@ clover::compile_program_llvm(const compat::string &source,
>   debug_options, 0);
>  
> std::vector kernels;
> -   size_t processor_str_len = std::string(target.begin()).find_first_of("-");
> -   std::string processor(target.begin(), 0, processor_str_len);
> -   std::string triple(target.begin(), processor_str_len + 1,
> +   size_t processor_str_len = std::string(target).find_first_of("-");
> +   std::string processor(target, 0, processor_str_len);
> +   std::string triple(target, processor_str_len + 1,
>target.size() - processor_str_len - 1);
> clang::LangAS::Map address_spaces;
> llvm::LLVMContext llvm_ctx;
> -- 
> 2.1.4


pgpltW0wIwxeF.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

2015-01-27 Thread Marek Olšák
On Sat, Jan 24, 2015 at 4:32 AM, Michel Dänzer  wrote:
> On 24.01.2015 11:56, Tom Stellard wrote:
>> On Thu, Jan 22, 2015 at 11:27:32AM +0900, Michel Dänzer wrote:
>>>
>>> Tom, for now I suggest this solution, summarized from Marek's previous
>>> descriptions:
>>>
>>> (At least) for shaders which have relocations, keep a copy of the
>>> machine code in malloced memory. When the relocated values change,
>>> update them in the malloced memory, allocate a new BO, map it, copy the
>>> machine code from the malloced memory to the BO, replace any existing
>>> shader BO with the new one and invalidate the shader state.
>>>
>>
>> Hi,
>>
>> Attached is a WIP patch attempting to implement it this way.
>> Unfortunately, I was unable to get it working, so I wanted to
>> submit it for review in case someone can spot what I'm doing wrong.
>>
>> You can find the broken code wrapped in #if 0 in the
>> si_update_scratch_buffer() function in si_state_shaders.c
>>
>> Based on the dmesg output and other tests I've done, it appears
>> that the GPU is still executing the shader code from the old bo
>> which does not contain the relocations.
>>
>> The code in the #else branch works fine, but it updates the existing
>> bo in place rather than creating a new one.
>>
>> Any idea what I've done wrong?
>
> [...]
>
>> + /* Update the shaders, so they are using the latest scratch.  
>> The
>> +  * scratch buffer may have been changed since these shaders 
>> were
>> +  * last used, so we still need to try to update them, even if
>> +  * they require scratch buffers smaller than the current size.
>> +  */
>> + if (si_update_scratch_buffer(sctx, sctx->ps_shader))
>> + sctx->emitted.named.ps = NULL;
>> + if (si_update_scratch_buffer(sctx, sctx->gs_shader))
>> + sctx->emitted.named.gs = NULL;
>> + if (si_update_scratch_buffer(sctx, sctx->vs_shader))
>> + sctx->emitted.named.vs = NULL;
>
> Does this work instead?
>
> if (si_update_scratch_buffer(sctx, sctx->ps_shader))
> si_pm4_bind_state(sctx, ps, 
> sctx->ps_shader->current->pm4);
> if (si_update_scratch_buffer(sctx, sctx->gs_shader))
> si_pm4_bind_state(sctx, gs, 
> sctx->gs_shader->current->pm4);
> if (si_update_scratch_buffer(sctx, sctx->vs_shader))
> si_pm4_bind_state(sctx, vs, 
> sctx->vs_shader->current->pm4);

Yes, this should work. Reallocating a pm4 state (by calling
si_shader_init_pm4_state) isn't enough to use it. It must be bound
too, because it's a brand new state.

The old pm4 state is leaked though. The attached diff should fix it.

Marek
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c 
b/src/gallium/drivers/radeonsi/si_pm4.c
index 954eb6e..0acb257 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -103,6 +103,13 @@ void si_pm4_add_bo(struct si_pm4_state *state,
state->bo_priority[idx] = priority;
 }
 
+void si_pm4_free_state_simple(struct si_pm4_state *state)
+{
+   for (int i = 0; i < state->nbo; ++i)
+   r600_resource_reference(&state->bo[i], NULL);
+   FREE(state);
+}
+
 void si_pm4_free_state(struct si_context *sctx,
   struct si_pm4_state *state,
   unsigned idx)
@@ -114,10 +121,7 @@ void si_pm4_free_state(struct si_context *sctx,
sctx->emitted.array[idx] = NULL;
}
 
-   for (int i = 0; i < state->nbo; ++i) {
-   r600_resource_reference(&state->bo[i], NULL);
-   }
-   FREE(state);
+   si_pm4_free_state_simple(state);
 }
 
 unsigned si_pm4_dirty_dw(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h 
b/src/gallium/drivers/radeonsi/si_pm4.h
index c4d6098..d215882 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -71,6 +71,7 @@ void si_pm4_add_bo(struct si_pm4_state *state,
   enum radeon_bo_usage usage,
   enum radeon_bo_priority priority);
 
+void si_pm4_free_state_simple(struct si_pm4_state *state);
 void si_pm4_free_state(struct si_context *sctx,
   struct si_pm4_state *state,
   unsigned idx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 810aba8..53ffb3c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -326,6 +326,9 @@ static void si_shader_ps(struct si_screen *sscreen, struct 
si_shader *shader)
 static void si_shader_init_pm4_state(struct si_screen *sscreen,
 struct si_shader *shader)
 {
+   if (shader->pm4)
+   si_pm4_free_state_simple(shader->pm4);
+
switch (shader->selector->type) {
  

Re: [Mesa-dev] [PATCH 1/2] r600g: move selecting the pixel shader earlier.

2015-01-27 Thread Glenn Kennard

On Tue, 27 Jan 2015 04:46:32 +0100, Dave Airlie  wrote:


From: Dave Airlie 

In order to detect that a pixel shader has a prim id
input when we have no geometry shader we need to reorder
the shader selection so the pixel shader is selected
first, then the vertex shader key can take into account
the primitive id input requirement and lack of geom shader.

Signed-off-by: Dave Airlie 
---
 src/gallium/drivers/r600/r600_state_common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_state_common.c  
b/src/gallium/drivers/r600/r600_state_common.c

index 09d8952..1030620 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1170,6 +1170,10 @@ static bool r600_update_derived_state(struct  
r600_context *rctx)

}
}
+   r600_shader_select(ctx, rctx->ps_shader, &ps_dirty);
+   if (unlikely(!rctx->ps_shader->current))
+   return false;
+
update_gs_block_state(rctx, rctx->gs_shader != NULL);
if (rctx->gs_shader) {
@@ -1232,9 +1236,6 @@ static bool r600_update_derived_state(struct  
r600_context *rctx)

}
}
-   r600_shader_select(ctx, rctx->ps_shader, &ps_dirty);
-   if (unlikely(!rctx->ps_shader->current))
-   return false;
	if (unlikely(ps_dirty || rctx->pixel_shader.shader !=  
rctx->ps_shader->current ||
 		rctx->rasterizer->sprite_coord_enable !=  
rctx->ps_shader->current->sprite_coord_enable ||



Reviewed-by: Glenn Kennard 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] r600g: add support for primitive id without geom shader

2015-01-27 Thread Glenn Kennard

On Tue, 27 Jan 2015 04:46:33 +0100, Dave Airlie  wrote:


From: Dave Airlie 

GLSL 1.50 specifies a fragment shader may have a primitive id
input without a geometry shader present.

On r600 hw there is a special GS scenario for this, you have
to enable GS_SCENARIO_A and pass the primitive id through
the vertex shader which operates in GS_A mode.

This is a first pass attempt at this, and passes the piglit
tests that test for this.

TODO: r600/700 support.


r600/r700 should be just copy/paste of the evergreen_emit_shader_stages()  
changes into r600_emit_shader_stages().



make nicer.

Signed-off-by: Dave Airlie 
---
 src/gallium/drivers/r600/evergreen_state.c   |  6 +
 src/gallium/drivers/r600/r600_hw_context.c   |  2 +-
 src/gallium/drivers/r600/r600_shader.c   | 33  


 src/gallium/drivers/r600/r600_shader.h   |  4 
 src/gallium/drivers/r600/r600_state_common.c |  5 +
 5 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c  
b/src/gallium/drivers/r600/evergreen_state.c

index 36b86aa..bc1fa48 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2111,6 +2111,12 @@ static void evergreen_emit_shader_stages(struct  
r600_context *rctx, struct r600_

uint32_t v = 0, v2 = 0, primid = 0;
+   if (rctx->vs_shader->current->shader.vs_as_gs_a) {
+   fprintf(stderr, "got fs prim id with no geom\n");


Stray debug print


+   v2 = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
+   primid = 1;
+   }
+
if (state->geom_enable) {
uint32_t cut_val;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c  
b/src/gallium/drivers/r600/r600_hw_context.c

index ccc5a8b..cd57eed 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -315,9 +315,9 @@ void r600_begin_new_cs(struct r600_context *ctx)
ctx->stencil_ref.atom.dirty = true;
ctx->vertex_fetch_shader.atom.dirty = true;
ctx->export_shader.atom.dirty = true;
+   ctx->shader_stages.atom.dirty = true;
if (ctx->gs_shader) {
ctx->geometry_shader.atom.dirty = true;
-   ctx->shader_stages.atom.dirty = true;
ctx->gs_rings.atom.dirty = true;
}
ctx->vertex_shader.atom.dirty = true;
diff --git a/src/gallium/drivers/r600/r600_shader.c  
b/src/gallium/drivers/r600/r600_shader.c

index 471df91..8841b08 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -596,6 +596,20 @@ static int select_twoside_color(struct  
r600_shader_ctx *ctx, int front, int back

return 0;
 }
+static int vs_add_primid_output(struct r600_shader_ctx *ctx)
+{
+   int i;
+   i = ctx->shader->noutput++;
+   ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
+   ctx->shader->output[i].sid = 0;
+   ctx->shader->output[i].gpr = 0;
+   ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
+   ctx->shader->output[i].write_mask = 0x4;
+   ctx->shader->output[i].spi_sid = ctx->shader->ps_prim_id_input;
+
+   return 0;
+}
+
 static int tgsi_declaration(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_declaration *d =  
&ctx->parse.FullToken.FullDeclaration;
@@ -626,6 +640,11 @@ static int tgsi_declaration(struct r600_shader_ctx  
*ctx)

case TGSI_SEMANTIC_POSITION:
ctx->fragcoord_input = i;
break;
+   case TGSI_SEMANTIC_PRIMID:
+   /* set this for now */
+   ctx->shader->gs_prim_id_input = true;
+   ctx->shader->ps_prim_id_input = i;
+   break;
}
if (ctx->bc->chip_class >= EVERGREEN) {
if ((r = evergreen_interp_input(ctx, i)))
@@ -1800,6 +1819,9 @@ static int r600_shader_from_tgsi(struct  
r600_context *rctx,

ctx.shader = shader;
ctx.native_integers = true;
+   shader->vs_as_gs_a = key.vs_as_gs_a;
+   if (key.vs_as_gs_a)
+   shader->ps_prim_id_input = key.vs_prim_id_out;
shader->vs_as_es = key.vs_as_es;
r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
@@ -1938,6 +1960,10 @@ static int r600_shader_from_tgsi(struct  
r600_context *rctx,

ctx.nliterals = 0;
ctx.literals = NULL;
shader->fs_write_all = FALSE;
+
+   if (shader->vs_as_gs_a)
+   vs_add_primid_output(&ctx);
+
while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
tgsi_parse_token(&ctx.parse);
switch (ctx.parse.FullToken.Token.Type) {
@@ -2335,7 +2361,14 @@ static int r600_shader_from_tgsi(struct  
r600_context *rctx,

o

Re: [Mesa-dev] [PATCH 0/2] Use target attribute for architecture optimizations

2015-01-27 Thread Marc Dietrich
Am Freitag, 16. Januar 2015, 13:42:53 schrieb Marc Dietrich:
> The GCC specific target function attribute or pragma can be used to enable
> architecture depended optimisation options (e.g. SSE) directly in source
> files instead of specifing them on the compiler command line. This is
> useful when linking files with different compiler options which happens
> when using LTO.

no comments?

btw. I tested this series with gcc 4.8/4.9 and clang (3.6). To use lto, you 
have to disable tls, e.g. --disable-gnu-tls. There is some strange 
interference with tls/asm code/lto I'm not able to resolve to make tls 
working.

To test LTO, you have pass special flags to configure:

LTO_GCC="-flto=4 -fuse-ld=gold -fuse-linker-plugin"

CFLAGS="$LTO_GCC"
CXXFLAGS="$LTO_GCC"
LDFLAGS="$LTO_GCC"
CC="gcc-4.9"
CXX="g++-4.9"
LD="ld.gold"
AR="gcc-ar-4.9"
NM="gcc-nm-4.9"
RANLIB="gcc-ranlib-4.9"

Marc


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 88662] unaligned access to gl_dlist_node

2015-01-27 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=88662

--- Comment #2 from Jonathan Gray  ---
Yes, I can confirm that the patch fixes the crash on sparc64.
Tested with 10.4.3 so I had to change

fprintf(f, "NOP\n"); -> printf("NOP\n");

to make it build but it otherwise seems fine, thanks.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 05/11] mesa: Validate internal format and format type first to provide accurate error code

2015-01-27 Thread Eduardo Lima Mitev
On 01/21/2015 03:26 AM, Anuj Phogat wrote:
> On Mon, Jan 19, 2015 at 3:32 AM, Eduardo Lima Mitev  wrote:
>> The specification states that glTexImage2D and glTexImage3D should return
>> GL_INVALID_VALUE if the internal format is invalid, and GL_INVALID_ENUM is
>> the format type is invalid. However, current error check only considers the
>> combination of format, type and internal format; which returns a
>> GL_INVALID_OPERATION error when invalid.
>>
> I did a quick search in es 3.0.4 spec but couldn't find the reference. Could 
> you
> point me to the reference in the spec and may be add it as a comment in the
> code?
> 

Hi,

This behavior is described in the man pages, not any specification, so
it is a mistake. I thoroughly reviewed GLES 3.0.0, 3.0.4, 3.1 and 2.0
(just in case) and there is no explicit mention to this behavior.

The closest I could find was this text in section 8.5. TEXTURE IMAGE
SPECIFICATION (page 203) of the OpenGL 4.5 spec:

"An INVALID_VALUE error is generated if internalformat is not one of
the valid values described above."

(And "above" there is a large definition of all the valid internal
formats, combinations, etc).

So I wonder if there is an implicitly defined behavior that would
justify this wording in the man page for TexImage3D/2D:

* GL_INVALID_ENUM is generated if type is not a type constant.
* GL_INVALID_VALUE is generated if internalFormat is not one of the
accepted resolution and format symbolic constants.

I know that the man pages are not a reliable source to define API
behavior, but since apparently many dEQP tests are written against the
man pages, it would be useful to understand better the link (if exists)
between the man pages and the corresponding wording in a specification.

I will withdraw this patch in the mean time.

Thank you,
Eduardo

>> Fixes 2 dEQP tests:
>> * dEQP-GLES3.functional.negative_api.texture.teximage2d
>> * dEQP-GLES3.functional.negative_api.texture.teximage3d
>> ---


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/5] radeonsi: Avoid leaking memory when rebuilding shader states

2015-01-27 Thread Tom Stellard
From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_pm4.c   | 12 
 src/gallium/drivers/radeonsi/si_pm4.h   |  1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c |  4 
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pm4.c 
b/src/gallium/drivers/radeonsi/si_pm4.c
index 5edf152..e9b9a5f 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -103,6 +103,13 @@ void si_pm4_add_bo(struct si_pm4_state *state,
state->bo_priority[idx] = priority;
 }
 
+void si_pm4_free_state_simple(struct si_pm4_state *state)
+{
+   for (int i = 0; i < state->nbo; ++i)
+   r600_resource_reference(&state->bo[i], NULL);
+   FREE(state);
+}
+
 void si_pm4_free_state(struct si_context *sctx,
   struct si_pm4_state *state,
   unsigned idx)
@@ -114,10 +121,7 @@ void si_pm4_free_state(struct si_context *sctx,
sctx->emitted.array[idx] = NULL;
}
 
-   for (int i = 0; i < state->nbo; ++i) {
-   r600_resource_reference(&state->bo[i], NULL);
-   }
-   FREE(state);
+   si_pm4_free_state_simple(state);
 }
 
 unsigned si_pm4_dirty_dw(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h 
b/src/gallium/drivers/radeonsi/si_pm4.h
index 8680a9e..bfb5562 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -71,6 +71,7 @@ void si_pm4_add_bo(struct si_pm4_state *state,
   enum radeon_bo_usage usage,
   enum radeon_bo_priority priority);
 
+void si_pm4_free_state_simple(struct si_pm4_state *state);
 void si_pm4_free_state(struct si_context *sctx,
   struct si_pm4_state *state,
   unsigned idx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 887680f..3249bcc 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -316,6 +316,10 @@ static void si_shader_ps(struct si_shader *shader)
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
 {
+
+   if (shader->pm4)
+   si_pm4_free_state_simple(shader->pm4);
+
switch (shader->selector->type) {
case PIPE_SHADER_VERTEX:
if (shader->key.vs.as_es)
-- 
2.0.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/5] radeonsi: Add radeon_shader_binary member to struct si_shader

2015-01-27 Thread Tom Stellard
---
 src/gallium/drivers/radeonsi/si_compute.c | 11 +--
 src/gallium/drivers/radeonsi/si_shader.h  |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index ba63afd..840d21f 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -54,7 +54,6 @@ struct si_compute {
unsigned local_size;
unsigned private_size;
unsigned input_size;
-   struct radeon_shader_binary binary;
struct si_shader shader;
unsigned num_user_sgprs;
 
@@ -102,8 +101,8 @@ static void *si_create_compute_state(
}
 #else
 
-   radeon_elf_read(code, header->num_bytes, &program->binary, true);
-   si_shader_binary_read(sctx->screen, &program->shader, &program->binary);
+   radeon_elf_read(code, header->num_bytes, &program->shader.binary, true);
+   si_shader_binary_read(sctx->screen, &program->shader, 
&program->shader.binary);
 
 #endif
program->input_buffer = si_resource_create_custom(sctx->b.b.screen,
@@ -256,7 +255,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
/* Read the config information */
-   si_shader_binary_read_config(&program->binary, shader, pc);
+   si_shader_binary_read_config(&program->shader.binary, shader, pc);
 #endif
 
/* Upload the kernel arguments */
@@ -296,7 +295,7 @@ static void si_launch_grid(
 
/* Patch the shader with the scratch buffer address. */
apply_scratch_relocs(sctx->screen,
-   &program->binary, shader, scratch_buffer_va);
+   &program->shader.binary, shader, scratch_buffer_va);
 
}
 
@@ -481,7 +480,7 @@ static void si_delete_compute_state(struct pipe_context 
*ctx, void* state){
pipe_resource_reference(
(struct pipe_resource **)&program->input_buffer, NULL);
 
-   radeon_shader_binary_free_members(&program->binary, true);
+   radeon_shader_binary_free_members(&program->shader.binary, true);
FREE(program);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 08e344a..6def5c7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -142,6 +142,7 @@ struct si_shader {
struct si_pm4_state *pm4;
struct r600_resource*bo;
struct r600_resource*scratch_bo;
+   struct radeon_shader_binary binary;
unsignednum_sgprs;
unsignednum_vgprs;
unsignedlds_size;
-- 
2.0.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/5] radeonsi/compute: Allocate the scratch buffer during state creation

2015-01-27 Thread Tom Stellard
This moves scratch buffer allocation from si_launch_grid() to
si_create_compute_state().  This helps to reduce the overhead of
launching a kernel and also fixes a bug in the code that would cause
the scratch buffer to be too small if a kernel with smaller scratch size
was launched before a kernel with a larger scratch size.
---
 src/gallium/drivers/radeonsi/si_compute.c | 82 ++-
 src/gallium/drivers/radeonsi/si_shader.c  |  4 +-
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 840d21f..d0966af 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -67,6 +67,49 @@ struct si_compute {
 #endif
 };
 
+static void apply_scratch_relocs(const struct si_screen *sscreen,
+   struct si_shader *shader, uint64_t scratch_va);
+static void init_scratch_buffer(struct si_context *sctx, struct si_compute 
*program)
+{
+   unsigned scratch_bytes = 0;
+   uint64_t scratch_buffer_va;
+   unsigned i;
+
+   /* Compute the scratch buffer size using the maximum number of waves.
+* This way we don't need to recompute it for each kernel launch. */
+   unsigned scratch_waves = 32 * sctx->screen->b.info.max_compute_units;
+   for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
+   unsigned offset =
+   program->shader.binary.global_symbol_offsets[i];
+   unsigned scratch_bytes_needed;
+
+   si_shader_binary_read_config(&program->shader.binary,
+   &program->shader, offset);
+   scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
+   scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
+   }
+
+   if (scratch_bytes == 0)
+   return;
+
+   program->shader.scratch_bo = (struct r600_resource*)
+   si_resource_create_custom(sctx->b.b.screen,
+   PIPE_USAGE_DEFAULT,
+   scratch_bytes * scratch_waves);
+
+   scratch_buffer_va = program->shader.scratch_bo->gpu_address;
+
+   /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
+* to the maximum bytes needed, so it can compute the stride
+* correctly.
+*/
+   program->shader.scratch_bytes_per_wave = scratch_bytes;
+
+   /* Patch the shader with the scratch buffer address. */
+   apply_scratch_relocs(sctx->screen, &program->shader, scratch_buffer_va);
+
+}
+
 static void *si_create_compute_state(
struct pipe_context *ctx,
const struct pipe_compute_state *cso)
@@ -102,6 +145,12 @@ static void *si_create_compute_state(
 #else
 
radeon_elf_read(code, header->num_bytes, &program->shader.binary, true);
+
+   /* init_scratch_buffer patches the shader code with the scratch address,
+* so we need to call it beofre si_shader_binary_read() which uploads
+* the shader code to the GPU.
+*/
+   init_scratch_buffer(sctx, program);
si_shader_binary_read(sctx->screen, &program->shader, 
&program->shader.binary);
 
 #endif
@@ -183,32 +232,27 @@ static unsigned compute_num_waves_for_scratch(
 }
 
 static void apply_scratch_relocs(const struct si_screen *sscreen,
-   const struct radeon_shader_binary *binary,
struct si_shader *shader, uint64_t scratch_va) {
unsigned i;
-   char *ptr;
uint32_t scratch_rsrc_dword0 = scratch_va & 0x;
uint32_t scratch_rsrc_dword1 =
S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
 
-   if (!binary->reloc_count) {
+   if (!shader->binary.reloc_count) {
return;
}
 
-   ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
-   PIPE_TRANSFER_READ_WRITE);
-   for (i = 0 ; i < binary->reloc_count; i++) {
-   const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+   for (i = 0 ; i < shader->binary.reloc_count; i++) {
+   const struct radeon_shader_reloc *reloc = 
&shader->binary.relocs[i];
if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
-   util_memcpy_cpu_to_le32(ptr + reloc->offset,
+   util_memcpy_cpu_to_le32(shader->binary.code + 
reloc->offset,
&scratch_rsrc_dword0, 4);
} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-   util_memcpy_cpu_to_le32(ptr + reloc->offset,
+   util_memcpy_cpu_to_le32(shader->binary.code + 
reloc->offset,
&scratch_rsrc_dword1, 4);
}
}
-   sscree

[Mesa-dev] [PATCH 5/5] radeonsi: Enable VGPR spilling for all shader types v5

2015-01-27 Thread Tom Stellard
v2:
  - Only emit write SPI_TMPRING_SIZE once per packet.
  - Use context global scratch buffer.

v3:
  - Patch shaders using WRITE_DATA packet instead of map/unmap.
  - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and
VS_PARTIAL_FLUSH when patching shaders.

v4:
  - Code cleanups.
  - Remove unnecessary multiplies.

v5:
  - Patch shaders in system memory and re-upload to vram.
---
 src/gallium/drivers/radeonsi/si_compute.c   |  44 ++--
 src/gallium/drivers/radeonsi/si_hw_context.c|   1 +
 src/gallium/drivers/radeonsi/si_pipe.c  |   9 +-
 src/gallium/drivers/radeonsi/si_pipe.h  |   6 ++
 src/gallium/drivers/radeonsi/si_shader.c|  59 --
 src/gallium/drivers/radeonsi/si_shader.h|   6 +-
 src/gallium/drivers/radeonsi/si_state_draw.c|  15 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 137 +++-
 8 files changed, 225 insertions(+), 52 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index d0966af..b525683 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -42,12 +42,6 @@
 #define NUM_USER_SGPRS 4
 #endif
 
-static const char *scratch_rsrc_dword0_symbol =
-   "SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
-   "SCRATCH_RSRC_DWORD1";
-
 struct si_compute {
struct si_context *ctx;
 
@@ -67,8 +61,6 @@ struct si_compute {
 #endif
 };
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-   struct si_shader *shader, uint64_t scratch_va);
 static void init_scratch_buffer(struct si_context *sctx, struct si_compute 
*program)
 {
unsigned scratch_bytes = 0;
@@ -83,7 +75,7 @@ static void init_scratch_buffer(struct si_context *sctx, 
struct si_compute *prog
program->shader.binary.global_symbol_offsets[i];
unsigned scratch_bytes_needed;
 
-   si_shader_binary_read_config(&program->shader.binary,
+   si_shader_binary_read_config(sctx->screen,
&program->shader, offset);
scratch_bytes_needed = program->shader.scratch_bytes_per_wave;
scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
@@ -106,8 +98,8 @@ static void init_scratch_buffer(struct si_context *sctx, 
struct si_compute *prog
program->shader.scratch_bytes_per_wave = scratch_bytes;
 
/* Patch the shader with the scratch buffer address. */
-   apply_scratch_relocs(sctx->screen, &program->shader, scratch_buffer_va);
-
+   si_shader_apply_scratch_relocs(sctx,
+   &program->shader, scratch_buffer_va);
 }
 
 static void *si_create_compute_state(
@@ -231,30 +223,6 @@ static unsigned compute_num_waves_for_scratch(
return scratch_waves;
 }
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-   struct si_shader *shader, uint64_t scratch_va) {
-   unsigned i;
-   uint32_t scratch_rsrc_dword0 = scratch_va & 0x;
-   uint32_t scratch_rsrc_dword1 =
-   S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-   |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
-
-   if (!shader->binary.reloc_count) {
-   return;
-   }
-
-   for (i = 0 ; i < shader->binary.reloc_count; i++) {
-   const struct radeon_shader_reloc *reloc = 
&shader->binary.relocs[i];
-   if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
-   util_memcpy_cpu_to_le32(shader->binary.code + 
reloc->offset,
-   &scratch_rsrc_dword0, 4);
-   } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-   util_memcpy_cpu_to_le32(shader->binary.code + 
reloc->offset,
-   &scratch_rsrc_dword1, 4);
-   }
-   }
-}
-
 static void si_launch_grid(
struct pipe_context *ctx,
const uint *block_layout, const uint *grid_layout,
@@ -299,7 +267,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
/* Read the config information */
-   si_shader_binary_read_config(&program->shader.binary, shader, pc);
+   si_shader_binary_read_config(sctx->screen, shader, pc);
 #endif
 
/* Upload the kernel arguments */
@@ -510,13 +478,15 @@ static void si_delete_compute_state(struct pipe_context 
*ctx, void* state){
LLVMContextDispose(program->llvm_ctx);
}
 #else
+   FREE(program->shader.binary.config);
+   FREE(program->shader.binary.rodata);
+   FREE(program->shader.binary.global_symbol_offsets);
si_shader_destroy(ctx, &program->shader);
 #endif
 
pipe_resource_reference(
(struct pipe_resource **)&program->input_buffer, NULL);
 
-   radeon_shader_binary_free_mem

[Mesa-dev] [PATCH 2/5] radeonsi/compute: Rename si_compute::program to si_compute::shader

2015-01-27 Thread Tom Stellard
---
 src/gallium/drivers/radeonsi/si_compute.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 4427d3b..ba63afd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -55,7 +55,7 @@ struct si_compute {
unsigned private_size;
unsigned input_size;
struct radeon_shader_binary binary;
-   struct si_shader program;
+   struct si_shader shader;
unsigned num_user_sgprs;
 
struct r600_resource *input_buffer;
@@ -103,7 +103,7 @@ static void *si_create_compute_state(
 #else
 
radeon_elf_read(code, header->num_bytes, &program->binary, true);
-   si_shader_binary_read(sctx->screen, &program->program, 
&program->binary);
+   si_shader_binary_read(sctx->screen, &program->shader, &program->binary);
 
 #endif
program->input_buffer = si_resource_create_custom(sctx->b.b.screen,
@@ -231,7 +231,7 @@ static void si_launch_grid(
uint64_t shader_va;
unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
unsigned i;
-   struct si_shader *shader = &program->program;
+   struct si_shader *shader = &program->shader;
unsigned lds_blocks;
unsigned num_waves_for_scratch;
 
@@ -256,7 +256,7 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
/* Read the config information */
-   si_shader_binary_read_config(&program->binary, &program->program, pc);
+   si_shader_binary_read_config(&program->binary, shader, pc);
 #endif
 
/* Upload the kernel arguments */
@@ -475,7 +475,7 @@ static void si_delete_compute_state(struct pipe_context 
*ctx, void* state){
LLVMContextDispose(program->llvm_ctx);
}
 #else
-   si_shader_destroy(ctx, &program->program);
+   si_shader_destroy(ctx, &program->shader);
 #endif
 
pipe_resource_reference(
-- 
2.0.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 1/3] nir: Add a pass to lower vector phi nodes to scalar phi nodes

2015-01-27 Thread Jason Ekstrand
v2 Jason Ekstrand :
 - Add better comments
 - Use nir_ssa_dest_init and nir_src_for_ssa more places
 - Fix some void * casts
---
 src/glsl/Makefile.sources   |   1 +
 src/glsl/nir/nir.h  |   2 +
 src/glsl/nir/nir_lower_phis_to_scalar.c | 251 
 3 files changed, 254 insertions(+)
 create mode 100644 src/glsl/nir/nir_lower_phis_to_scalar.c

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 96c4ec5..02d0780 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -28,6 +28,7 @@ NIR_FILES = \
nir/nir_lower_global_vars_to_local.c \
nir/nir_lower_locals_to_regs.c \
nir/nir_lower_io.c \
+   nir/nir_lower_phis_to_scalar.c \
nir/nir_lower_samplers.cpp \
nir/nir_lower_system_values.c \
nir/nir_lower_to_source_mods.c \
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 119ca01..cda14aa 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1523,6 +1523,8 @@ void nir_remove_dead_variables(nir_shader *shader);
 void nir_lower_vec_to_movs(nir_shader *shader);
 void nir_lower_alu_to_scalar(nir_shader *shader);
 
+void nir_lower_phis_to_scalar(nir_shader *shader);
+
 void nir_lower_samplers(nir_shader *shader,
 struct gl_shader_program *shader_program,
 struct gl_program *prog);
diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c 
b/src/glsl/nir/nir_lower_phis_to_scalar.c
new file mode 100644
index 000..d8c3f57
--- /dev/null
+++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *Jason Ekstrand (ja...@jlekstrand.net)
+ *
+ */
+
+#include "nir.h"
+
+/*
+ * Implements a pass that lowers vector phi nodes to scalar phi nodes when
+ * we don't think it will hurt anything.
+ */
+
+struct lower_phis_to_scalar_state {
+   void *mem_ctx;
+   void *dead_ctx;
+
+   /* Hash table marking which phi nodes are scalarizable.  The key is
+* pointers to phi instructions and the entry is either NULL for not
+* scalarizable or non-null for scalarizable.
+*/
+   struct hash_table *phi_table;
+};
+
+/* Determines if the given phi node should be lowered.  The only phi nodes
+ * we will scalarize at the moment are those where all of the sources are
+ * scalarizable.
+ *
+ * The reason for this comes down to coalescing.  Since phi sources can't
+ * swizzle, swizzles on phis have to be resolved by inserting a mov right
+ * before the phi.  The choice then becomes between movs to pick off
+ * components for a scalar phi or potentially movs to recombine components
+ * for a vector phi.  The problem is that the movs generated to pick off
+ * the components are almost uncoalescable.  We can't coalesce them in NIR
+ * because we need them to pick off components and we can't coalesce them
+ * in the backend because the source register is a vector and the
+ * destination is a scalar that may be used at other places in the program.
+ * On the other hand, if we have a bunch of scalars going into a vector
+ * phi, the situation is much better.  In this case, if the SSA def is
+ * generated in the predecessor block to the corresponding phi source, the
+ * backend code will be an ALU op into a temporary and then a mov into the
+ * given vector component;  this move can almost certainly be coalesced
+ * away.
+ */
+static bool
+should_lower_phi(nir_phi_instr *phi, struct lower_phis_to_scalar_state *state)
+{
+   /* Already scalar */
+   if (phi->dest.ssa.num_components == 1)
+  return false;
+
+   struct hash_entry *entry = _mesa_hash_table_search(state->phi_table, phi);
+   if (entry)
+  return entry->data != NULL;
+
+   nir_foreach_phi_src(phi, src) {
+  /* Don't know what to do with non-ssa sources */
+  if (!src->src.is_ssa)
+

[Mesa-dev] [Bug 84019] Request commit access to piglit and mesa

2015-01-27 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=84019

--- Comment #4 from Martin Peres  ---
Hello, the request is accepted. However, you gpg key is not currently visible.
I cannot create an account until this is solved.

PS: Sorry for the delay

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 1/3] nir: Add a pass to lower vector phi nodes to scalar phi nodes

2015-01-27 Thread Eric Anholt
Jason Ekstrand  writes:

> On Mon, Jan 26, 2015 at 3:58 PM, Eric Anholt  wrote:
>
>> Jason Ekstrand  writes:
>>
>> > On Mon, Jan 26, 2015 at 11:21 AM, Eric Anholt  wrote:
>> >
>> >> Jason Ekstrand  writes:
>> >> > +  case nir_instr_type_phi: {
>> >> > + nir_phi_instr *src_phi = nir_instr_as_phi(src_instr);
>> >> > +
>> >> > + /* Insert an entry and mark it as scalarizable for now. That
>> >> way
>> >> > +  * we don't recurse forever and a cycle in the depencence
>> graph
>> >> > +  * won't automatically make us fail to scalarize.
>> >> > +  */
>> >> > + entry = _mesa_hash_table_insert(state->phi_table, src_phi,
>> >> (void *)1);
>> >>
>> >> I expect "(void *)1" will give compiler warnings generally.  "(void
>> >> *)(uintptr_t)1" is the usual workaround.
>> >>
>> >> Useful comment, though!
>> >>
>> >> > + bool scalarizable = should_lower_phi(src_phi, state);
>> >> > + entry->data = (void *)scalarizable;
>> >>
>> >> No need to cast to void *, since not C++.
>> >>
>> >
>> > I'm confused.  You want more casting above but less here?  Could you
>> pleas
>> > be a little more specific.
>>
>> Yes, you have to add the cast above to avoid a compiler warning about
>> casting from a small int to a pointer, while casting to void * to assign
>> to a void * is only a c++ism
>>
>
> But I'm casting a bool to void *...

So the same thing: to go from something-that's-not-a-long, you cast
through uintptr_t first.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 1/3] nir: Add a pass to lower vector phi nodes to scalar phi nodes

2015-01-27 Thread Jason Ekstrand
oops, this should have been a v3

On Tue, Jan 27, 2015 at 1:32 PM, Jason Ekstrand 
wrote:

> v2 Jason Ekstrand :
>  - Add better comments
>  - Use nir_ssa_dest_init and nir_src_for_ssa more places
>  - Fix some void * casts
> ---
>  src/glsl/Makefile.sources   |   1 +
>  src/glsl/nir/nir.h  |   2 +
>  src/glsl/nir/nir_lower_phis_to_scalar.c | 251
> 
>  3 files changed, 254 insertions(+)
>  create mode 100644 src/glsl/nir/nir_lower_phis_to_scalar.c
>
> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
> index 96c4ec5..02d0780 100644
> --- a/src/glsl/Makefile.sources
> +++ b/src/glsl/Makefile.sources
> @@ -28,6 +28,7 @@ NIR_FILES = \
> nir/nir_lower_global_vars_to_local.c \
> nir/nir_lower_locals_to_regs.c \
> nir/nir_lower_io.c \
> +   nir/nir_lower_phis_to_scalar.c \
> nir/nir_lower_samplers.cpp \
> nir/nir_lower_system_values.c \
> nir/nir_lower_to_source_mods.c \
> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
> index 119ca01..cda14aa 100644
> --- a/src/glsl/nir/nir.h
> +++ b/src/glsl/nir/nir.h
> @@ -1523,6 +1523,8 @@ void nir_remove_dead_variables(nir_shader *shader);
>  void nir_lower_vec_to_movs(nir_shader *shader);
>  void nir_lower_alu_to_scalar(nir_shader *shader);
>
> +void nir_lower_phis_to_scalar(nir_shader *shader);
> +
>  void nir_lower_samplers(nir_shader *shader,
>  struct gl_shader_program *shader_program,
>  struct gl_program *prog);
> diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c
> b/src/glsl/nir/nir_lower_phis_to_scalar.c
> new file mode 100644
> index 000..d8c3f57
> --- /dev/null
> +++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
> @@ -0,0 +1,251 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> next
> + * paragraph) shall be included in all copies or substantial portions of
> the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *Jason Ekstrand (ja...@jlekstrand.net)
> + *
> + */
> +
> +#include "nir.h"
> +
> +/*
> + * Implements a pass that lowers vector phi nodes to scalar phi nodes when
> + * we don't think it will hurt anything.
> + */
> +
> +struct lower_phis_to_scalar_state {
> +   void *mem_ctx;
> +   void *dead_ctx;
> +
> +   /* Hash table marking which phi nodes are scalarizable.  The key is
> +* pointers to phi instructions and the entry is either NULL for not
> +* scalarizable or non-null for scalarizable.
> +*/
> +   struct hash_table *phi_table;
> +};
> +
> +/* Determines if the given phi node should be lowered.  The only phi nodes
> + * we will scalarize at the moment are those where all of the sources are
> + * scalarizable.
> + *
> + * The reason for this comes down to coalescing.  Since phi sources can't
> + * swizzle, swizzles on phis have to be resolved by inserting a mov right
> + * before the phi.  The choice then becomes between movs to pick off
> + * components for a scalar phi or potentially movs to recombine components
> + * for a vector phi.  The problem is that the movs generated to pick off
> + * the components are almost uncoalescable.  We can't coalesce them in NIR
> + * because we need them to pick off components and we can't coalesce them
> + * in the backend because the source register is a vector and the
> + * destination is a scalar that may be used at other places in the
> program.
> + * On the other hand, if we have a bunch of scalars going into a vector
> + * phi, the situation is much better.  In this case, if the SSA def is
> + * generated in the predecessor block to the corresponding phi source, the
> + * backend code will be an ALU op into a temporary and then a mov into the
> + * given vector component;  this move can almost certainly be coalesced
> + * away.
> + */
> +static bool
> +should_lower_phi(nir_phi_instr *phi, struct lower_phis_to_scalar_state
> *state)
> +{
> +   /* Alre

Re: [Mesa-dev] [PATCH v2 1/3] nir: Add a pass to lower vector phi nodes to scalar phi nodes

2015-01-27 Thread Jason Ekstrand
On Tue, Jan 27, 2015 at 2:01 PM, Eric Anholt  wrote:

> Jason Ekstrand  writes:
>
> > On Mon, Jan 26, 2015 at 3:58 PM, Eric Anholt  wrote:
> >
> >> Jason Ekstrand  writes:
> >>
> >> > On Mon, Jan 26, 2015 at 11:21 AM, Eric Anholt 
> wrote:
> >> >
> >> >> Jason Ekstrand  writes:
> >> >> > +  case nir_instr_type_phi: {
> >> >> > + nir_phi_instr *src_phi = nir_instr_as_phi(src_instr);
> >> >> > +
> >> >> > + /* Insert an entry and mark it as scalarizable for now.
> That
> >> >> way
> >> >> > +  * we don't recurse forever and a cycle in the depencence
> >> graph
> >> >> > +  * won't automatically make us fail to scalarize.
> >> >> > +  */
> >> >> > + entry = _mesa_hash_table_insert(state->phi_table,
> src_phi,
> >> >> (void *)1);
> >> >>
> >> >> I expect "(void *)1" will give compiler warnings generally.  "(void
> >> >> *)(uintptr_t)1" is the usual workaround.
> >> >>
> >> >> Useful comment, though!
> >> >>
> >> >> > + bool scalarizable = should_lower_phi(src_phi, state);
> >> >> > + entry->data = (void *)scalarizable;
> >> >>
> >> >> No need to cast to void *, since not C++.
> >> >>
> >> >
> >> > I'm confused.  You want more casting above but less here?  Could you
> >> pleas
> >> > be a little more specific.
> >>
> >> Yes, you have to add the cast above to avoid a compiler warning about
> >> casting from a small int to a pointer, while casting to void * to assign
> >> to a void * is only a c++ism
> >>
> >
> > But I'm casting a bool to void *...
>
> So the same thing: to go from something-that's-not-a-long, you cast
> through uintptr_t first.
>

Yup, figured.  All that should be fixed in the v3 I just sent.  (It's
accidentally named v2)
--Jason
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 84019] Request commit access to piglit and mesa

2015-01-27 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=84019

--- Comment #5 from Martin Peres  ---
The problem may be on my side. Sorry for the noise.

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 05/11] mesa: Validate internal format and format type first to provide accurate error code

2015-01-27 Thread Anuj Phogat
On Tue, Jan 27, 2015 at 9:09 AM, Eduardo Lima Mitev  wrote:
> On 01/21/2015 03:26 AM, Anuj Phogat wrote:
>> On Mon, Jan 19, 2015 at 3:32 AM, Eduardo Lima Mitev  wrote:
>>> The specification states that glTexImage2D and glTexImage3D should return
>>> GL_INVALID_VALUE if the internal format is invalid, and GL_INVALID_ENUM is
>>> the format type is invalid. However, current error check only considers the
>>> combination of format, type and internal format; which returns a
>>> GL_INVALID_OPERATION error when invalid.
>>>
>> I did a quick search in es 3.0.4 spec but couldn't find the reference. Could 
>> you
>> point me to the reference in the spec and may be add it as a comment in the
>> code?
>>
>
> Hi,
>
> This behavior is described in the man pages, not any specification, so
> it is a mistake. I thoroughly reviewed GLES 3.0.0, 3.0.4, 3.1 and 2.0
> (just in case) and there is no explicit mention to this behavior.
>
> The closest I could find was this text in section 8.5. TEXTURE IMAGE
> SPECIFICATION (page 203) of the OpenGL 4.5 spec:
>
> "An INVALID_VALUE error is generated if internalformat is not one of
> the valid values described above."
>
> (And "above" there is a large definition of all the valid internal
> formats, combinations, etc).
>
> So I wonder if there is an implicitly defined behavior that would
> justify this wording in the man page for TexImage3D/2D:
>
> * GL_INVALID_ENUM is generated if type is not a type constant.
> * GL_INVALID_VALUE is generated if internalFormat is not one of the
> accepted resolution and format symbolic constants.
>
> I know that the man pages are not a reliable source to define API
> behavior, but since apparently many dEQP tests are written against the
> man pages, it would be useful to understand better the link (if exists)
> between the man pages and the corresponding wording in a specification.
>
None of the tests in Khronos ES 3.0 CTS expect above errors. I think a
Khronos bug can be raised asking for clarification. Then we'll know it's
an issue with dEQP tests.

> I will withdraw this patch in the mean time.
>
> Thank you,
> Eduardo
>
>>> Fixes 2 dEQP tests:
>>> * dEQP-GLES3.functional.negative_api.texture.teximage2d
>>> * dEQP-GLES3.functional.negative_api.texture.teximage3d
>>> ---
>
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] i965/tex: Don't create read-write textures with non-renderable formats

2015-01-27 Thread Jason Ekstrand
I haven't actually seen this bug in the wild, but it's possible that
someone could ask to do a ST3C PBO download or something.  This protects us
from accidentally creating a render target with a compressed or otherwise
non-renderable format.
---
 src/mesa/drivers/dri/i965/intel_tex.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/intel_tex.c 
b/src/mesa/drivers/dri/i965/intel_tex.c
index 184702f..2d3009a 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -323,6 +323,11 @@ intel_set_texture_storage_for_buffer_object(struct 
gl_context *ctx,
  perf_debug("Bad PBO alignment; fallback to CPU mapping\n");
  return false;
   }
+
+  if (!brw->format_supported_as_render_target[image->TexFormat]) {
+ perf_debug("Non-renderable PBO format; fallback to CPU mapping\n");
+ return false;
+  }
}
 
assert(intel_texobj->mt == NULL);
-- 
2.2.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] i965/gen8: Include the buffer offset when emitting renderbuffer relocs

2015-01-27 Thread Jason Ekstrand
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88792
---

Yes, computing it from the surface state and bo->offset64 is a little
strange when we could just be using mt->offset.  However, that's the way
it's done on all the other gens so I decided to be consistent.

 src/mesa/drivers/dri/i965/gen8_surface_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 45c35db..16b5a7e 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -450,7 +450,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
drm_intel_bo_emit_reloc(brw->batch.bo,
brw->wm.base.surf_offset[surf_index] + 8 * 4,
mt->bo,
-   0,
+  *((uint64_t *) &surf[8]) - mt->bo->offset64,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER);
 }
-- 
2.2.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 01/22] glsl: Reorder optimization-passes

2015-01-27 Thread Matt Turner
On Sat, Jan 17, 2015 at 10:33 AM, Matt Turner  wrote:
> On Sat, Jan 17, 2015 at 8:31 AM, Thomas Helland
>  wrote:
>> 2015-01-03 22:48 GMT+01:00 Matt Turner :
>>> On Sat, Jan 3, 2015 at 11:18 AM, Thomas Helland
>>>  wrote:
 This allows opt_algebraic to resolve open-coded
 saturates into ir_unop_saturate before we potentially
 mess it up by removing the min or max in min/max-pruning.

 Since we are now emitting more free saturates on i965
 this gives us some decrease in instruction count.

 total instructions in shared programs: 1317459 -> 1317065 (-0.03%)
 instructions in affected programs: 4084 -> 3690 (-9.65%)
 GAINED:0
 LOST:  0
>>>
>>> You're definitely onto something here. On our collection of shaders:
>>>
>>> total instructions in shared programs: 5876617 -> 5875919 (-0.01%)
>>> instructions in affected programs: 9443 -> 8745 (-7.39%)
>>>
>>> with some fragment shaders hurt in Natural Selection 2 and Kerbal Space 
>>> program.
>>>
>>> I'll investigate these.
>>
>> Hi Matt,
>>
>> Don't want to be a nuisance (if that is even the right word?
>> English is not my native tongue), but did you find the
>> time to look at these regressions?
>
> Nuisance is indeed the right word, but you are not being one. :)
>
> I'll definitely look into this. Sorry that I haven't had a chance yet.

I've looked at three hurt shaders now:

kerbal-space-program/556 adds a useless max(x, 0) on top of a sat(),
which will obviously be handled by a later patch in this series.

In natural-selection-2/757 (and presumably 13093 and 13096), two
consecutive assignments' RHS contained (x * (y * z)) and (w * (y * z))
allowing the i965 backend to CSE the y*z. After this patch for some
reason we get ((x * y) * z) and ((w * y) * z) which means we gain one
multiply instruction. Not concerned about that.

unity/42 and 62 (which are now available to you in the open
shader-db!) are also regressed. I looked at 42, but it's really hard
to tell what's going on. In fact, parts of it look like they should be
improved (we recognize lrp in a few new places). They're actually
still hurt after the series through patch 19 (I haven't tested the
later ones). They'd be interesting for you to look at.

Actually, looking at the results for just this patch, I see a bunch of
CSGO & Dota2 shaders around 30 instructions that are cut by about 15%
that I cannot explain. The change in IR doesn't look legal. It might
be exposing a bug elsewhere. I'll have to take a look. :(
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/7] clover: Dump the OpenCL C code earlier

2015-01-27 Thread Francisco Jerez
EdB  writes:

> ---
>  src/gallium/state_trackers/clover/llvm/invocation.cpp | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
> b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> index 10dbe08..510e195 100644
> --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
> +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> @@ -709,6 +709,9 @@ clover::compile_program_llvm(const compat::string &source,
> llvm_ctx.setDiagnosticHandler(diagnostic_handler, &r_log);
>  #endif
>  
> +   if (debug_flags & DBG_CLC)
> +  debug_log(source, ".cl");
> +
> // The input file name must have the .cl extension in order for the
> // CompilerInvocation class to recognize it as an OpenCL source file.
> llvm::Module *mod = compile_llvm(llvm_ctx, source, headers, "input.cl",
> @@ -719,9 +722,6 @@ clover::compile_program_llvm(const compat::string &source,
>  
> optimize(mod, optimization_level, kernels);
>  
> -   if (debug_flags & DBG_CLC)
> -  debug_log(source, ".cl");
> -
> if (debug_flags & DBG_LLVM) {
>std::string log;
>llvm::raw_string_ostream s_log(log);

Thanks, I've pushed patches 1 and 2.

> -- 
> 2.2.0
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


pgppOdDymznCJ.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v4 3/3] i965/fs_nir: Get rid of get_alu_src

2015-01-27 Thread Jason Ekstrand
Originally, get_alu_src was supposed to handle resolving swizzles and
things like that.  However, now that basically every instruction we have
only takes scalar sources, we don't really need it anymore.  The only case
where it's still marginally useful is for the mov and vecN operations that
are left over from SSA form.  We can handle those cases as a special case
easily enough.  As a side-effect, we don't need the vec_to_movs pass
anymore.
---
 src/mesa/drivers/dri/i965/brw_fs.h   |   1 -
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 153 +++
 2 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 25197cd..b95e2c0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -589,7 +589,6 @@ public:
void nir_emit_texture(nir_tex_instr *instr);
void nir_emit_jump(nir_jump_instr *instr);
fs_reg get_nir_src(nir_src src);
-   fs_reg get_nir_alu_src(nir_alu_instr *instr, unsigned src);
fs_reg get_nir_dest(nir_dest dest);
void emit_percomp(fs_inst *inst, unsigned wr_mask);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 217fe09..92299db 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -140,8 +140,6 @@ fs_visitor::emit_nir_code()
 
nir_convert_from_ssa(nir);
nir_validate_shader(nir);
-   nir_lower_vec_to_movs(nir);
-   nir_validate_shader(nir);
 
/* emit the arrays used for inputs and outputs - load/store intrinsics will
 * be converted to reads/writes of these arrays
@@ -419,6 +417,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
 void
 fs_visitor::nir_emit_cf_list(exec_list *list)
 {
+   exec_list_validate(list);
foreach_list_typed(nir_cf_node, node, node, list) {
   switch (node->type) {
   case nir_cf_node_if:
@@ -540,34 +539,117 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 
-   fs_reg op[3];
fs_reg result = get_nir_dest(instr->dest.dest);
result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);
 
-   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-  op[i] = get_nir_alu_src(instr, i);
+   fs_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+  op[i] = get_nir_src(instr->src[i].src);
+  op[i].type = 
brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]);
+  op[i].abs = instr->src[i].abs;
+  op[i].negate = instr->src[i].negate;
+   }
+
+   /* We get a bunch of mov's out of the from_ssa pass and they may still
+* be vectorized.  We'll handle them as a special-case.  We'll also
+* handle vecN here because it's basically the same thing.
+*/
+   bool need_extra_copy = false;
+   switch (instr->op) {
+   case nir_op_vec4:
+  if (!instr->src[3].src.is_ssa &&
+  instr->dest.dest.reg.reg == instr->src[3].src.reg.reg)
+ need_extra_copy = true;
+  /* fall through */
+   case nir_op_vec3:
+  if (!instr->src[2].src.is_ssa &&
+  instr->dest.dest.reg.reg == instr->src[2].src.reg.reg)
+ need_extra_copy = true;
+  /* fall through */
+   case nir_op_vec2:
+  if (!instr->src[1].src.is_ssa &&
+  instr->dest.dest.reg.reg == instr->src[1].src.reg.reg)
+ need_extra_copy = true;
+  /* fall through */
+   case nir_op_imov:
+   case nir_op_fmov: {
+  if (!instr->src[0].src.is_ssa &&
+  instr->dest.dest.reg.reg == instr->src[0].src.reg.reg)
+ need_extra_copy = true;
+
+  fs_reg temp;
+  if (need_extra_copy) {
+ temp = retype(vgrf(4), result.type);
+  } else {
+ temp = result;
+  }
+
+  if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
+ for (unsigned i = 0; i < 4; i++) {
+if (!(instr->dest.write_mask & (1 << i)))
+   continue;
+
+emit(MOV(offset(temp, i),
+ offset(op[0], instr->src[0].swizzle[i])))
+->saturate = instr->dest.saturate;
+ }
+  } else {
+ for (unsigned i = 0; i < 4; i++) {
+if (!(instr->dest.write_mask & (1 << i)))
+   continue;
+
+emit(MOV(offset(temp, i),
+ offset(op[i], instr->src[i].swizzle[0])))
+->saturate = instr->dest.saturate;
+ }
+  }
+
+  /* In this case the source and destination registers were the same,
+   * so we need to insert an extra set of moves in order to deal with
+   * any swizzling.
+   */
+  if (need_extra_copy) {
+ for (unsigned i = 0; i < 4; i++) {
+if (!(instr->dest.write_mask & (1 << i)))
+   continue;
+
+emit(MOV(offset(result, i), offset(temp, i)));
+ }
+  }
+  return;
+   }
+   default:
+  brea

[Mesa-dev] [PATCH v4 2/3] i965/fs: Use NIR's scalarizing abilities and stop handling vectors

2015-01-27 Thread Jason Ekstrand
Now that we can scalarize with NIR, there's no need for all this code
anymore.  Let's get rid of it and just do scalar operations.

v2: run copy prop before lowering phi nodes
---
 src/mesa/drivers/dri/i965/brw_fs.h   |  15 -
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 489 ++-
 2 files changed, 158 insertions(+), 346 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 84e0b9e..25197cd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -592,21 +592,6 @@ public:
fs_reg get_nir_alu_src(nir_alu_instr *instr, unsigned src);
fs_reg get_nir_dest(nir_dest dest);
void emit_percomp(fs_inst *inst, unsigned wr_mask);
-   void emit_percomp(enum opcode op, fs_reg dest, fs_reg src0,
- unsigned wr_mask, bool saturate = false,
- enum brw_predicate predicate = BRW_PREDICATE_NONE,
- enum brw_conditional_mod mod = BRW_CONDITIONAL_NONE);
-   void emit_percomp(enum opcode op, fs_reg dest, fs_reg src0, fs_reg src1,
- unsigned wr_mask, bool saturate = false,
- enum brw_predicate predicate = BRW_PREDICATE_NONE,
- enum brw_conditional_mod mod = BRW_CONDITIONAL_NONE);
-   void emit_math_percomp(enum opcode op, fs_reg dest, fs_reg src0,
-  unsigned wr_mask, bool saturate = false);
-   void emit_math_percomp(enum opcode op, fs_reg dest, fs_reg src0,
-  fs_reg src1, unsigned wr_mask,
-  bool saturate = false);
-   void emit_reduction(enum opcode op, fs_reg dest, fs_reg src,
-   unsigned num_components);
 
int setup_color_payload(fs_reg *dst, fs_reg color, unsigned components);
void emit_alpha_test();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index de0d780..217fe09 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -36,6 +36,10 @@ nir_optimize(nir_shader *nir)
   nir_validate_shader(nir);
   progress |= nir_copy_prop(nir);
   nir_validate_shader(nir);
+  nir_lower_phis_to_scalar(nir);
+  nir_validate_shader(nir);
+  progress |= nir_copy_prop(nir);
+  nir_validate_shader(nir);
   progress |= nir_opt_dce(nir);
   nir_validate_shader(nir);
   progress |= nir_opt_cse(nir);
@@ -85,6 +89,9 @@ fs_visitor::emit_nir_code()
nir_split_var_copies(nir);
nir_validate_shader(nir);
 
+   nir_lower_alu_to_scalar(nir);
+   nir_validate_shader(nir);
+
nir_optimize(nir);
 
/* Lower a bunch of stuff */
@@ -540,20 +547,30 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
   op[i] = get_nir_alu_src(instr, i);
 
+   if (nir_op_infos[instr->op].output_size == 0) {
+  /* We've already scalarized, so we know that we only have one
+   * channel.  The only question is which channel.
+   */
+  assert(_mesa_bitcount(instr->dest.write_mask) == 1);
+  unsigned off = ffs(instr->dest.write_mask) - 1;
+  result = offset(result, off);
+
+  for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
+ op[i] = offset(op[i], off);
+   }
+
switch (instr->op) {
case nir_op_fmov:
case nir_op_i2f:
-   case nir_op_u2f: {
-  fs_inst *inst = MOV(result, op[0]);
-  inst->saturate = instr->dest.saturate;
-  emit_percomp(inst, instr->dest.write_mask);
-   }
+   case nir_op_u2f:
+  emit(MOV(result, op[0]))
+  ->saturate = instr->dest.saturate;
   break;
 
case nir_op_imov:
case nir_op_f2i:
case nir_op_f2u:
-  emit_percomp(MOV(result, op[0]), instr->dest.write_mask);
+  emit(MOV(result, op[0]));
   break;
 
case nir_op_fsign: {
@@ -562,55 +579,46 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
  * Predicated OR ORs 1.0 (0x3f80) with the sign bit if val is not
  * zero.
  */
-  emit_percomp(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ),
-   instr->dest.write_mask);
+  emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 
   fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
   op[0].type = BRW_REGISTER_TYPE_UD;
   result.type = BRW_REGISTER_TYPE_UD;
-  emit_percomp(AND(result_int, op[0], fs_reg(0x8000u)),
-   instr->dest.write_mask);
+  emit(AND(result_int, op[0], fs_reg(0x8000u)));
 
-  fs_inst *inst = OR(result_int, result_int, fs_reg(0x3f80u));
-  inst->predicate = BRW_PREDICATE_NORMAL;
-  emit_percomp(inst, instr->dest.write_mask);
+  emit(OR(result_int, result_int, fs_reg(0x3f80u)))
+  ->predicate = BRW_PREDICATE_NORMAL;
   if (instr->dest.saturate) {
- fs_inst *inst = MOV(result, result);
- inst->saturate = true

[Mesa-dev] [PATCH v4 1/3] nir: Add a pass to lower vector phi nodes to scalar phi nodes

2015-01-27 Thread Jason Ekstrand
v2 Jason Ekstrand :
 - Add better comments
 - Use nir_ssa_dest_init and nir_src_for_ssa more places
 - Fix some void * casts

v3 Jason Ekstrand :
 - Rework the way we determine whether or not to sccalarize a phi node to
   make the recursion non-bogus
 - Treat load_const instructions as scalarizable
---
 src/glsl/Makefile.sources   |   1 +
 src/glsl/nir/nir.h  |   2 +
 src/glsl/nir/nir_lower_phis_to_scalar.c | 266 
 3 files changed, 269 insertions(+)
 create mode 100644 src/glsl/nir/nir_lower_phis_to_scalar.c

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index face22e..bf6b70b 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -31,6 +31,7 @@ NIR_FILES = \
nir/nir_lower_global_vars_to_local.c \
nir/nir_lower_locals_to_regs.c \
nir/nir_lower_io.c \
+   nir/nir_lower_phis_to_scalar.c \
nir/nir_lower_samplers.cpp \
nir/nir_lower_system_values.c \
nir/nir_lower_to_source_mods.c \
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 980fdd0..4f58eee 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1526,6 +1526,8 @@ void nir_remove_dead_variables(nir_shader *shader);
 void nir_lower_vec_to_movs(nir_shader *shader);
 void nir_lower_alu_to_scalar(nir_shader *shader);
 
+void nir_lower_phis_to_scalar(nir_shader *shader);
+
 void nir_lower_samplers(nir_shader *shader,
 struct gl_shader_program *shader_program,
 struct gl_program *prog);
diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c 
b/src/glsl/nir/nir_lower_phis_to_scalar.c
new file mode 100644
index 000..a94b8b0
--- /dev/null
+++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *Jason Ekstrand (ja...@jlekstrand.net)
+ *
+ */
+
+#include "nir.h"
+
+/*
+ * Implements a pass that lowers vector phi nodes to scalar phi nodes when
+ * we don't think it will hurt anything.
+ */
+
+struct lower_phis_to_scalar_state {
+   void *mem_ctx;
+   void *dead_ctx;
+
+   /* Hash table marking which phi nodes are scalarizable.  The key is
+* pointers to phi instructions and the entry is either NULL for not
+* scalarizable or non-null for scalarizable.
+*/
+   struct hash_table *phi_table;
+};
+
+static bool
+should_lower_phi(nir_phi_instr *phi, struct lower_phis_to_scalar_state *state);
+
+static bool
+is_phi_src_scalarizable(nir_phi_src *src,
+struct lower_phis_to_scalar_state *state)
+{
+   /* Don't know what to do with non-ssa sources */
+   if (!src->src.is_ssa)
+  return false;
+
+   nir_instr *src_instr = src->src.ssa->parent_instr;
+   switch (src_instr->type) {
+   case nir_instr_type_alu: {
+  nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);
+
+  /* ALU operations with output_size == 0 should be scalarized.  We
+   * will also see a bunch of vecN operations from scalarizing ALU
+   * operations and, since they can easily be copy-propagated, they
+   * are ok too.
+   */
+  return nir_op_infos[src_alu->op].output_size == 0 ||
+ src_alu->op != nir_op_vec2 ||
+ src_alu->op != nir_op_vec3 ||
+ src_alu->op != nir_op_vec4;
+   }
+
+   case nir_instr_type_phi:
+  /* A phi is scalarizable if we're going to lower it */
+  return should_lower_phi(nir_instr_as_phi(src_instr), state);
+
+   case nir_instr_type_load_const:
+  /* These are trivially scalarizable */
+  return true;
+
+   default:
+  /* We can't scalarize this type of instruction */
+  return false;
+   }
+}
+
+/* Determines if the given phi node should be lowered.  The only phi nodes
+ * we will scalarize at the moment are those where all of the sources are
+ * scalarizable.
+ 

Re: [Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

2015-01-27 Thread Michel Dänzer
On 27.01.2015 20:00, Marek Olšák wrote:
> 
> The old pm4 state is leaked though.

Right, I wanted to mention that as well but forgot. :}


> The attached diff should fix it.

Looks good.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: fix display list 8-byte alignment issue

2015-01-27 Thread Brian Paul
The _mesa_dlist_alloc() function is only guaranteed to return a pointer
with 4-byte alignment.  On 64-bit systems which don't support unaligned
loads (e.g. SPARC or MIPS) this could lead to a bus error in the VBO code.

The solution is to add a new  _mesa_dlist_alloc_aligned() function which
will return a pointer to an 8-byte aligned address on 64-bit systems.
This is accomplished by inserting a 4-byte NOP instruction in the display
list when needed.

The only place this actually matters is the VBO code where we need to
allocate a 'struct vbo_save_vertex_list' which needs to be 8-byte
aligned (just as if it were malloc'd).

The gears demo and others hit this bug.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88662
Cc: "10.4" 
---
 src/mesa/main/dlist.c   | 72 +
 src/mesa/main/dlist.h   |  3 ++
 src/mesa/vbo/vbo_save_api.c |  5 +++-
 3 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 138d360..dc6070b 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -487,6 +487,7 @@ typedef enum
/* The following three are meta instructions */
OPCODE_ERROR,/* raise compiled-in error */
OPCODE_CONTINUE,
+   OPCODE_NOP,  /* No-op (used for 8-byte alignment */
OPCODE_END_OF_LIST,
OPCODE_EXT_0
 } OpCode;
@@ -1012,13 +1013,16 @@ memdup(const void *src, GLsizei bytes)
  * Allocate space for a display list instruction (opcode + payload space).
  * \param opcode  the instruction opcode (OPCODE_* value)
  * \param bytes   instruction payload size (not counting opcode)
- * \return pointer to allocated memory (the opcode space)
+ * \param align8  does the payload need to be 8-byte aligned?
+ *This is only relevant in 64-bit environments.
+ * \return pointer to allocated memory (the payload will be at pointer+1)
  */
 static Node *
-dlist_alloc(struct gl_context *ctx, OpCode opcode, GLuint bytes)
+dlist_alloc(struct gl_context *ctx, OpCode opcode, GLuint bytes, bool align8)
 {
const GLuint numNodes = 1 + (bytes + sizeof(Node) - 1) / sizeof(Node);
const GLuint contNodes = 1 + POINTER_DWORDS;  /* size of continue info */
+   GLuint nopNode;
Node *n;
 
if (opcode < OPCODE_EXT_0) {
@@ -1032,7 +1036,19 @@ dlist_alloc(struct gl_context *ctx, OpCode opcode, 
GLuint bytes)
   }
}
 
-   if (ctx->ListState.CurrentPos + numNodes + contNodes > BLOCK_SIZE) {
+   if (sizeof(void *) == 8 && align8 && ctx->ListState.CurrentPos % 2 == 0) {
+  /* The opcode would get placed at node[0] and the payload would start
+   * at node[1].  But the payload needs to be at an even offset (8-byte
+   * multiple).
+   */
+  nopNode = 1;
+   }
+   else {
+  nopNode = 0;
+   }
+
+   if (ctx->ListState.CurrentPos + nopNode + numNodes + contNodes
+   > BLOCK_SIZE) {
   /* This block is full.  Allocate a new block and chain to it */
   Node *newblock;
   n = ctx->ListState.CurrentBlock + ctx->ListState.CurrentPos;
@@ -1042,13 +1058,34 @@ dlist_alloc(struct gl_context *ctx, OpCode opcode, 
GLuint bytes)
  _mesa_error(ctx, GL_OUT_OF_MEMORY, "Building display list");
  return NULL;
   }
+
+  /* a fresh block should be 8-byte aligned on 64-bit systems */
+  assert(((GLintptr) newblock) % sizeof(void *) == 0);
+
   save_pointer(&n[1], newblock);
   ctx->ListState.CurrentBlock = newblock;
   ctx->ListState.CurrentPos = 0;
+
+  /* Display list nodes are always 4 bytes.  If we need 8-byte alignment
+   * we have to insert a NOP so that the payload of the real opcode lands
+   * on an even location:
+   *   node[0] = OPCODE_NOP
+   *   node[1] = OPCODE_x;
+   *   node[2] = start of payload
+   */
+  nopNode = sizeof(void *) == 8 && align8;
}
 
n = ctx->ListState.CurrentBlock + ctx->ListState.CurrentPos;
-   ctx->ListState.CurrentPos += numNodes;
+   if (nopNode) {
+  assert(ctx->ListState.CurrentPos % 2 == 0); /* even value */
+  n[0].opcode = OPCODE_NOP;
+  n++;
+  /* The "real" opcode will now be at an odd location and the payload
+   * will be at an even location.
+   */
+   }
+   ctx->ListState.CurrentPos += nopNode + numNodes;
 
n[0].opcode = opcode;
 
@@ -1069,7 +1106,22 @@ dlist_alloc(struct gl_context *ctx, OpCode opcode, 
GLuint bytes)
 void *
 _mesa_dlist_alloc(struct gl_context *ctx, GLuint opcode, GLuint bytes)
 {
-   Node *n = dlist_alloc(ctx, (OpCode) opcode, bytes);
+   Node *n = dlist_alloc(ctx, (OpCode) opcode, bytes, false);
+   if (n)
+  return n + 1;  /* return pointer to payload area, after opcode */
+   else
+  return NULL;
+}
+
+
+/**
+ * Same as _mesa_dlist_alloc(), but return a pointer which is 8-byte
+ * aligned in 64-bit environments, 4-byte aligned otherwise.
+ */
+void *
+_mesa_dlist_alloc_aligned(struct gl_context *ctx, GLuint opcode, GLuint bytes)
+{
+   Node *n = dli

Re: [Mesa-dev] [PATCH 1/2] i965/gen8: Include the buffer offset when emitting renderbuffer relocs

2015-01-27 Thread Kenneth Graunke
On Tuesday, January 27, 2015 02:32:19 PM Jason Ekstrand wrote:
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88792
> ---
> 
> Yes, computing it from the surface state and bo->offset64 is a little
> strange when we could just be using mt->offset.  However, that's the way
> it's done on all the other gens so I decided to be consistent.

Yeah...I've never understood why that was done.  It seems silly to me.
Plus, with the uint64_t casts, it's even uglier.

How about just supplying mt->offset?

Either way,
Reviewed-by: Kenneth Graunke 
and thanks for fixing this!

>  src/mesa/drivers/dri/i965/gen8_surface_state.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
> b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> index 45c35db..16b5a7e 100644
> --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> @@ -450,7 +450,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
> drm_intel_bo_emit_reloc(brw->batch.bo,
> brw->wm.base.surf_offset[surf_index] + 8 * 4,
> mt->bo,
> -   0,
> +*((uint64_t *) &surf[8]) - mt->bo->offset64,

   ^^^ Tabs?!

> I915_GEM_DOMAIN_RENDER,
> I915_GEM_DOMAIN_RENDER);
>  }

signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] i965/tex: Don't create read-write textures with non-renderable formats

2015-01-27 Thread Kenneth Graunke
On Tuesday, January 27, 2015 02:32:20 PM Jason Ekstrand wrote:
> I haven't actually seen this bug in the wild, but it's possible that
> someone could ask to do a ST3C PBO download or something.  This protects us

S3TC (typo) 

> from accidentally creating a render target with a compressed or otherwise
> non-renderable format.

Reviewed-by: Kenneth Graunke 

> ---
>  src/mesa/drivers/dri/i965/intel_tex.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/intel_tex.c 
> b/src/mesa/drivers/dri/i965/intel_tex.c
> index 184702f..2d3009a 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex.c
> @@ -323,6 +323,11 @@ intel_set_texture_storage_for_buffer_object(struct 
> gl_context *ctx,
>   perf_debug("Bad PBO alignment; fallback to CPU mapping\n");
>   return false;
>}
> +
> +  if (!brw->format_supported_as_render_target[image->TexFormat]) {
> + perf_debug("Non-renderable PBO format; fallback to CPU mapping\n");
> + return false;
> +  }
> }
>  
> assert(intel_texobj->mt == NULL);

signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/5] radeonsi: Enable VGPR spilling for all shader types v5

2015-01-27 Thread Michel Dänzer
On 28.01.2015 04:05, Tom Stellard wrote:
> v2:
>   - Only emit write SPI_TMPRING_SIZE once per packet.
>   - Use context global scratch buffer.
> 
> v3:
>   - Patch shaders using WRITE_DATA packet instead of map/unmap.
>   - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and
> VS_PARTIAL_FLUSH when patching shaders.
> 
> v4:
>   - Code cleanups.
>   - Remove unnecessary multiplies.
> 
> v5:
>   - Patch shaders in system memory and re-upload to vram.

[...]

> @@ -158,6 +159,12 @@ static struct pipe_context *si_create_context(struct 
> pipe_screen *screen, void *
>sctx->null_const_buf.buffer->width0, 0, 
> false);
>   }
>  
> + /* XXX: This is the maximum value allowed.  I'm not sure how compute
> +  * this for non-cs shaders.  Using the wrong value here can result in
> +  * GPU lockups, but the maximum value seems to always work.
> +  */

Missing 'to' in "I'm not sure how to compute".


> @@ -571,6 +571,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
> pipe_draw_info *info)
>   if (sctx->b.flags)
>   sctx->atoms.s.cache_flush->dirty = true;
>  
> +
> + if (sctx->emit_scratch_reloc) {

Extraneous blank line before the if statement.


> + /* Update the shader state to use the new shader bo. */
> +si_shader_init_pm4_state(shader);

Indentation of the function call looks off.


> +static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
> +{
> +
> + return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader),
> + si_get_scratch_buffer_bytes_per_wave(sctx, 
> sctx->gs_shader),
> + si_get_scratch_buffer_bytes_per_wave(sctx, 
> sctx->vs_shader));
> +}

[...]

> + if (scratch_needed_size > 0) {
> +
> + if (scratch_needed_size > current_scratch_buffer_size) {
> +
> + /* Create a bigger scratch buffer */

Extraneous blank lines at the beginning of the blocks.


> + /* Update the shaders, so they are using the latest scratch.  
> The
> +  * scratch buffer may have been changed since these shaders were
> +  * last used, so we still need to try to update them, even if
> +  * they require scratch buffers smaller than the current size.
> +  */
> + if (si_update_scratch_buffer(sctx, sctx->ps_shader)) {
> + si_pm4_bind_state(sctx, ps, 
> sctx->ps_shader->current->pm4);
> + sctx->emitted.named.ps = NULL;
> + }
> + if (si_update_scratch_buffer(sctx, sctx->gs_shader)) {
> + si_pm4_bind_state(sctx, gs, 
> sctx->gs_shader->current->pm4);
> + sctx->emitted.named.gs = NULL;
> + }
> + if (si_update_scratch_buffer(sctx, sctx->vs_shader)) {
> + si_pm4_bind_state(sctx, vs, 
> sctx->vs_shader->current->pm4);
> + sctx->emitted.named.vs = NULL;
> + }

Setting the emitted states to NULL isn't necessary. I think Marek made
that suggestion based on the assumption that the PM4 states would be
modified instead of replaced by new ones, but I don't think that's
practical.


In patch 4, I spotted a typo: 'beofre'


Other than that, the series is

Reviewed-by: Michel Dänzer 


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] i965/gen8: Include the buffer offset when emitting renderbuffer relocs

2015-01-27 Thread Kristian Høgsberg
On Tue, Jan 27, 2015 at 8:16 PM, Kenneth Graunke  wrote:
> On Tuesday, January 27, 2015 02:32:19 PM Jason Ekstrand wrote:
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88792
>> ---
>>
>> Yes, computing it from the surface state and bo->offset64 is a little
>> strange when we could just be using mt->offset.  However, that's the way
>> it's done on all the other gens so I decided to be consistent.
>
> Yeah...I've never understood why that was done.  It seems silly to me.
> Plus, with the uint64_t casts, it's even uglier.
>
> How about just supplying mt->offset?

I think it's an idiom that's been copied from other relocs where the
reloc value was a little more work to compute.  Eg

   surf[1] = (intel_renderbuffer_get_tile_offsets(irb, &tile_x, &tile_y) +
 mt->bo->offset64 + mt->offset);

and

   drm_intel_bo_emit_reloc(brw->batch.bo,
  brw->wm.base.surf_offset[surf_index] + 4,
  mt->bo,
  surf[1] - mt->bo->offset64,
  I915_GEM_DOMAIN_RENDER,
  I915_GEM_DOMAIN_RENDER);

Kristian

> Either way,
> Reviewed-by: Kenneth Graunke 
> and thanks for fixing this!
>
>>  src/mesa/drivers/dri/i965/gen8_surface_state.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
>> b/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> index 45c35db..16b5a7e 100644
>> --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> @@ -450,7 +450,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
>> drm_intel_bo_emit_reloc(brw->batch.bo,
>> brw->wm.base.surf_offset[surf_index] + 8 * 4,
>> mt->bo,
>> -   0,
>> +*((uint64_t *) &surf[8]) - mt->bo->offset64,
>
>^^^ Tabs?!
>
>> I915_GEM_DOMAIN_RENDER,
>> I915_GEM_DOMAIN_RENDER);
>>  }
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: Add new fast mtx_t mutex type for basic use cases

2015-01-27 Thread Kristian Høgsberg
While modern pthread mutexes are very fast, they still incur a call to an
external DSO and overhead of the generality and features of pthread mutexes.
Most mutexes in mesa only needs lock/unlock, and the idea here is that we can
inline the atomic operation and make the fast case just two intructions.
Mutexes are subtle and finicky to implement, so we carefully copy the
implementation from Ulrich Dreppers well-written and well-reviewed paper:

  "Futexes Are Tricky"
  http://www.akkadia.org/drepper/futex.pdf

We implement "mutex3", which gives us a mutex that has no syscalls on
uncontended lock or unlock.  Further, the uncontended case boils down to a
cmpxchg and an untaken branch and the uncontended unlock is just a locked decr
and an untaken branch.  We use __builtin_expect() to indicate that contention
is unlikely so that gcc will put the contention code out of the main code
flow.

A fast mutex only supports lock/unlock, can't be recursive or used with
condition variables.  We keep the pthread mutex implementation around as
full_mtx_t for the few places where we use condition variables or recursive
locking.  For platforms or compilers where futex and atomics aren't available,
mtx_t falls back to the pthread mutex.

The pthread mutex lock/unlock overhead shows up on benchmarks for CPU bound
applications.  Most CPU bound cases are helped and some of our internal
bind_buffer_object heavy benchmarks gain up to 10%.

Signed-off-by: Kristian Høgsberg 
---
 include/c11/threads_posix.h  | 145 ---
 include/c11/threads_win32.h  |  25 ++
 src/gallium/auxiliary/os/os_thread.h |  10 +--
 src/mesa/main/mtypes.h   |   4 +-
 src/mesa/main/shared.c   |   4 +-
 src/mesa/main/texobj.c   |   4 +-
 src/mesa/main/texobj.h   |   4 +-
 7 files changed, 171 insertions(+), 25 deletions(-)

diff --git a/include/c11/threads_posix.h b/include/c11/threads_posix.h
index f9c165d..3da4742 100644
--- a/include/c11/threads_posix.h
+++ b/include/c11/threads_posix.h
@@ -59,13 +59,15 @@ Configuration macro:
 #endif
 
 // FIXME: temporary non-standard hack to ease transition
-#define _MTX_INITIALIZER_NP PTHREAD_MUTEX_INITIALIZER
+#define _FULL_MTX_INITIALIZER_NP PTHREAD_MUTEX_INITIALIZER
+
+#define _MTX_INITIALIZER_NP { 0 }
 
 /* types */
 typedef pthread_cond_t  cnd_t;
 typedef pthread_t   thrd_t;
 typedef pthread_key_t   tss_t;
-typedef pthread_mutex_t mtx_t;
+typedef pthread_mutex_t full_mtx_t;
 typedef pthread_once_t  once_flag;
 
 
@@ -135,7 +137,7 @@ cnd_signal(cnd_t *cond)
 
 // 7.25.3.5
 static inline int
-cnd_timedwait(cnd_t *cond, mtx_t *mtx, const xtime *xt)
+cnd_timedwait(cnd_t *cond, full_mtx_t *mtx, const xtime *xt)
 {
 struct timespec abs_time;
 int rt;
@@ -148,7 +150,7 @@ cnd_timedwait(cnd_t *cond, mtx_t *mtx, const xtime *xt)
 
 // 7.25.3.6
 static inline int
-cnd_wait(cnd_t *cond, mtx_t *mtx)
+cnd_wait(cnd_t *cond, full_mtx_t *mtx)
 {
 if (!cond || !mtx) return thrd_error;
 pthread_cond_wait(cond, mtx);
@@ -159,7 +161,7 @@ cnd_wait(cnd_t *cond, mtx_t *mtx)
 /* 7.25.4 Mutex functions */
 // 7.25.4.1
 static inline void
-mtx_destroy(mtx_t *mtx)
+full_mtx_destroy(full_mtx_t *mtx)
 {
 assert(mtx);
 pthread_mutex_destroy(mtx);
@@ -167,7 +169,7 @@ mtx_destroy(mtx_t *mtx)
 
 // 7.25.4.2
 static inline int
-mtx_init(mtx_t *mtx, int type)
+full_mtx_init(full_mtx_t *mtx, int type)
 {
 pthread_mutexattr_t attr;
 if (!mtx) return thrd_error;
@@ -191,7 +193,7 @@ mtx_init(mtx_t *mtx, int type)
 
 // 7.25.4.3
 static inline int
-mtx_lock(mtx_t *mtx)
+full_mtx_lock(full_mtx_t *mtx)
 {
 if (!mtx) return thrd_error;
 pthread_mutex_lock(mtx);
@@ -199,14 +201,14 @@ mtx_lock(mtx_t *mtx)
 }
 
 static inline int
-mtx_trylock(mtx_t *mtx);
+full_mtx_trylock(full_mtx_t *mtx);
 
 static inline void
 thrd_yield(void);
 
 // 7.25.4.4
 static inline int
-mtx_timedlock(mtx_t *mtx, const xtime *xt)
+full_mtx_timedlock(full_mtx_t *mtx, const xtime *xt)
 {
 if (!mtx || !xt) return thrd_error;
 {
@@ -222,7 +224,7 @@ mtx_timedlock(mtx_t *mtx, const xtime *xt)
 #else
 time_t expire = time(NULL);
 expire += xt->sec;
-while (mtx_trylock(mtx) != thrd_success) {
+while (full_mtx_trylock(mtx) != thrd_success) {
 time_t now = time(NULL);
 if (expire < now)
 return thrd_busy;
@@ -236,7 +238,7 @@ mtx_timedlock(mtx_t *mtx, const xtime *xt)
 
 // 7.25.4.5
 static inline int
-mtx_trylock(mtx_t *mtx)
+full_mtx_trylock(full_mtx_t *mtx)
 {
 if (!mtx) return thrd_error;
 return (pthread_mutex_trylock(mtx) == 0) ? thrd_success : thrd_busy;
@@ -244,13 +246,132 @@ mtx_trylock(mtx_t *mtx)
 
 // 7.25.4.6
 static inline int
-mtx_unlock(mtx_t *mtx)
+full_mtx_unlock(full_mtx_t *mtx)
 {
 if (!mtx) return thrd_error;
 pthread_mutex_unlock(mtx);
 return thrd_success;
 }
 
+#if def

Re: [Mesa-dev] [PATCH 0/7] i965 L3 caching and pull constant improvements.

2015-01-27 Thread Kenneth Graunke
On Sunday, January 18, 2015 01:04:02 AM Francisco Jerez wrote:
> This is the first part of a series meant to improve our usage of the L3 cache.
> Currently it's far from ideal since the following objects aren't taking any
> advantage of it:
>  - Pull constants (i.e. UBOs and demoted uniforms)
>  - Buffer textures
>  - Shader scratch space (i.e. register spills and fills)
>  - Atomic counters
>  - (Soon) Images
> 
> This first series addresses the first two issues.  Fixing the last three is
> going to be a bit more difficult because we need to modify the partitioning of
> the L3 cache in order to increase the number of ways assigned to the DC, which
> happens to be zero on boot until Gen8.  That's likely to require kernel
> changes because we don't have any extremely satisfactory API to change that
> from userspace right now.
> 
> The first patch in the series sets the MOCS L3 cacheability bit in the surface
> state structure for buffers so the mentioned memory objects (except the shader
> scratch space that gets its MOCS from elsewhere) have a chance of getting
> cached in L3.
> 
> The fourth patch in the series switches to using the constant cache (which,
> unlike the data cache that was used years ago before we started using the
> sampler, is cached on L3 with the default partitioning on all gens) for
> uniform pull constants loads.  The overall performance numbers I've collected
> are included in the commit message of the same patch for future reference.
> Most of it points at the constant cache being faster than the sampler in a
> number of cases (assuming the L3 caching settings are correct), it's also
> likely to alleviate some cache thrashing caused by the competition with
> textures for the L1/L2 sampler caches, and it allows fetching up to eight
> consecutive owords (128B) with just one message.
> 
> The sixth patch enables 4 oword loads because they're basically for free and
> they avoid some of the shortcomings of the 1 and 2 oword messages (see the
> commit message for more details).  I'll have a look into enabling 8 oword
> loads but it's going to require an analysis pass to avoid wasting bandwidth
> and increasing the register pressure unnecessarily when the shader doesn't
> actually need as many constants.
> 
> We could do something similar for non-uniform offset pull constant loads and
> for both kinds of pull constant loads on the vec4 back-end, but I don't have
> enough performance data to support that yet.

Hi Curro!

Technically, I believe we /are/ taking advantage of the L3 today - the sampler
should be part of the "All Clients" and "Read Only Client Pool" portions of the
L3.  I believe the data port's "Constant Cache" is part of the same L3 region.
However, the sampler has an additional L1/L2 cache.

When you say you "don't have enough performance data" to support doing this in
the vec4 backend, or for non-uniform offset pull loads, do you mean that you
tried it and it wasn't useful, or you just haven't tried it yet?

In my experience, the VS matters a *lot* - skinning shaders tend to use large
arrays of matrices, which get demoted to pull constants.  For example, I
observed huge speedups in GLBenchmark 2.7 EgyptHD (commit 5d8e246ac86b4a94,
in the VS backend), GLBenchmark 2.1 PRO (commit 259b65e2e79), and Trine
(commit 04f5b2f4e454d6 - in the ARBvp backend) by moving from the data cache
to the sampler.

I'd love to see data for applying your new approach in the VS backend.

--Ken

signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965: Add brw_state_reloc() for adding relocations to batch state objects

2015-01-27 Thread Kristian Høgsberg
We have intel_batchbuffer_emit_reloc(), which combines calculating and writing
out the right value to the batch buffer with emitting the relocation.  It
determines the relocation offset from the current batch buffer position and
calculates the value to write in the batch buffer from the bo presumed offset
and the passed in delta.  This commit introduces brw_state_reloc() and
brw_state_reloc64(), which provide similar convenience for batch state
relocations.

Signed-off-by: Kristian Høgsberg 
---
 src/mesa/drivers/dri/i965/brw_cc.c| 11 +---
 src/mesa/drivers/dri/i965/brw_clip_state.c| 11 +---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 13 ++---
 src/mesa/drivers/dri/i965/brw_sf_state.c  | 20 +++
 src/mesa/drivers/dri/i965/brw_state.h | 10 
 src/mesa/drivers/dri/i965/brw_state_batch.c   | 44 +++
 src/mesa/drivers/dri/i965/brw_vs_state.c  | 32 +++
 src/mesa/drivers/dri/i965/brw_wm_state.c  | 42 --
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c  | 69 ---
 src/mesa/drivers/dri/i965/gen6_blorp.cpp  | 13 +
 src/mesa/drivers/dri/i965/gen6_surface_state.c| 12 +---
 src/mesa/drivers/dri/i965/gen7_blorp.cpp  | 13 +
 src/mesa/drivers/dri/i965/gen7_wm_surface_state.c | 52 ++---
 src/mesa/drivers/dri/i965/gen8_surface_state.c| 59 +--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c |  3 -
 15 files changed, 153 insertions(+), 251 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cc.c 
b/src/mesa/drivers/dri/i965/brw_cc.c
index 02f5a3a..388e6d0 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -227,17 +227,10 @@ static void upload_cc_unit(struct brw_context *brw)
   cc->cc5.statistics_enable = 1;
 
/* BRW_NEW_CC_VP */
-   cc->cc4.cc_viewport_state_offset = (brw->batch.bo->offset64 +
-  brw->cc.vp_offset) >> 5; /* reloc */
+   brw_state_reloc(brw, &cc->cc4, brw->batch.bo,
+   I915_GEM_DOMAIN_INSTRUCTION, 0, brw->cc.vp_offset);
 
brw->state.dirty.brw |= BRW_NEW_GEN4_UNIT_STATE;
-
-   /* Emit CC viewport relocation */
-   drm_intel_bo_emit_reloc(brw->batch.bo,
-  (brw->cc.state_offset +
-   offsetof(struct brw_cc_unit_state, cc4)),
-  brw->batch.bo, brw->cc.vp_offset,
-  I915_GEM_DOMAIN_INSTRUCTION, 0);
 }
 
 const struct brw_tracked_state brw_cc_unit = {
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c 
b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 09a2523..4bfa06f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -131,15 +131,8 @@ brw_upload_clip_unit(struct brw_context *brw)
ctx->ViewportArray[0].Height == (float) fb->Height)
{
   clip->clip5.guard_band_enable = 1;
-  clip->clip6.clipper_viewport_state_ptr =
- (brw->batch.bo->offset64 + brw->clip.vp_offset) >> 5;
-
-  /* emit clip viewport relocation */
-  drm_intel_bo_emit_reloc(brw->batch.bo,
-  (brw->clip.state_offset +
-   offsetof(struct brw_clip_unit_state, clip6)),
-  brw->batch.bo, brw->clip.vp_offset,
-  I915_GEM_DOMAIN_INSTRUCTION, 0);
+  brw_state_reloc(brw, &clip->clip6, brw->batch.bo,
+  I915_GEM_DOMAIN_INSTRUCTION, 0, brw->clip.vp_offset);
}
 
/* _NEW_TRANSFORM */
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c 
b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 0fe0853..298a09a 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -99,14 +99,11 @@ brw_emit_sampler_state(struct brw_context *brw,
SET_FIELD(mag_filter, BRW_SAMPLER_MAG_FILTER) |
SET_FIELD(min_filter, BRW_SAMPLER_MIN_FILTER);
 
-   ss[2] = border_color_offset;
-   if (brw->gen < 6) {
-  ss[2] += brw->batch.bo->offset64; /* reloc */
-  drm_intel_bo_emit_reloc(brw->batch.bo,
-  batch_offset_for_sampler_state + 8,
-  brw->batch.bo, border_color_offset,
-  I915_GEM_DOMAIN_SAMPLER, 0);
-   }
+   if (brw->gen < 6)
+  brw_state_reloc(brw, &ss[2], brw->batch.bo,
+  I915_GEM_DOMAIN_SAMPLER, 0, border_color_offset);
+   else
+  ss[2] = border_color_offset;
 
ss[3] = SET_FIELD(max_anisotropy, BRW_SAMPLER_MAX_ANISOTROPY) |
SET_FIELD(address_rounding, BRW_SAMPLER_ADDRESS_ROUNDING);
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c 
b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 75d6451..227c99a 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -126,7 +126,6 @@ static void up

Re: [Mesa-dev] [PATCH] mesa: Add new fast mtx_t mutex type for basic use cases

2015-01-27 Thread Jonathan Gray
Including  under __GNUC__ is going to break the build of
Mesa on everything compiled with clang/gcc that isn't Linux.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev