[Mesa-dev] [Bug 78298] Don't enforce gallium-pipe shared library when enable_xa is set to yes

2014-05-06 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=78298

NicolasChauvet  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |NOTABUG

--- Comment #3 from NicolasChauvet  ---
The issue was sorted out as the gallium-pipe was incorrectly distributed under
the OpenCL sub-package, which was conditionalized only for x86 (not for ARM). So
I had incorrectly assumed the gallium-pipe was not built on x86.

Thanks for the answer.

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965: Relax accumulator dependency scheduling on Gen < 6

2014-05-06 Thread Iago Toral Quiroga
Many instructions implicitly update the accumulator on Gen < 6. The instruction
scheduling code just calls add_barrier_deps() for each accumulator access on
these platforms, but a large class of operations don't actually update the
accumulator -- mostly move and logical instructions. Teaching the scheduling
code about this would allow more flexibility to schedule instructions.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77740
---
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 84 +++---
 src/mesa/drivers/dri/i965/brw_shader.cpp   |  7 ++
 src/mesa/drivers/dri/i965/brw_shader.h |  1 +
 3 files changed, 33 insertions(+), 59 deletions(-)

I tested this on IvyBridge and IronLake with a piglit test run but it would be
nice if someone could test on other gens too.

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 8cc6908..6f8f405 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -742,8 +742,6 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
 void
 fs_instruction_scheduler::calculate_deps()
 {
-   const bool gen6plus = v->brw->gen >= 6;
-
/* Pre-register-allocation, this tracks the last write per VGRF (so
 * different reg_offsets within it can interfere when they shouldn't).
 * After register allocation, reg_offsets are gone and we track individual
@@ -803,7 +801,7 @@ fs_instruction_scheduler::calculate_deps()
 } else {
add_dep(last_fixed_grf_write, n);
 }
- } else if (inst->src[i].is_accumulator() && gen6plus) {
+ } else if (inst->src[i].is_accumulator()) {
 add_dep(last_accumulator_write, n);
 } else if (inst->src[i].file != BAD_FILE &&
inst->src[i].file != IMM &&
@@ -828,11 +826,7 @@ fs_instruction_scheduler::calculate_deps()
   }
 
   if (inst->reads_accumulator_implicitly()) {
- if (gen6plus) {
-add_dep(last_accumulator_write, n);
- } else {
-add_barrier_deps(n);
- }
+ add_dep(last_accumulator_write, n);
   }
 
   /* write-after-write deps. */
@@ -867,7 +861,7 @@ fs_instruction_scheduler::calculate_deps()
  } else {
 last_fixed_grf_write = n;
  }
-  } else if (inst->dst.is_accumulator() && gen6plus) {
+  } else if (inst->dst.is_accumulator()) {
  add_dep(last_accumulator_write, n);
  last_accumulator_write = n;
   } else if (inst->dst.file != BAD_FILE &&
@@ -887,13 +881,10 @@ fs_instruction_scheduler::calculate_deps()
 last_conditional_mod[inst->flag_subreg] = n;
   }
 
-  if (inst->writes_accumulator) {
- if (gen6plus) {
-add_dep(last_accumulator_write, n);
-last_accumulator_write = n;
- } else {
-add_barrier_deps(n);
- }
+  if (inst->writes_accumulator_implicitly(v->brw->gen) &&
+  !inst->dst.is_accumulator()) {
+ add_dep(last_accumulator_write, n);
+ last_accumulator_write = n;
   }
}
 
@@ -933,7 +924,7 @@ fs_instruction_scheduler::calculate_deps()
 } else {
add_dep(n, last_fixed_grf_write);
 }
- } else if (inst->src[i].is_accumulator() && gen6plus) {
+ } else if (inst->src[i].is_accumulator()) {
 add_dep(n, last_accumulator_write);
  } else if (inst->src[i].file != BAD_FILE &&
inst->src[i].file != IMM &&
@@ -958,11 +949,7 @@ fs_instruction_scheduler::calculate_deps()
   }
 
   if (inst->reads_accumulator_implicitly()) {
- if (gen6plus) {
-add_dep(n, last_accumulator_write);
- } else {
-add_barrier_deps(n);
- }
+ add_dep(n, last_accumulator_write);
   }
 
   /* Update the things this instruction wrote, so earlier reads
@@ -996,7 +983,7 @@ fs_instruction_scheduler::calculate_deps()
  } else {
 last_fixed_grf_write = n;
  }
-  } else if (inst->dst.is_accumulator() && gen6plus) {
+  } else if (inst->dst.is_accumulator()) {
  last_accumulator_write = n;
   } else if (inst->dst.file != BAD_FILE &&
  !inst->dst.is_null()) {
@@ -1013,12 +1000,8 @@ fs_instruction_scheduler::calculate_deps()
 last_conditional_mod[inst->flag_subreg] = n;
   }
 
-  if (inst->writes_accumulator) {
- if (gen6plus) {
-last_accumulator_write = n;
- } else {
-add_barrier_deps(n);
- }
+  if (inst->writes_accumulator_implicitly(v->brw->gen)) {
+ last_accumulator_write = n;
   }
}
 }
@@ -1026,8 +1009,6 @@ fs_instruction_scheduler::calculate_deps()
 void
 vec4_instruction_scheduler::calculate_deps()
 {
-   const bool gen6plus = v->brw->gen >= 6;
-
schedule_no

[Mesa-dev] [PATCH] mesa: pass target through to driver when choosing texture format

2014-05-06 Thread Ilia Mirkin
This only matters for TextureView where the texObj's target has not been
set yet, in all other instances, texObj->target should be the same as
the passed-in target parameter.

Signed-off-by: Ilia Mirkin 
---

I ran into an assert in mesa/st when choosing the texture format because the
target was 0. (While trying to implement texture views.) Not sure why it cares
about the target, but this seems correct.

 src/mesa/main/teximage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index c7f301c..845ba80 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -3024,7 +3024,7 @@ _mesa_choose_texture_format(struct gl_context *ctx,
}
 
/* choose format from scratch */
-   f = ctx->Driver.ChooseTextureFormat(ctx, texObj->Target, internalFormat,
+   f = ctx->Driver.ChooseTextureFormat(ctx, target, internalFormat,
format, type);
ASSERT(f != MESA_FORMAT_NONE);
return f;
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported

2014-05-06 Thread Ilia Mirkin
ping for this and 1/2 (which just adds the new cap)

On Mon, Apr 28, 2014 at 7:30 PM, Ilia Mirkin  wrote:
> Signed-off-by: Ilia Mirkin 
> ---
>
> The handling of the 4 offsets is less-than-pretty. I had an alternate version
> that created a new ir_dereference_array object and ran ->accept on that. This
> worked as well, but for each offset it would create a separate new array, and
> then deref just one item out of it. This seems incredibly wasteful. The
> slightly open-coded version of that seems reasonable and uses the same array.
>
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 
> ++
>  1 file changed, 41 insertions(+), 14 deletions(-)
>
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index d1c3856..20d5e99 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -87,8 +87,7 @@ extern "C" {
>   */
>  #define MAX_ARRAYS256
>
> -/* if we support a native gallium TG4 with the ability to take 4 texoffsets 
> then bump this */
> -#define MAX_GLSL_TEXTURE_OFFSET 1
> +#define MAX_GLSL_TEXTURE_OFFSET 4
>
>  class st_src_reg;
>  class st_dst_reg;
> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
>  void
>  glsl_to_tgsi_visitor::visit(ir_texture *ir)
>  {
> -   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
> offset, sample_index, component;
> +   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
> st_dst_reg result_dst, coord_dst, cube_sc_dst;
> glsl_to_tgsi_instruction *inst = NULL;
> unsigned opcode = TGSI_OPCODE_NOP;
> const glsl_type *sampler_type = ir->sampler->type;
> bool is_cube_array = false;
> +   unsigned i;
>
> /* if we are a cube array sampler */
> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 : 
> TGSI_OPCODE_TEX;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txb:
> @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txl:
> @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txd:
> @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>dy = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txs:
> @@ -2814,7 +2814,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txf_ms:
> @@ -2828,9 +2828,17 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>component = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - /* this should have been lowered */
> - assert(ir->offset->type->base_type != GLSL_TYPE_ARRAY);
> - offset = this->result;
> + if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) {
> +const glsl_type *elt_type = ir->offset->type->fields.array;
> +for (i = 0; i < ir->offset->type->length; i++) {
> +   offset[i] = this->result;
> +   offset[i].index += i * type_size(elt_type);
> +   offset[i].type = elt_type->base_type;
> +   offset[i].swizzle = 
> swizzle_for_size(elt_type->vector_elements);
> +}
> + } else {
> +offset[0] = this->result;
> + }
>}
>break;
> case ir_lod:
> @@ -2960,8 +2968,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>this->prog);
>
> if (ir->offset) {
> -  inst->tex_offset_num_offset = 1;
> -  inst->tex_offsets[0] = offset;
> +  for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != 
> PROGRAM_UNDEFINED; i++)
> + inst->tex_offsets[i] = offset[i];
> +  inst->tex_offset_num_offset = i;
> }
>
> switch (sampler_type->sampler_dimensionality) {
> @@ -4479,6 +4488,8 @@ translate_tex_offset(struct st_translate *t,
>  {
> struct tgsi_texture_offset offset;
> struct ureg_src imm_src;
> +

Re: [Mesa-dev] [PATCH 00/21] deferred and threaded glCompileShader

2014-05-06 Thread Chia-I Wu
On Tue, May 6, 2014 at 5:27 AM, Fredrik Höglund  wrote:
> On Tuesday 22 April 2014, Chia-I Wu wrote:
>> Hi list,
>>
>> This series adds a thread pool to the GLSL compiler, and a drirc option to
>> defer glCompileShader calls to the pool.  The goal is to reduce the start-up
>> time of applications that are aware of this feature.  That is, applications
>> that compile shaders first and check the compile status later.
>>
>> I do not have numbers from real applications yet.  But when compiling a
>> set of 2882 shaders extracted from some trace file, with everything else
>> idled, the time it takes is
>>
>>   8 threads: 17.8s
>>   4 threads: 20.3s
>>   2 threads: 31.2s
>>   1 threads: 58.0s
>>   no thread pool: 54.5
>>
>> on a quad core system.
>>
>> Patches 1-4 fix potential races in the GLSL compiler.  As the compiler is
>> already shared by all contexts, these patches could be desirable even without
>> the thread pool that I am going to add.
>>
>> Patches 5-18 add true GL_DEBUG_OUTPUT_SYNCHRONOUS support to the KHR_debug
>> code.  All except patch 18 are clean-ups.  Patch 18 adds a mutex to protect
>> gl_debug_state.
>>
>> Patch 19 defines a simple API to create and work with thread pools, as well 
>> as
>> a test for the API.
>>
>> Patch 20 adds the singleton GLSL thread pool and allows glCompileShader to be
>> deferred to the pool.  This feature needs to be explicitly enabled with
>> _mesa_enable_glsl_threadpool.
>>
>> Patch 21 adds a drirc option to enable the thread pool.  The idea is that 
>> only
>> applications that can benefit from it will enable it.
>
> If applications are supposed to enable this behavior themselves I think
> it would be better to add an extension that lets them do:
>
> glEnable(GL_DEFERRED_SHADER_COMPILATION_EXT);
glHint might be better.  But it is hard to define the behavior of
deferred shader compilation without restricting what an implementation
may do.  And for a driver that is able to determine GL_COMPILE_STATUS
quickly and defer only the optimization passes, threaded compile may
be always on.


>
> The drirc option can still be useful for overriding the default though.
>
> Fredrik
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 02/21] glsl: protect locale_t with a mutex

2014-05-06 Thread Petri Latvala

On 05/06/2014 01:04 AM, Kenneth Graunke wrote:

On 05/05/2014 10:01 AM, Ian Romanick wrote:


The problem I see with that is we don't know what order constructors on
global objects will occur.  Are we sure the constructor will be invoked
before we need the locale now and in the future?

These functions are used during lexing to read floating point
literals...without breaking on locales where the decimal point is a comma.

All constructors have definitely been run by that point, and I can't
imagine we'll ever need to call locale-independent strtof/strtod from
other constructors.




The order of constructors in a single TU is well defined. The usual 
pattern for this kind of initialization is to have the initializer 
object in the header as static:



In strtod.h:

// Helper whose constructor sets up `loc` as a side effect of static
// initialization; it is meant to be instantiated once per translation unit.
struct loc_initializer
{
   // The constructor must carry the struct's own name; the previous spelling
   // `_Initializer()` was not a constructor (and used a reserved identifier).
   loc_initializer()
   {
      // Only initialize when `loc` has not been set up yet, so constructing
      // several instances does not re-run initialize_it().
      if (!loc) {
         loc = initialize_it();
      }
   }
};

static loc_initializer loc_init;


Any global object constructor that might use loc will have to #include 
strtod.h, therefore loc_init will appear first in that TU, and will 
initialize first.


A real-world example would be std::_Init that initializes std::cout and 
its pals.



--
Petri Latvala

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 02/21] glsl: protect locale_t with a mutex

2014-05-06 Thread Chia-I Wu
On Tue, May 6, 2014 at 6:04 AM, Kenneth Graunke  wrote:
> On 05/05/2014 10:01 AM, Ian Romanick wrote:
>> On 05/04/2014 02:14 PM, Chia-I Wu wrote:
>>> On Sat, May 3, 2014 at 1:52 AM, Ian Romanick  wrote:
 On 04/22/2014 01:58 AM, Chia-I Wu wrote:
> There may be two contexts compiling shaders at the same time.  locale_t 
> needs
> to be protected.

 Rather than calling glsl_initialize_strtod from other places in the
 compiler, it seems better to use call_once from the strtof and strtod
 functions.
>>> How about having a static object to call newlocale()/freelocale() in
>>> its constructor/destructor?  It will impose no overhead on
>>> glsl_strtod(), at the expense of little wasted memory when the
>>> applications do not compile shaders.
>>
>> The problem I see with that is we don't know what order constructors on
>> global objects will occur.  Are we sure the constructor will be invoked
>> before we need the locale now and in the future?
>
> These functions are used during lexing to read floating point
> literals...without breaking on locales where the decimal point is a comma.
>
> All constructors have definitely been run by that point, and I can't
> imagine we'll ever need to call locale-independent strtof/strtod from
> other constructors.
They are also used by the IR reader.  But the IR reader is no longer
used to parse built-in functions or so.  We should be good as you
said.

Not that we need this, and I am not saying this with 100% confidence, but
from my research on stackoverflow (instead of the standard, sorry), it
is the initialization order of static objects in different
translation units that is undefined.  If this locale_t_initializer
was defined statically in strtod.cpp, other static constructors
calling glsl_strtod() would actually work because, by the time
glsl_strtod() is called, locale_t_initializer is guaranteed to be
initialized.

Anyway, I should add some assert() just in case.

>
> --Ken
>



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] glsl_to_tgsi: remove unnecessary dead code elimination pass

2014-05-06 Thread Marek Olšák
Reviewed-by: Marek Olšák 

Marek

On Tue, May 6, 2014 at 5:40 AM, Bryan Cain  wrote:
> With the more advanced dead code elimination pass already being run,
> eliminate_dead_code was making no difference in instruction count, and had
> an undesirable O(n^2) runtime. So remove it and rename
> eliminate_dead_code_advanced to eliminate_dead_code.
> ---
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp |   50 
> +++-
>  1 file changed, 5 insertions(+), 45 deletions(-)
>
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index 6eb6c8a..b0e0782 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -460,8 +460,7 @@ public:
> int get_last_temp_write(int index);
>
> void copy_propagate(void);
> -   void eliminate_dead_code(void);
> -   int eliminate_dead_code_advanced(void);
> +   int eliminate_dead_code(void);
> void merge_registers(void);
> void renumber_registers(void);
>
> @@ -3663,7 +3662,8 @@ glsl_to_tgsi_visitor::copy_propagate(void)
>  }
>
>  /*
> - * Tracks available PROGRAM_TEMPORARY registers for dead code elimination.
> + * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for 
> dead
> + * code elimination.
>   *
>   * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
>   * will occur.  As an example, a TXP production after copy propagation but
> @@ -3676,48 +3676,9 @@ glsl_to_tgsi_visitor::copy_propagate(void)
>   * and after this pass:
>   *
>   * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D;
> - *
> - * FIXME: assumes that all functions are inlined (no support for 
> BGNSUB/ENDSUB)
> - * FIXME: doesn't eliminate all dead code inside of loops; it steps around 
> them
> - */
> -void
> -glsl_to_tgsi_visitor::eliminate_dead_code(void)
> -{
> -   int i;
> -
> -   for (i=0; i < this->next_temp; i++) {
> -  int last_read = get_last_temp_read(i);
> -  int j = 0;
> -
> -  foreach_list_safe(node, &this->instructions) {
> - glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *) node;
> -
> - if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == i &&
> - j > last_read)
> - {
> -inst->remove();
> -delete inst;
> - }
> -
> - j++;
> -  }
> -   }
> -}
> -
> -/*
> - * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for 
> dead
> - * code elimination.  This is less primitive than eliminate_dead_code(), as 
> it
> - * is per-channel and can detect consecutive writes without a read between 
> them
> - * as dead code.  However, there is some dead code that can be eliminated by
> - * eliminate_dead_code() but not this function - for example, this function
> - * cannot eliminate an instruction writing to a register that is never read 
> and
> - * is the only instruction writing to that register.
> - *
> - * The glsl_to_tgsi_visitor lazily produces code assuming that this pass
> - * will occur.
>   */
>  int
> -glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
> +glsl_to_tgsi_visitor::eliminate_dead_code(void)
>  {
> glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx,
>   
> glsl_to_tgsi_instruction *,
> @@ -5245,9 +5206,8 @@ get_mesa_program(struct gl_context *ctx,
> /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. 
> */
> v->simplify_cmp();
> v->copy_propagate();
> -   while (v->eliminate_dead_code_advanced());
> +   while (v->eliminate_dead_code());
>
> -   v->eliminate_dead_code();
> v->merge_registers();
> v->renumber_registers();
>
> --
> 1.7.9.5
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Compiling Mesa/softpipe for Windows

2014-05-06 Thread Jose Fonseca
Cross-compiling Mesa for windows on linux with mingw is dead easy:

  - install mingw-w64 C/C++ cross-compilers (any recent linux distro already 
has the packages)

  - run

 scons platform=windows libgl-gdi 

The opengl32.dll drop-in replacement will be in 
mesa/build/windows-x86-debug/gallium/targets/libgl-gdi/opengl32.dll

If you want a release build add "build=release" to the scons command line.

llvmpipe is faster, but its build is more complex (you'll need to build LLVM 
for mingw, but that too can be done with cross-compilers.)

Jose

- Original Message -
> Hello,
> 
> I’m trying to get OpenGL 3.3 working on Windows via the softpipe driver. Can
> somebody tell me the steps to successfully (cross) compile it with either
> MinGW or Visual Studio. I seem to always run into problems.
> 
> Thanks
> André
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=NMr9uy2iTjWVixC0wOcYCWEIYhfo80qKwRgdodpoDzA%3D%0A&m=J1yzqfqqAXFRaco4DEX3lwmn2jsACsOex%2FrQfGJ6LL0%3D%0A&s=fc09b7ceb8ab879f7f58d38f43b2f6de9bbc6afc9fd04cbd0306afa1a24c75a0
> 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/2] i965: Simulate MAD opcode with gen<6

2014-05-06 Thread Juha-Pekka Heikkila
These patches allow the MAD opcode to be used with pre-gen6 hardware.
Instead of failing to emit MAD, the visitors emit a MUL followed by an ADD
to simulate it.

I tried this with piglit on ILK (gen5) and did not see any regressions.

Juha-Pekka Heikkila (2):
  i965/fs: Simulate MAD opcode with gen<6
  i965/vec4: Simulate MAD opcode for gen<6

 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 15 ++
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 41 ++
 2 files changed, 39 insertions(+), 17 deletions(-)

-- 
1.8.1.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] i965/fs: Simulate MAD opcode with gen<6

2014-05-06 Thread Juha-Pekka Heikkila
Signed-off-by: Juha-Pekka Heikkila 
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d2dc5fa..22ca528 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -293,10 +293,6 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
 bool
 fs_visitor::try_emit_mad(ir_expression *ir)
 {
-   /* 3-src instructions were introduced in gen6. */
-   if (brw->gen < 6)
-  return false;
-
/* MAD can only handle floating-point data. */
if (ir->type != glsl_type::float_type)
   return false;
@@ -327,7 +323,16 @@ fs_visitor::try_emit_mad(ir_expression *ir)
fs_reg src2 = this->result;
 
this->result = fs_reg(this, ir->type);
-   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
+
+   /* 3-src instructions were introduced in gen6. */
+   if (brw->gen < 6) {
+  fs_reg temp = fs_reg(this, glsl_type::float_type);
+
+  emit(MUL(temp, src1, src2));
+  emit(ADD(this->result, src0, temp));
+   } else {
+  emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
+   }
 
return true;
 }
-- 
1.8.1.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] i965/vec4: Simulate MAD opcode for gen<6

2014-05-06 Thread Juha-Pekka Heikkila
Signed-off-by: Juha-Pekka Heikkila 
---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 41 ++
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 7bad81c..506a4b2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1092,10 +1092,6 @@ vec4_visitor::try_emit_sat(ir_expression *ir)
 bool
 vec4_visitor::try_emit_mad(ir_expression *ir)
 {
-   /* 3-src instructions were introduced in gen6. */
-   if (brw->gen < 6)
-  return false;
-
/* MAD can only handle floating-point data. */
if (ir->type->base_type != GLSL_TYPE_FLOAT)
   return false;
@@ -,17 +1107,38 @@ vec4_visitor::try_emit_mad(ir_expression *ir)
  return false;
}
 
-   nonmul->accept(this);
-   src_reg src0 = fix_3src_operand(this->result);
+   /* 3-src instructions were introduced in gen6. */
+   if (brw->gen < 6) {
+  nonmul->accept(this);
+  src_reg src0(this->result);
 
-   mul->operands[0]->accept(this);
-   src_reg src1 = fix_3src_operand(this->result);
+  mul->operands[0]->accept(this);
+  src_reg src1(this->result);
 
-   mul->operands[1]->accept(this);
-   src_reg src2 = fix_3src_operand(this->result);
+  mul->operands[1]->accept(this);
+  src_reg src2(this->result);
 
-   this->result = src_reg(this, ir->type);
-   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
+  this->result = src_reg(this, ir->type);
+
+  dst_reg mul_destination = dst_reg(this, glsl_type::float_type);
+  mul_destination.writemask = dst_reg(this->result).writemask;
+
+  emit(MUL(mul_destination, src1, src2));
+  emit(ADD(dst_reg(this->result), src0, src_reg(mul_destination)));
+   } else {
+  nonmul->accept(this);
+  src_reg src0 = fix_3src_operand(this->result);
+
+  mul->operands[0]->accept(this);
+  src_reg src1 = fix_3src_operand(this->result);
+
+  mul->operands[1]->accept(this);
+  src_reg src2 = fix_3src_operand(this->result);
+
+  this->result = src_reg(this, ir->type);
+
+  emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
+   }
 
return true;
 }
-- 
1.8.1.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] GL_OES_texture_float and GL_OES_texture_half_float support

2014-05-06 Thread Kevin Rogovin
Add support for GLES2 extensions for floating point and half
floating point textures (GL_OES_texture_float, GL_OES_texture_half_float,
GL_OES_texture_float_linear and GL_OES_texture_half_float_linear).

---
 src/mesa/main/extensions.c | 12 +
 src/mesa/main/glformats.c  | 27 
 src/mesa/main/pack.c   | 17 +
 src/mesa/main/teximage.c   | 61 ++
 4 files changed, 117 insertions(+)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index c2ff7e3..58a5a51 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -360,6 +360,18 @@ static const struct extension extension_table[] = {
{ "GL_SGIS_texture_lod",o(dummy_true),  
GLL,1997 },
{ "GL_SUN_multi_draw_arrays",   o(dummy_true),  
GLL,1999 },
 
+   /*
+ TODO:
+  - rather than have an all or nothing approach for floating point 
textures,
+allow for driver to specify what parts of floating point texture 
funtionality
+is supported: float/half-float and filtering for each. 
+*/
+   { "GL_OES_texture_float",   o(ARB_texture_float),   
ES2|ES3,2005 },
+   { "GL_OES_texture_half_float",  o(ARB_texture_float),   
ES2|ES3,2005 },
+   { "GL_OES_texture_float_linear",o(ARB_texture_float),   
ES2|ES3,2005 },
+   { "GL_OES_texture_half_float_linear",   o(ARB_texture_float),   
ES2|ES3,2005 },
+   
+
{ 0, 0, 0, 0 },
 };
 
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 9bb341c..be728f4 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -93,6 +93,7 @@ _mesa_sizeof_type(GLenum type)
case GL_DOUBLE:
   return sizeof(GLdouble);
case GL_HALF_FLOAT_ARB:
+   case GL_HALF_FLOAT_OES:
   return sizeof(GLhalfARB);
case GL_FIXED:
   return sizeof(GLfixed);
@@ -125,6 +126,7 @@ _mesa_sizeof_packed_type(GLenum type)
case GL_INT:
   return sizeof(GLint);
case GL_HALF_FLOAT_ARB:
+   case GL_HALF_FLOAT_OES:
   return sizeof(GLhalfARB);
case GL_FLOAT:
   return sizeof(GLfloat);
@@ -243,6 +245,7 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type)
case GL_FLOAT:
   return comps * sizeof(GLfloat);
case GL_HALF_FLOAT_ARB:
+   case GL_HALF_FLOAT_OES:
   return comps * sizeof(GLhalfARB);
case GL_UNSIGNED_BYTE_3_3_2:
case GL_UNSIGNED_BYTE_2_3_3_REV:
@@ -1365,6 +1368,11 @@ _mesa_error_check_format_and_type(const struct 
gl_context *ctx,
 case GL_FLOAT:
 case GL_HALF_FLOAT:
return GL_NO_ERROR;
+case GL_HALF_FLOAT_OES:
+   return (format==GL_LUMINANCE || 
+   format==GL_LUMINANCE_ALPHA || 
+   format==GL_ALPHA) 
+  ? GL_NO_ERROR: GL_INVALID_ENUM;
 default:
return GL_INVALID_ENUM;
  }
@@ -1401,6 +1409,9 @@ _mesa_error_check_format_and_type(const struct gl_context 
*ctx,
 case GL_UNSIGNED_SHORT_5_6_5_REV:
 case GL_HALF_FLOAT:
return GL_NO_ERROR;
+case GL_HALF_FLOAT_OES:
+   return (format==GL_RGB) 
+  ? GL_NO_ERROR: GL_INVALID_ENUM;
 case GL_UNSIGNED_INT_2_10_10_10_REV:
/* OK by GL_EXT_texture_type_2_10_10_10_REV */
return (ctx->API == API_OPENGLES2)
@@ -1454,6 +1465,9 @@ _mesa_error_check_format_and_type(const struct gl_context 
*ctx,
 case GL_UNSIGNED_INT_2_10_10_10_REV:
 case GL_HALF_FLOAT:
return GL_NO_ERROR;
+case GL_HALF_FLOAT_OES:
+   return (format==GL_RGBA) 
+  ? GL_NO_ERROR: GL_INVALID_ENUM;
 default:
return GL_INVALID_ENUM;
  }
@@ -1666,6 +1680,7 @@ _mesa_es_error_check_format_and_type(GLenum format, 
GLenum type,
return type_valid ? GL_NO_ERROR : GL_INVALID_OPERATION;
 }
 
+ 
 
 /**
  * Do error checking of format/type combinations for OpenGL ES 3
@@ -1676,6 +1691,18 @@ GLenum
 _mesa_es3_error_check_format_and_type(GLenum format, GLenum type,
   GLenum internalFormat)
 {
+   /*
+ special case checking for support the GLES2 extension
+ GL_OES_texture_float and GL_OES_texture_half_float
+*/
+   if(format==internalFormat
+  && (type==GL_HALF_FLOAT_OES || type==GL_FLOAT) 
+  && (format==GL_RGBA || format==GL_RGB ||
+  format==GL_LUMINANCE || format==GL_ALPHA ||
+  format==GL_LUMINANCE_ALPHA) ) {
+  return GL_NO_ERROR;
+   }
+   
switch (format) {
case GL_RGBA:
   switch (type) {
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index 1df6568.

Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported

2014-05-06 Thread Roland Scheidegger
Looks good to me.
Does that mean if also the GATHER_SM5 cap is supported you have to
support 4 independent, non-constant offsets?
Would it make sense to reorder the caps so the gather stuff is all
together (now 5 cap bits just for this...)?

Roland

Am 29.04.2014 01:30, schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin 
> ---
> 
> The handling of the 4 offsets is less-than-pretty. I had an alternate version
> that created a new ir_dereference_array object and ran ->accept on that. This
> worked as well, but for each offset it would create a separate new array, and
> then deref just one item out of it. This seems incredibly wasteful. The
> slightly open-coded version of that seems reasonable and uses the same array.
> 
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 
> ++
>  1 file changed, 41 insertions(+), 14 deletions(-)
> 
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index d1c3856..20d5e99 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -87,8 +87,7 @@ extern "C" {
>   */
>  #define MAX_ARRAYS256
>  
> -/* if we support a native gallium TG4 with the ability to take 4 texoffsets 
> then bump this */
> -#define MAX_GLSL_TEXTURE_OFFSET 1
> +#define MAX_GLSL_TEXTURE_OFFSET 4
>  
>  class st_src_reg;
>  class st_dst_reg;
> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
>  void
>  glsl_to_tgsi_visitor::visit(ir_texture *ir)
>  {
> -   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
> offset, sample_index, component;
> +   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
> st_dst_reg result_dst, coord_dst, cube_sc_dst;
> glsl_to_tgsi_instruction *inst = NULL;
> unsigned opcode = TGSI_OPCODE_NOP;
> const glsl_type *sampler_type = ir->sampler->type;
> bool is_cube_array = false;
> +   unsigned i;
>  
> /* if we are a cube array sampler */
> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 : 
> TGSI_OPCODE_TEX; 
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txb:
> @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txl:
> @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txd:
> @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>dy = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txs:
> @@ -2814,7 +2814,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>lod_info = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - offset = this->result;
> + offset[0] = this->result;
>}
>break;
> case ir_txf_ms:
> @@ -2828,9 +2828,17 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>component = this->result;
>if (ir->offset) {
>   ir->offset->accept(this);
> - /* this should have been lowered */
> - assert(ir->offset->type->base_type != GLSL_TYPE_ARRAY);
> - offset = this->result;
> + if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) {
> +const glsl_type *elt_type = ir->offset->type->fields.array;
> +for (i = 0; i < ir->offset->type->length; i++) {
> +   offset[i] = this->result;
> +   offset[i].index += i * type_size(elt_type);
> +   offset[i].type = elt_type->base_type;
> +   offset[i].swizzle = 
> swizzle_for_size(elt_type->vector_elements);
> +}
> + } else {
> +offset[0] = this->result;
> + }
>}
>break;
> case ir_lod:
> @@ -2960,8 +2968,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>  this->prog);
>  
> if (ir->offset) {
> -  inst->tex_offset_num_offset = 1;
> -  inst->tex_offsets[0] = offset;
> +  for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != 
> PROGRAM_UNDEFINED; i++)
> + inst->tex_offsets[i] = offset[i];
> +  inst->tex_offset_num_offset = i;
> }
>  
>

Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported

2014-05-06 Thread Ilia Mirkin
On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger  wrote:
> Looks good to me.

Thanks!

> Does that mean if also the GATHER_SM5 cap is supported you have to
> support 4 independent, non-constant offsets?

Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you
have to support independent non-constant offsets. And if you have
PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that
you can handle multiple independent offsets in a single texgather.
Without the cap, the 4 offsets get lowered into 4 separate texgathers
(with only one of the returned components used).

With nvc0, the offsets are passed in via a register, so non-constant
is never an issue. And with nv50, the offsets must be immediates (and
there can be only 1 set of them), but it also has no hope of
supporting all of ARB_gs5.

> Would it make sense to reorder the caps so the gather stuff is all
> together (now 5 cap bits just for this...)?

The quantity of caps for texgather is a little ridiculous. I'm of the
opinion that this should be the default behaviour, and it should be up
to the driver to lower it into 4 texgathers if it can't handle them
directly. Furthermore, this functionality is only available (via GL)
with ARB_gs5, which in turn will require a whole bunch of stuff, so I
don't know whether the GATHER_SM5 cap is really that useful. And for
someone with a DX tracker, this functionality would again not be
useful on its own, the rest of SM5 would have to be supported as well
(I assume).

But that's not what got implemented, and I don't care to modify
radeon, which can only support 1 offset at a time. (Although I don't
think the radeon impl got pushed...) I anticipate that llvmpipe
doesn't care one way or another (perhaps with even a minor preference
towards having it all in one instruction).

If there's consensus, happy to switch this on by default and get rid
of the cap :) [And also get rid of the GATHER_SM5 cap.]

>
> Roland
>
> Am 29.04.2014 01:30, schrieb Ilia Mirkin:
>> Signed-off-by: Ilia Mirkin 
>> ---
>>
>> The handling of the 4 offsets is less-than-pretty. I had an alternate version
>> that created a new ir_dereference_array object and ran ->accept on that. This
>> worked as well, but for each offset it would create a separate new array, and
>> then deref just one item out of it. This seems incredibly wasteful. The
>> slightly open-coded version of that seems reasonable and uses the same array.
>>
>>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 
>> ++
>>  1 file changed, 41 insertions(+), 14 deletions(-)
>>
>> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
>> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>> index d1c3856..20d5e99 100644
>> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>> @@ -87,8 +87,7 @@ extern "C" {
>>   */
>>  #define MAX_ARRAYS256
>>
>> -/* if we support a native gallium TG4 with the ability to take 4 texoffsets 
>> then bump this */
>> -#define MAX_GLSL_TEXTURE_OFFSET 1
>> +#define MAX_GLSL_TEXTURE_OFFSET 4
>>
>>  class st_src_reg;
>>  class st_dst_reg;
>> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
>>  void
>>  glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>  {
>> -   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
>> offset, sample_index, component;
>> +   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
>> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
>> st_dst_reg result_dst, coord_dst, cube_sc_dst;
>> glsl_to_tgsi_instruction *inst = NULL;
>> unsigned opcode = TGSI_OPCODE_NOP;
>> const glsl_type *sampler_type = ir->sampler->type;
>> bool is_cube_array = false;
>> +   unsigned i;
>>
>> /* if we are a cube array sampler */
>> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
>> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 
>> : TGSI_OPCODE_TEX;
>>if (ir->offset) {
>>   ir->offset->accept(this);
>> - offset = this->result;
>> + offset[0] = this->result;
>>}
>>break;
>> case ir_txb:
>> @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>lod_info = this->result;
>>if (ir->offset) {
>>   ir->offset->accept(this);
>> - offset = this->result;
>> + offset[0] = this->result;
>>}
>>break;
>> case ir_txl:
>> @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>lod_info = this->result;
>>if (ir->offset) {
>>   ir->offset->accept(this);
>> - offset = this->result;
>> + offset[0] = this->result;
>>}
>>break;
>> case ir_txd:
>> @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>dy = this->result;
>>if (ir->offset) {
>>   ir->offset-

Re: [Mesa-dev] [PATCH 3/3] svga: add switch case for PIPE_SHADER_CAP_PREFERRED_IR, remove default case

2014-05-06 Thread Jose Fonseca
Series LGTM.

Jose

- Original Message -
> Remove default switch case so we're warned of missing cases at compile
> time.
> ---
>  src/gallium/drivers/svga/svga_screen.c |   18 ++
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/src/gallium/drivers/svga/svga_screen.c
> b/src/gallium/drivers/svga/svga_screen.c
> index 8c70bb7..bc914b7 100644
> --- a/src/gallium/drivers/svga/svga_screen.c
> +++ b/src/gallium/drivers/svga/svga_screen.c
> @@ -343,11 +343,12 @@ static int svga_get_shader_param(struct pipe_screen
> *screen, unsigned shader, en
>case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
>case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
>   return 16;
> -  default:
> - debug_printf("Unexpected fragment shader query %u\n", param);
> - return 0;
> +  case PIPE_SHADER_CAP_PREFERRED_IR:
> + return PIPE_SHADER_IR_TGSI;
>}
> -  break;
> +  /* If we get here, we failed to handle a cap above */
> +  debug_printf("Unexpected fragment shader query %u\n", param);
> +  return 0;
> case PIPE_SHADER_VERTEX:
>switch (param)
>{
> @@ -394,11 +395,12 @@ static int svga_get_shader_param(struct pipe_screen
> *screen, unsigned shader, en
>case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
>case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
>   return 0;
> -  default:
> - debug_printf("Unexpected vertex shader query %u\n", param);
> - return 0;
> +  case PIPE_SHADER_CAP_PREFERRED_IR:
> + return PIPE_SHADER_IR_TGSI;
>}
> -  break;
> +  /* If we get here, we failed to handle a cap above */
> +  debug_printf("Unexpected vertex shader query %u\n", param);
> +  return 0;
> case PIPE_SHADER_GEOMETRY:
> case PIPE_SHADER_COMPUTE:
>/* no support for geometry or compute shaders at this time */
> --
> 1.7.10.4
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=NMr9uy2iTjWVixC0wOcYCWEIYhfo80qKwRgdodpoDzA%3D%0A&m=MQcxpL%2FgTB1nG3hZaJq%2FUqYDOOEJQ8XufGj7cxgVFEA%3D%0A&s=b771d15063454cf04d10429852920d1ef152aac7a9e539b01aa528bb1bedfdbc
> 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] automake: Honor GL_LIB for gallium libgl-xlib

2014-05-06 Thread Brad King
Use "@GL_LIB@" in src/gallium/targets/libgl-xlib/Makefile.am to produce
the library name specified by the configure --with-gl-lib-name option.
---
 src/gallium/targets/libgl-xlib/Makefile.am | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/targets/libgl-xlib/Makefile.am 
b/src/gallium/targets/libgl-xlib/Makefile.am
index 4ee364e..7651333 100644
--- a/src/gallium/targets/libgl-xlib/Makefile.am
+++ b/src/gallium/targets/libgl-xlib/Makefile.am
@@ -40,17 +40,17 @@ AM_CPPFLAGS = \
-DGALLIUM_GALAHAD
 AM_CFLAGS = $(X11_INCLUDES)
 
-lib_LTLIBRARIES = libGL.la
+lib_LTLIBRARIES = lib@GL_LIB@.la
 
-nodist_EXTRA_libGL_la_SOURCES = dummy.cpp
-libGL_la_SOURCES = xlib.c
-libGL_la_LDFLAGS = \
+nodist_EXTRA_lib@GL_LIB@_la_SOURCES = dummy.cpp
+lib@GL_LIB@_la_SOURCES = xlib.c
+lib@GL_LIB@_la_LDFLAGS = \
-no-undefined \
-version-number $(GL_MAJOR):$(GL_MINOR):$(GL_TINY) \
$(GC_SECTIONS) \
$(LD_NO_UNDEFINED)
 
-libGL_la_LIBADD = \
+lib@GL_LIB@_la_LIBADD = \
$(top_builddir)/src/gallium/state_trackers/glx/xlib/libxlib.la \
$(top_builddir)/src/gallium/winsys/sw/xlib/libws_xlib.la \
$(top_builddir)/src/gallium/drivers/softpipe/libsoftpipe.la \
@@ -64,9 +64,9 @@ libGL_la_LIBADD = \
$(CLOCK_LIB)
 
 if HAVE_MESA_LLVM
-libGL_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la 
$(LLVM_LIBS)
+lib@GL_LIB@_la_LIBADD += 
$(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS)
 AM_CPPFLAGS += -DGALLIUM_LLVMPIPE
-libGL_la_LDFLAGS += $(LLVM_LDFLAGS)
+lib@GL_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS)
 endif
 
 include $(top_srcdir)/install-gallium-links.mk
-- 
1.9.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] GL_OES_texture_float and GL_OES_texture_half_float support

2014-05-06 Thread Matt Turner
The title should be something like

mesa: Expose GL_OES_texture_float and GL_OES_texture_half_float.

Have you found an application that wants these extensions? That might
be useful to describe in the commit message.

On Tue, May 6, 2014 at 4:02 AM, Kevin Rogovin  wrote:
> Add support for GLES2 extensions for floating point and half
> floating point textures (GL_OES_texture_float, GL_OES_texture_half_float,
> GL_OES_texture_float_linear and GL_OES_texture_half_float_linear).
>
> ---
>  src/mesa/main/extensions.c | 12 +
>  src/mesa/main/glformats.c  | 27 
>  src/mesa/main/pack.c   | 17 +
>  src/mesa/main/teximage.c   | 61 
> ++
>  4 files changed, 117 insertions(+)
>
> diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
> index c2ff7e3..58a5a51 100644
> --- a/src/mesa/main/extensions.c
> +++ b/src/mesa/main/extensions.c
> @@ -360,6 +360,18 @@ static const struct extension extension_table[] = {
> { "GL_SGIS_texture_lod",o(dummy_true),
>   GLL,1997 },
> { "GL_SUN_multi_draw_arrays",   o(dummy_true),
>   GLL,1999 },
>
> +   /*
> + TODO:
> +  - rather than have an all or nothing approach for floating point 
> textures,
> +allow for driver to specify what parts of floating point texture 
> funtionality

functionality

> +is supported: float/half-float and filtering for each.

For which driver would that be useful?

> +*/
> +   { "GL_OES_texture_float",   o(ARB_texture_float), 
>   ES2|ES3,2005 },
> +   { "GL_OES_texture_half_float",  o(ARB_texture_float), 
>   ES2|ES3,2005 },
> +   { "GL_OES_texture_float_linear",o(ARB_texture_float), 
>   ES2|ES3,2005 },
> +   { "GL_OES_texture_half_float_linear",   o(ARB_texture_float), 
>   ES2|ES3,2005 },

The ES3 bit is for extensions that can not be exposed in ES2. ES2
means that it will be exposed in both ES2 and ES3.

The extension table is organized by extension prefix. Put these
extensions in the proper place.

> +
> +
> { 0, 0, 0, 0 },
>  };
>
> diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
> index 9bb341c..be728f4 100644
> --- a/src/mesa/main/glformats.c
> +++ b/src/mesa/main/glformats.c
> @@ -93,6 +93,7 @@ _mesa_sizeof_type(GLenum type)
> case GL_DOUBLE:
>return sizeof(GLdouble);
> case GL_HALF_FLOAT_ARB:
> +   case GL_HALF_FLOAT_OES:
>return sizeof(GLhalfARB);
> case GL_FIXED:
>return sizeof(GLfixed);
> @@ -125,6 +126,7 @@ _mesa_sizeof_packed_type(GLenum type)
> case GL_INT:
>return sizeof(GLint);
> case GL_HALF_FLOAT_ARB:
> +   case GL_HALF_FLOAT_OES:
>return sizeof(GLhalfARB);
> case GL_FLOAT:
>return sizeof(GLfloat);
> @@ -243,6 +245,7 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type)
> case GL_FLOAT:
>return comps * sizeof(GLfloat);
> case GL_HALF_FLOAT_ARB:
> +   case GL_HALF_FLOAT_OES:
>return comps * sizeof(GLhalfARB);
> case GL_UNSIGNED_BYTE_3_3_2:
> case GL_UNSIGNED_BYTE_2_3_3_REV:
> @@ -1365,6 +1368,11 @@ _mesa_error_check_format_and_type(const struct 
> gl_context *ctx,
>  case GL_FLOAT:
>  case GL_HALF_FLOAT:
> return GL_NO_ERROR;
> +case GL_HALF_FLOAT_OES:
> +   return (format==GL_LUMINANCE ||
> +   format==GL_LUMINANCE_ALPHA ||
> +   format==GL_ALPHA)

Spaces around operators. Repeated below as well.

> +  ? GL_NO_ERROR: GL_INVALID_ENUM;
>  default:
> return GL_INVALID_ENUM;
>   }
> @@ -1401,6 +1409,9 @@ _mesa_error_check_format_and_type(const struct 
> gl_context *ctx,
>  case GL_UNSIGNED_SHORT_5_6_5_REV:
>  case GL_HALF_FLOAT:
> return GL_NO_ERROR;
> +case GL_HALF_FLOAT_OES:
> +   return (format==GL_RGB)
> +  ? GL_NO_ERROR: GL_INVALID_ENUM;
>  case GL_UNSIGNED_INT_2_10_10_10_REV:
> /* OK by GL_EXT_texture_type_2_10_10_10_REV */
> return (ctx->API == API_OPENGLES2)
> @@ -1454,6 +1465,9 @@ _mesa_error_check_format_and_type(const struct 
> gl_context *ctx,
>  case GL_UNSIGNED_INT_2_10_10_10_REV:
>  case GL_HALF_FLOAT:
> return GL_NO_ERROR;
> +case GL_HALF_FLOAT_OES:
> +   return (format==GL_RGBA)
> +  ? GL_NO_ERROR: GL_INVALID_ENUM;
>  default:
> return GL_INVALID_ENUM;
>   }
> @@ -1666,6 +1680,7 @@ _mesa_es_error_check_format_and_type(GLenum format, 
> GLenum type,
> return type_valid ? GL_NO_ERROR : GL_I

Re: [Mesa-dev] [PATCH 0/2] i965: Simulate MAD opcode with gen<6

2014-05-06 Thread Eric Anholt
Juha-Pekka Heikkila  writes:

> These patches allow MAD opcode to be used with pre gen6 hardware.
> Instead of failing on emitting MAD there will be emitted MUL and ADD 
> to simulate MAD.
>
> I tried this with piglit on ILK (gen5) and did not see regression.

This hides the MUL and ADD from instruction scheduling, which I expect
to make performance worse.  What was the motivation for this?


pgpIztp4XE9kH.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] i965/fs: Simulate MAD opcode with gen<6

2014-05-06 Thread Matt Turner
On Tue, May 6, 2014 at 3:53 AM, Juha-Pekka Heikkila
 wrote:
> Signed-off-by: Juha-Pekka Heikkila 
> ---
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 ++-
>  1 file changed, 10 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index d2dc5fa..22ca528 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -293,10 +293,6 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
>  bool
>  fs_visitor::try_emit_mad(ir_expression *ir)
>  {
> -   /* 3-src instructions were introduced in gen6. */
> -   if (brw->gen < 6)
> -  return false;
> -
> /* MAD can only handle floating-point data. */
> if (ir->type != glsl_type::float_type)
>return false;
> @@ -327,7 +323,16 @@ fs_visitor::try_emit_mad(ir_expression *ir)
> fs_reg src2 = this->result;
>
> this->result = fs_reg(this, ir->type);
> -   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> +
> +   /* 3-src instructions were introduced in gen6. */
> +   if (brw->gen < 6) {
> +  fs_reg temp = fs_reg(this, glsl_type::float_type);
> +
> +  emit(MUL(temp, src1, src2));
> +  emit(ADD(this->result, src0, temp));
> +   } else {
> +  emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> +   }
>
> return true;
>  }
> --
> 1.8.1.2

try_emit_mad is called every time we visit an add-expression, and on
platforms that don't have MAD it fails and the compiler generates
standard code for the expression tree.

So, if your expression tree was a multiply-add, the compiler will
generate a multiply and an add instruction. Adding code to make
try_emit_mad do that doesn't actually change anything.

I've made a branch that uses the LINE instruction to perform
multiply-adds when the arguments are immediates. Minus the shader size
explosion in unigine tropics, it seems to be a pretty nice
improvement. But the problem with unigine will have to be sorted out
before it can be committed.

Maybe you'd be interested in taking a look at that?

See https://bugs.freedesktop.org/show_bug.cgi?id=77544
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported

2014-05-06 Thread Roland Scheidegger
Am 06.05.2014 17:03, schrieb Ilia Mirkin:
> On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger  
> wrote:
>> Looks good to me.
> 
> Thanks!
> 
>> Does that mean if also the GATHER_SM5 cap is supported you have to
>> support 4 independent, non-constant offsets?
> 
> Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you
> have to support independent non-constant offsets. And if you have
> PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that
> you can handle multiple independent offsets in a single texgather.
> Without the cap, the 4 offsets get lowered into 4 separate texgathers
> (with only one of the returned components used).
> 
> With nvc0, the offsets are passed in via a register, so non-constant
> is never an issue. And with nv50, the offsets must be immediates (and
> there can be only 1 set of them), but it also has no hope of
> supporting all of ARB_gs5.
> 
>> Would it make sense to reorder the caps so the gather stuff is all
>> together (now 5 cap bits just for this...)?
> 
> The quantity of caps for texgather is a little ridiculous. I'm of the
> opinion that this should be the default behaviour, and it should be up
> to the driver to lower it into 4 texgathers if it can't handle them
> directly. Furthermore, this functionality is only available (via GL)
> with ARB_gs5, which in turn will require a whole bunch of stuff, so I
> don't know whether the GATHER_SM5 cap is really that useful. And for
> someone with a DX tracker, this functionality would again not be
> useful on its own, the rest of SM5 would have to be supported as well
> (I assume).
> 
> But that's not what got implemented, and I don't care to modify
> radeon, which can only support 1 offset at a time. (Although I don't
> think the radeon impl got pushed...) I anticipate that llvmpipe
> doesn't care one way or another (perhaps with even a minor preference
> towards having it all in one instruction).
> 
> If there's consensus, happy to switch this on by default and get rid
> of the cap :) [And also get rid of the GATHER_SM5 cap.]
Well I think the point was that there's really hw which can only do
simple gather (what d3d10.1 could do or arb_texture_gather would do).
This hw will not be able to do other stuff from newer gl versions anyway
so it should not be required to support those new features.
I'm not entirely sure to what it's actually lowered but in any case
llvmpipe if it implemented this definitely would want a non-lowered
version. I think though some radeon hw could really do SM5 version but
not independent offsets natively, though I'm not sure if it would really
be all that complicated to handle it in the driver.
I guess though this could be changed later rather easily.

Roland


> 
>>
>> Roland
>>
>> Am 29.04.2014 01:30, schrieb Ilia Mirkin:
>>> Signed-off-by: Ilia Mirkin 
>>> ---
>>>
>>> The handling of the 4 offsets is less-than-pretty. I had an alternate 
>>> version
>>> that created a new ir_dereference_array object and ran ->accept on that. 
>>> This
>>> worked as well, but for each offset it would create a separate new array, 
>>> and
>>> then deref just one item out of it. This seems incredibly wasteful. The
>>> slightly open-coded version of that seems reasonable and uses the same 
>>> array.
>>>
>>>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 
>>> ++
>>>  1 file changed, 41 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
>>> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> index d1c3856..20d5e99 100644
>>> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> @@ -87,8 +87,7 @@ extern "C" {
>>>   */
>>>  #define MAX_ARRAYS256
>>>
>>> -/* if we support a native gallium TG4 with the ability to take 4 
>>> texoffsets then bump this */
>>> -#define MAX_GLSL_TEXTURE_OFFSET 1
>>> +#define MAX_GLSL_TEXTURE_OFFSET 4
>>>
>>>  class st_src_reg;
>>>  class st_dst_reg;
>>> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
>>>  void
>>>  glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>>  {
>>> -   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
>>> offset, sample_index, component;
>>> +   st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, 
>>> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component;
>>> st_dst_reg result_dst, coord_dst, cube_sc_dst;
>>> glsl_to_tgsi_instruction *inst = NULL;
>>> unsigned opcode = TGSI_OPCODE_NOP;
>>> const glsl_type *sampler_type = ir->sampler->type;
>>> bool is_cube_array = false;
>>> +   unsigned i;
>>>
>>> /* if we are a cube array sampler */
>>> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
>>> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
>>>opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 
>>> : TGSI_OPCODE_TEX;
>>>if (ir->offset) {
>>>   ir->offset->accept(t

Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported

2014-05-06 Thread Ilia Mirkin
On Tue, May 6, 2014 at 1:29 PM, Roland Scheidegger  wrote:
> Am 06.05.2014 17:03, schrieb Ilia Mirkin:
>> On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger  
>> wrote:
>>> Looks good to me.
>>
>> Thanks!
>>
>>> Does that mean if also the GATHER_SM5 cap is supported you have to
>>> support 4 independent, non-constant offsets?
>>
>> Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you
>> have to support independent non-constant offsets. And if you have
>> PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that
>> you can handle multiple independent offsets in a single texgather.
>> Without the cap, the 4 offsets get lowered into 4 separate texgathers
>> (with only one of the returned components used).
>>
>> With nvc0, the offsets are passed in via a register, so non-constant
>> is never an issue. And with nv50, the offsets must be immediates (and
>> there can be only 1 set of them), but it also has no hope of
>> supporting all of ARB_gs5.
>>
>>> Would it make sense to reorder the caps so the gather stuff is all
>>> together (now 5 cap bits just for this...)?
>>
>> The quantity of caps for texgather is a little ridiculous. I'm of the
>> opinion that this should be the default behaviour, and it should be up
>> to the driver to lower it into 4 texgathers if it can't handle them
>> directly. Furthermore, this functionality is only available (via GL)
>> with ARB_gs5, which in turn will require a whole bunch of stuff, so I
>> don't know whether the GATHER_SM5 cap is really that useful. And for
>> someone with a DX tracker, this functionality would again not be
>> useful on its own, the rest of SM5 would have to be supported as well
>> (I assume).
>>
>> But that's not what got implemented, and I don't care to modify
>> radeon, which can only support 1 offset at a time. (Although I don't
>> think the radeon impl got pushed...) I anticipate that llvmpipe
>> doesn't care one way or another (perhaps with even a minor preference
>> towards having it all in one instruction).
>>
>> If there's consensus, happy to switch this on by default and get rid
>> of the cap :) [And also get rid of the GATHER_SM5 cap.]
> Well I think the point was that there's really hw which can only do
> simple gather (what d3d10.1 could do or arb_texture_gather would do).
> This hw will not be able to do other stuff from newer gl versions anyway
> so it should not be required to support those new features.

Right. But since that hw will only ever expose ARB_texture_gather and
not ARB_gpu_shader5, it will never receive a TG4 instruction with
non-const offsets or multiple offsets. So the cap to indicate that
non-const or quad offsets are supported isn't really necessary, since
those will only appear if ARB_gs5 support is claimed, which requires
more than just the texgather stuff. (The
PIPE_CAP_TEXTURE_GATHER_COMPONENTS cap _is_ necessary since it
indicates ARB_texture_gather support, and the value that should be
returned by some GL query about what tex gather supports.)

> I'm not entirely sure to what it's actually lowered but in any case
> llvmpipe if it implemented this definitely would want a non-lowered
> version.

Right now, it'll get lowered to 4 texgathers, with only one of the
returned 4 components used from each one. (And it can't use texfetch
since the min/max offsets are different, and there's probably some
other clever reason as well.)

> I think though some radeon hw could really do SM5 version but
> not independent offsets natively, though I'm not sure if it would really
> be all that complicated to handle it in the driver.

Well, I think the claim was that SM5 doesn't actually support the 4
separate offsets, but GL4 does with textureGatherOffsets(). Also, I
believe that radeon supports non-const natively, just not with 4
offsets in one instruction. Same deal with i965 (which is why that
lowering pass exists in the first place).

> I guess though this could be changed later rather easily.
>
> Roland
>
>
>>
>>>
>>> Roland
>>>
>>> Am 29.04.2014 01:30, schrieb Ilia Mirkin:
 Signed-off-by: Ilia Mirkin 
 ---

 The handling of the 4 offsets is less-than-pretty. I had an alternate 
 version
 that created a new ir_dereference_array object and ran ->accept on that. 
 This
 worked as well, but for each offset it would create a separate new array, 
 and
 then deref just one item out of it. This seems incredibly wasteful. The
 slightly open-coded version of that seems reasonable and uses the same 
 array.

  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 
 ++
  1 file changed, 41 insertions(+), 14 deletions(-)

 diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
 b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
 index d1c3856..20d5e99 100644
 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
 +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
 @@ -87,8 +87,7 @@ extern "C" {
   */
  #defin

Re: [Mesa-dev] [Mesa-users] Problem with ARB_copy_buffer on Mesa 9.2.4

2014-05-06 Thread Benjamin Bellec
Hello Jonas,

I tested your program on my system (Fedora 19) on an Evergreen
(Radeon HD 5850) and I have the same issue indeed.
Here is my result :

Mesa 9.2.4 (from F19 repo) => Data does NOT match up!
Mesa 9.2.5 => Data does NOT match up!
Mesa 10.0.5 => Data does NOT match up!
Mesa 10.1.0 => Data matches.

So this is fixed in newer version.
That said, Mesa 9.2 is not supported anymore and I really don't know if
there will be a new Mesa 10.0.x release given the imminence of Mesa 10.2.
If yes, I can bisect and you can open a bug.

Mesa-dev, any new 10.0.x release planned ?

Regards.

Benjamin

Le 06/05/2014 13:40, Jonas Kulla a écrit :
> Hello list,
>
> after about 3 days of debugging, I was able to isolate a rather weird
> behavior in Mesa GL.
> The gist of it is the following: When I create a buffer object and
> allocate uninitialized
> memory for it (glBufferData() with nullptr as 'data'),
> then glCopyBufferSubData() data into
> it from another buffer object, then subsequently fill a part of it
> with glBufferSubData(),
> this new data isn't visible to the buffer object. In fact, it seems
> that the SubData'ed bytes
> are completely lost. Any further data uploads however work as
> expected. I will attach
> a small C test case below that demonstrates this behavior.
>
> I realize that I am working with an old Mesa release (on Fedora 19),
> but I'm afraid of
> upgrading my system to the newest distro release as I might break my
> working environment.
> That's why I would like to kindly ask if someone could verify that
> this problem still persists
> on the newest Mesa code, in which case I would go ahead and file a bug
> report. At the
> same time, maybe someone could spot a critical mistake in my code that
> would explain
> this strange behavior I'm seeing. I think the code paths I'm hitting
> here in the driver are
> sufficiently obscure though.
>
> I should probably mention that my card is a Mobility Radeon HD 3650
> (ie. r600).
>
> Here's the code sample (you can replace the GL setup code with your own):
>
> #include 
> #include 
>  
> #include 
> #include 
>  
> static SDL_Window *win;
> static SDL_GLContext *ctx;
>  
> void setupGL()
> {
> SDL_Init(SDL_INIT_VIDEO);
> win = SDL_CreateWindow("CopyBufferBug",
> SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 64, 64,
> SDL_WINDOW_OPENGL);
> ctx = SDL_GL_CreateContext(win);
> glewInit();
> }
>  
> static void teardownGL()
> {
> SDL_GL_DeleteContext(ctx);
> SDL_DestroyWindow(win);
>  
> SDL_Quit();
> }
>  
> int main(int argc, char *argv[])
> {
> setupGL();
>  
> /* These don't matter I think */
> #define BLOCK_SIZE 128
> #define BUFFER1_SIZE BLOCK_SIZE
> #define BUFFER2_SIZE BLOCK_SIZE
> #define BUFFER1_TARGET GL_COPY_READ_BUFFER
> #define BUFFER2_TARGET GL_COPY_WRITE_BUFFER
> #define BUFFER1_USAGE GL_DYNAMIC_DRAW
> #define BUFFER2_USAGE GL_DYNAMIC_DRAW
>  
> GLuint buffers[2];
> glGenBuffers(2, buffers);
>  
> /* We allocate both buffers with undefined memory */
> glBindBuffer(BUFFER1_TARGET, buffers[0]);
> glBufferData(BUFFER1_TARGET, BUFFER1_SIZE, 0, BUFFER1_USAGE);
>  
> glBindBuffer(BUFFER2_TARGET, buffers[1]);
> glBufferData(BUFFER2_TARGET, BUFFER2_SIZE, 0, BUFFER2_USAGE);
>  
> /* Then copy (undefined) bytes from the first into the second
> buffer */
> /* Note: If I comment this line out, everything works */
> glCopyBufferSubData(BUFFER1_TARGET, BUFFER2_TARGET, 0, 0,
> BUFFER1_SIZE);
>  
> /* Generate random string */
> FILE *rand = fopen("/dev/urandom", "r");
> char data[BLOCK_SIZE];
> fread(data, 1, sizeof(data), rand);
> fclose(rand);
>  
> /* We fill the second buffer with defined data */
> /* Note: If I execute this call twice (just copy paste the
> line), everything works */
> glBufferSubData(BUFFER2_TARGET, 0, sizeof(data), data);
>  
> /* Then download it again to compare its contents against our
> test string */
> char data2[BLOCK_SIZE];
> glGetBufferSubData(BUFFER2_TARGET, 0, sizeof(data2), data2);
>  
> if (memcmp(data, data2, sizeof(data)))
> printf("Data does NOT match up!\n");
> else
> printf("Data matches.\n");
>  
> glDeleteBuffers(2, buffers);
>  
> teardownGL();
>  
> return 0;
> }
>
> Thank you very much for your time.
> Jonas
>
>
> ___
> mesa-users mailing list
> mesa-us...@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-users

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/5] i965: Always intel_prepare_render() after invalidating front buffers.

2014-05-06 Thread Eric Anholt
Kenneth Graunke  writes:

> Fixes glean/texture_srgb, which hit recursive-flush prevention
> assertions in vbo_exec_FlushVertices.
>
> This probably hurts the performance of front buffer rendering, but
> very few people in their right mind do front buffer rendering.

This series is:

Reviewed-by: Eric Anholt 


pgpw9NR42MFDO.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [rong.r.y...@intel.com: [Intel-gfx] How user space applications load registers on HSW?]

2014-05-06 Thread Ben Widawsky
FWDing to mesa-dev, since they should have the same issue.

- Forwarded message from "Yang, Rong R"  -

Date: Tue, 6 May 2014 08:26:15 +
From: "Yang, Rong R" 
To: "intel-...@lists.freedesktop.org" 
Subject: [Intel-gfx] How user space applications load registers on HSW?
Message-ID: 
<7597c9376c272a4ab2d29e91550b7b0901354...@shsmsx102.ccr.corp.intel.com>

   Hi,


   I am developing the HSW’s OCL driver in the linux. I encounter a LRI
   problem on HSW.


   Some gpgpu's applications, which use the shared local memory, must load
   the L3CTRLREG2 and L3CTRLREG3 registers to allocate the SLM in the L3
   cache.

   So I add L3CTRLREG2 and L3CTRLREG3 to the gen7_render_regs to pass the
   cmds parse when exec buffer. But it still don’t work.

   I notice that, on HSW, the commands that load the register, such as
   MI_LOAD_REGISTER_IMM, will be converted to NOOP by the GPU if the batch
   buffer's MI_BATCH_NON_SECURE_HSW bit is set. And after parse cmd, the
   MI_BATCH_NON_SECURE_HSW still set in the kernel. So HSW don’t accept
   LRI commands.


   Can I load these registers in the user space? Or should I hack the
   kernel?


   Yang Rong

___
Intel-gfx mailing list
intel-...@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


- End forwarded message -

-- 
Ben Widawsky, Intel Open Source Technology Center
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965: Relax accumulator dependency scheduling on Gen < 6

2014-05-06 Thread Matt Turner
Nice work.

On Tue, May 6, 2014 at 1:16 AM, Iago Toral Quiroga  wrote:
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
> b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 6e74803..37d3eab 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -676,6 +676,13 @@ backend_instruction::reads_accumulator_implicitly() const
>  }
>
>  bool
> +backend_instruction::writes_accumulator_implicitly(int gen) const
> +{
> +   return writes_accumulator ||
> +  (gen < 6 && opcode >= BRW_OPCODE_ADD && opcode != BRW_OPCODE_NOP);

Since our virtual instruction opcodes are > BRW_OPCODE_NOP, they'll
also be classified as writing the accumulator, whereas before they
weren't.

I think the only ones (that are used on gen < 6) that generate
hardware instructions that write the accumulator are

   FS_OPCODE_DDX
   FS_OPCODE_DDY
   FS_OPCODE_PIXEL_X
   FS_OPCODE_PIXEL_Y
   FS_OPCODE_CINTERP
   FS_OPCODE_LINTERP

If you update this function with these and it still passes piglit on
gen < 6, then this patch is

Reviewed-by: Matt Turner 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [rong.r.y...@intel.com: [Intel-gfx] How user space applications load registers on HSW?]

2014-05-06 Thread Kenneth Graunke
On 05/06/2014 08:26:15 AM, Yang, Rong R wrote:
> Hi,
> 
> I am developing the HSW’s OCL driver in the linux. I encounter a LRI
> problem on HSW.
> 
> 
> Some gpgpu's applications, which use the shared local memory, must load
> the L3CTRLREG2 and L3CTRLREG3 registers to allocate the SLM in the L3
> cache.
> 
> So I add L3CTRLREG2 and L3CTRLREG3 to the gen7_render_regs to pass the
> cmds parse when exec buffer. But it still don’t work.
> 
> I notice that, on HSW, the commands that load the register, such as
> MI_LOAD_REGISTER_IMM, will be converted to NOOP by the GPU if the batch
> buffer's MI_BATCH_NON_SECURE_HSW bit is set. And after parse cmd, the
> MI_BATCH_NON_SECURE_HSW still set in the kernel. So HSW don’t accept
> LRI commands.
> 
> 
> Can I load these registers in the user space? Or should I hack the
> kernel?
> 
> 
> Yang Rong

I've been asking the kernel developers for the ability to LRI/LRM from
userspace batches for around 1.5 years.  Unfortunately, we're still
waiting, and I honestly have no idea when they're going to finish it.

In the meantime, you can apply the attached patch to your kernel tree to
disable the hardware scanner, letting you run whatever commands you
want.  Obviously, we can't ship this on production systems, but it will
allow you to do your development without having to wait for the kernel team.

--Ken
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index a3ba9a8..86c173b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -995,6 +995,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return ret;
 
 	flags = 0;
+	flags |= I915_DISPATCH_SECURE;
 	if (args->flags & I915_EXEC_SECURE) {
 		if (!file->is_master || !capable(CAP_SYS_ADMIN))
 		return -EPERM;


signature.asc
Description: OpenPGP digital signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 1/2] Import eglextchromium.h from Chromium.

2014-05-06 Thread Sarah Sharp
In order to support the (currently unregistered) Chromium-specific EGL
extension eglGetSyncValuesCHROMIUM on Intel systems, we need to import
the Chromium header that defines it.  The file was downloaded from

https://chromium.googlesource.com/chromium/chromium/+/trunk/ui/gl/EGL/eglextchromium.h

It is subject to the license found at

https://chromium.googlesource.com/chromium/chromium/+/trunk/LICENSE

I have imported the header file and added the license text to the top.
The only change was to fix the include guard on the Chromium header to
change the last line from a #define to a #endif, which makes the header
actually compile.

Signed-off-by: Sarah Sharp 
Reviewed-by: Chad Versace 
Cc: Jamey Sharp 
Cc: Ian Romanick 
Cc: Stéphane Marchesin 
---

v3: Add an include guard on Chromium header.

 include/EGL/eglext.h |  1 +
 include/EGL/eglextchromium.h | 60 
 src/egl/main/Makefile.am |  1 +
 3 files changed, 62 insertions(+)
 create mode 100644 include/EGL/eglextchromium.h

diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index 243da4a..88b39db 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -646,6 +646,7 @@ EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeNV (void);
 #endif /* EGL_NV_system_time */
 
 #include 
+#include 
 
 #ifdef __cplusplus
 }
diff --git a/include/EGL/eglextchromium.h b/include/EGL/eglextchromium.h
new file mode 100644
index 000..0cc0976
--- /dev/null
+++ b/include/EGL/eglextchromium.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//* Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//* Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//* Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file contains Chromium-specific EGL extensions declarations.
+
+#ifndef GPU_EGL_EGLEXTCHROMIUM_H_
+#define GPU_EGL_EGLEXTCHROMIUM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include 
+
+/* EGLSyncControlCHROMIUM requires 64-bit uint support */
+#if KHRONOS_SUPPORT_INT64
+#ifndef EGL_CHROMIUM_sync_control
+#define EGL_CHROMIUM_sync_control 1
+typedef khronos_uint64_t EGLuint64CHROMIUM;
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncValuesCHROMIUM(
+EGLDisplay dpy, EGLSurface surface, EGLuint64CHROMIUM *ust,
+EGLuint64CHROMIUM *msc, EGLuint64CHROMIUM *sbc);
+#endif /* EGL_EGLEXT_PROTOTYPES */
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCVALUESCHROMIUMPROC)
+(EGLDisplay dpy, EGLSurface surface, EGLuint64CHROMIUM *ust,
+ EGLuint64CHROMIUM *msc, EGLuint64CHROMIUM *sbc);
+#endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // GPU_EGL_EGLEXTCHROMIUM_H_
diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index 2858913..23207db 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -126,5 +126,6 @@ egldir = $(includedir)/EGL
 egl_HEADERS = \
$(top_srcdir)/include/EGL/eglext.h \
$(top_srcdir)/include/EGL/egl.h \
+   $(top_srcdir)/include/EGL/eglextchromium.h \
$(top_srcdir)/include/EGL/eglmesaext.h \
$(top_srcdir)/include/EGL/eglplatform.h
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 2/2] egl: Add EGL_CHROMIUM_sync_control extension.

2014-05-06 Thread Sarah Sharp
Chromium defined a new GL extension (that isn't registered with Khronos).
We need to add an EGL extension for it, so we can migrate ChromeOS on
Intel systems to use EGL instead of GLX.

http://git.chromium.org/gitweb/?p=chromium/src/third_party/khronos.git;a=commitdiff;h=27cbfdab35c601f70aa150581ad1448d0401f447

The EGL_CHROMIUM_sync_control extension is similar to the GLX extension
OML_sync_control, but only defines one function,
eglGetSyncValuesCHROMIUM, which is equivalent to glXGetSyncValuesOML.

http://www.opengl.org/registry/specs/OML/glx_sync_control.txt

Signed-off-by: Sarah Sharp 
Cc: Chad Versace 
Cc: Jamey Sharp 
Cc: Ian Romanick 
Cc: Stéphane Marchesin 
---
v2:
 - Clear up confusion around extension vs functions.  The new EGL
   extension name is CHROMIUM_sync_control and the new function name is
   eglGetSyncValuesCHROMIUM.
 - Remove all instances of #ifdef EGL_CHROMIUM_sync_control, but leave
   the #define in include/EGL/eglext.h.
 - Extensions are sorted by group, then alphabetically.  Make sure to
   respect that when adding the EGL_CHROMIUM_sync_control extension.
 - Set EGL error codes where appropriate.  Make sure
   dri2_x11_get_sync_values and eglGetSyncValuesCHROMIUM set an EGL
   error code when they fail.
 - Use the newly imported Chromium header, rather than putting the
   extension in eglext.h (which will be overwritten as new versions are
   imported from Khronos).

v3:
 - unchanged from v2

 src/egl/drivers/dri2/egl_dri2.c   | 10 ++
 src/egl/drivers/dri2/egl_dri2.h   |  4 
 src/egl/drivers/dri2/egl_dri2_fallbacks.h |  8 
 src/egl/drivers/dri2/platform_android.c   |  1 +
 src/egl/drivers/dri2/platform_drm.c   |  1 +
 src/egl/drivers/dri2/platform_wayland.c   |  1 +
 src/egl/drivers/dri2/platform_x11.c   | 29 +
 src/egl/main/eglapi.c | 23 +++
 src/egl/main/eglapi.h |  3 +++
 src/egl/main/egldisplay.h |  2 ++
 src/egl/main/eglmisc.c|  2 ++
 11 files changed, 84 insertions(+)

diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index dc541ad..e7987ee 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -1386,6 +1386,15 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, 
_EGLContext *ctx,
 }
 #endif
 
+static EGLBoolean
+dri2_get_sync_values_chromium(_EGLDisplay *dpy, _EGLSurface *surf,
+  EGLuint64KHR *ust, EGLuint64KHR *msc,
+  EGLuint64KHR *sbc)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
+   return dri2_dpy->vtbl->get_sync_values(dpy, surf, ust, msc, sbc);
+}
+
 /**
  * Set the error code after a call to
  * dri2_egl_image::dri_image::createImageFromTexture.
@@ -2177,6 +2186,7 @@ _eglBuiltInDriverDRI2(const char *args)
dri2_drv->base.API.UnbindWaylandDisplayWL = dri2_unbind_wayland_display_wl;
dri2_drv->base.API.QueryWaylandBufferWL = dri2_query_wayland_buffer_wl;
 #endif
+   dri2_drv->base.API.GetSyncValuesCHROMIUM = dri2_get_sync_values_chromium;
 
dri2_drv->base.Name = "DRI2";
dri2_drv->base.Unload = dri2_unload;
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index e62e265..44f26fb 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -138,6 +138,10 @@ struct dri2_egl_display_vtbl {
 
struct wl_buffer* (*create_wayland_buffer_from_image)(
 _EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img);
+
+   EGLBoolean (*get_sync_values)(_EGLDisplay *display, _EGLSurface *surface,
+ EGLuint64KHR *ust, EGLuint64KHR *msc,
+ EGLuint64KHR *sbc);
 };
 
 struct dri2_egl_display
diff --git a/src/egl/drivers/dri2/egl_dri2_fallbacks.h 
b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
index a5cf344..9cba001 100644
--- a/src/egl/drivers/dri2/egl_dri2_fallbacks.h
+++ b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
@@ -98,3 +98,11 @@ dri2_fallback_create_wayland_buffer_from_image(_EGLDriver 
*drv,
 {
return NULL;
 }
+
+static inline EGLBoolean
+dri2_fallback_get_sync_values(_EGLDisplay *dpy, _EGLSurface *surf,
+  EGLuint64KHR *ust, EGLuint64KHR *msc,
+  EGLuint64KHR *sbc)
+{
+   return EGL_FALSE;
+}
diff --git a/src/egl/drivers/dri2/platform_android.c 
b/src/egl/drivers/dri2/platform_android.c
index 7b1db76..71948bd 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -650,6 +650,7 @@ static struct dri2_egl_display_vtbl droid_display_vtbl = {
.copy_buffers = dri2_fallback_copy_buffers,
.query_buffer_age = dri2_fallback_query_buffer_age,
.create_wayland_buffer_from_image = 
dri2_fallback_create_wayland_buffer_from_image,
+   .get_sync_values = dri2_fallback_get_sync_values,
 };
 
 EGLBoolean
diff --git a/src/egl/drivers/

[Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.

2014-05-06 Thread Matt Turner
---
Noticed by inspection. Not tested.

It looks like this would have messed up the scratch space base pointer.

 src/mesa/drivers/dri/i965/gen8_ps_state.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c 
b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 7d8f954..3006a0e 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -134,7 +134,7 @@ static void
 upload_ps_state(struct brw_context *brw)
 {
struct gl_context *ctx = &brw->ctx;
-   uint32_t dw3 = 0, dw6 = 0, dw7 = 0;
+   uint32_t dw2 = 0, dw3 = 0, dw6 = 0, dw7 = 0;
 
/* CACHE_NEW_SAMPLER */
BEGIN_BATCH(2);
@@ -149,7 +149,7 @@ upload_ps_state(struct brw_context *brw)
 * incorrect for subspans where some of the pixels are unlit.  We believe
 * the bit just didn't take effect in previous generations.
 */
-   dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;
+   dw2 |= GEN7_PS_VECTOR_MASK_ENABLE;
 
/* CACHE_NEW_SAMPLER */
dw3 |=
@@ -229,7 +229,7 @@ upload_ps_state(struct brw_context *brw)
   OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16);
else
   OUT_BATCH(brw->wm.base.prog_offset);
-   OUT_BATCH(0);
+   OUT_BATCH(dw2);
OUT_BATCH(dw3);
if (brw->wm.prog_data->total_scratch) {
   OUT_RELOC64(brw->wm.base.scratch_bo,
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.

2014-05-06 Thread Kenneth Graunke
On 05/06/2014 12:59 PM, Matt Turner wrote:
> ---
> Noticed by inspection. Not tested.
> 
> It looks like this would have messed up the scratch space base pointer.
> 
>  src/mesa/drivers/dri/i965/gen8_ps_state.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c 
> b/src/mesa/drivers/dri/i965/gen8_ps_state.c
> index 7d8f954..3006a0e 100644
> --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
> @@ -134,7 +134,7 @@ static void
>  upload_ps_state(struct brw_context *brw)
>  {
> struct gl_context *ctx = &brw->ctx;
> -   uint32_t dw3 = 0, dw6 = 0, dw7 = 0;
> +   uint32_t dw2 = 0, dw3 = 0, dw6 = 0, dw7 = 0;
>  
> /* CACHE_NEW_SAMPLER */
> BEGIN_BATCH(2);
> @@ -149,7 +149,7 @@ upload_ps_state(struct brw_context *brw)
>  * incorrect for subspans where some of the pixels are unlit.  We believe
>  * the bit just didn't take effect in previous generations.
>  */
> -   dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;
> +   dw2 |= GEN7_PS_VECTOR_MASK_ENABLE;
>  
> /* CACHE_NEW_SAMPLER */
> dw3 |=
> @@ -229,7 +229,7 @@ upload_ps_state(struct brw_context *brw)
>OUT_BATCH(brw->wm.base.prog_offset + 
> brw->wm.prog_data->prog_offset_16);
> else
>OUT_BATCH(brw->wm.base.prog_offset);
> -   OUT_BATCH(0);
> +   OUT_BATCH(dw2);
> OUT_BATCH(dw3);
> if (brw->wm.prog_data->total_scratch) {
>OUT_RELOC64(brw->wm.base.scratch_bo,

NAK.  I don't know what docs you're looking at, but the latest ones:

DW0 - header, opcode, length
DW1, DW2 - kernel start pointer 0
DW3. has bit 30, "Vector Mask Enable."

I'm pretty sure the code is correct as is.

--Ken



signature.asc
Description: OpenPGP digital signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.

2014-05-06 Thread Matt Turner
On Tue, May 6, 2014 at 1:40 PM, Kenneth Graunke  wrote:
> NAK.  I don't know what docs you're looking at, but the latest ones:
>
> DW0 - header, opcode, length
> DW1, DW2 - kernel start pointer 0
> DW3. has bit 30, "Vector Mask Enable."

Oh, you're right. The 3DSTATE_PS was 404'd this morning, so I looked
at the wrong file in the XML.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] i965: Generalize the pixel_x/y workaround for all UW types.

2014-05-06 Thread Eric Anholt
This is the only case where a fs_reg in brw_fs_visitor is used during
optimization/code generation, and it meant that optimizations had to be
careful to not move pixel_x/y's register number without updating it.

Additionally, it turns out we had a couple of other UW values that weren't
getting this treatment (like gl_SampleID), so this more general fix is
probably a good idea (though I wasn't able to replicate problems with
either pixel_[xy]'s values or gl_SampleID, even when telling the register
allocator to reuse registers immediately)
---
 src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index c7b1f25..7969b67 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -86,10 +86,10 @@ fs_live_variables::setup_one_read(bblock_t *block, fs_inst 
*inst,
 */
int end_ip = ip;
if (v->dispatch_width == 16 && (reg.stride == 0 ||
-   ((v->pixel_x.file == GRF &&
- v->pixel_x.reg == reg.reg) ||
-(v->pixel_y.file == GRF &&
- v->pixel_y.reg == reg.reg {
+   reg.type == BRW_REGISTER_TYPE_UW ||
+   reg.type == BRW_REGISTER_TYPE_W ||
+   reg.type == BRW_REGISTER_TYPE_UB ||
+   reg.type == BRW_REGISTER_TYPE_B)) {
   end_ip++;
}
 
-- 
1.9.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] i965: Stop doing remapping of "special" regs.

2014-05-06 Thread Eric Anholt
Now that we aren't using pixel_[xy] in live variables, nothing is looking
at these regs after the visitor stage.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 37 
 1 file changed, 37 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c550c41..6b1b866 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1740,34 +1740,6 @@ fs_visitor::compact_virtual_grfs()
   }
}
 
-   /* In addition to registers used in instructions, fs_visitor keeps
-* direct references to certain special values which must be patched:
-*/
-   struct {
-  fs_reg *reg;
-  unsigned count;
-   } special[] = {
-  { &frag_depth, 1 },
-  { &pixel_x, 1 },
-  { &pixel_y, 1 },
-  { &pixel_w, 1 },
-  { &wpos_w, 1 },
-  { &dual_src_output, 1 },
-  { outputs, ARRAY_SIZE(outputs) },
-  { delta_x, ARRAY_SIZE(delta_x) },
-  { delta_y, ARRAY_SIZE(delta_y) },
-  { &sample_mask, 1 },
-  { &shader_start_time, 1 },
-   };
-
-   /* Treat all special values as used, to be conservative */
-   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
-  for (unsigned j = 0; j < special[i].count; j++) {
- if (special[i].reg[j].file == GRF)
-remap_table[special[i].reg[j].reg] = 0;
-  }
-   }
-
/* Compact the GRF arrays. */
int new_index = 0;
for (int i = 0; i < this->virtual_grf_count; i++) {
@@ -1793,15 +1765,6 @@ fs_visitor::compact_virtual_grfs()
 inst->src[i].reg = remap_table[inst->src[i].reg];
   }
}
-
-   /* Patch all the references to special values */
-   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
-  for (unsigned j = 0; j < special[i].count; j++) {
- fs_reg *reg = &special[i].reg[j];
- if (reg->file == GRF && remap_table[reg->reg] != -1)
-reg->reg = remap_table[reg->reg];
-  }
-   }
 }
 
 /*
-- 
1.9.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965: Skip recalculating URB allocations if the entry size didn't change.

2014-05-06 Thread Eric Anholt
We only get here if the VS/GS compiled programs change, but we can even
skip it if the VS/GS size didn't change.

Affects cairo runtime on glamor by -1.26471% +/- 0.674335% (n=234)
---
 src/mesa/drivers/dri/i965/brw_context.c |  2 +-
 src/mesa/drivers/dri/i965/brw_context.h |  5 +++--
 src/mesa/drivers/dri/i965/gen6_urb.c|  4 ++--
 src/mesa/drivers/dri/i965/gen7_urb.c| 13 +
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index 17ae685..2a4620b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -745,7 +745,7 @@ brwCreateContext(gl_api api,
brw->max_gtt_map_object_size = gtt_size / 4;
 
if (brw->gen == 6)
-  brw->urb.gen6_gs_previously_active = false;
+  brw->urb.gs_present = false;
 
brw->prim_restart.in_progress = false;
brw->prim_restart.enable_cut_index = false;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 92e1592..b450777 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1183,6 +1183,7 @@ struct brw_context
 */
struct {
   GLuint vsize;/* vertex size plus header in urb registers */
+  GLuint gsize;/* GS output size in urb registers */
   GLuint csize;/* constant buffer size in urb registers */
   GLuint sfsize;   /* setup data size in urb registers */
 
@@ -1205,10 +1206,10 @@ struct brw_context
   GLuint cs_start;
   GLuint size; /* Hardware URB size, in KB. */
 
-  /* gen6: True if the most recently sent _3DSTATE_URB message allocated
+  /* True if the most recently sent _3DSTATE_URB message allocated
* URB space for the GS.
*/
-  bool gen6_gs_previously_active;
+  bool gs_present;
} urb;
 
 
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c 
b/src/mesa/drivers/dri/i965/gen6_urb.c
index b694f5d..9197bcf 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -109,9 +109,9 @@ gen6_upload_urb( struct brw_context *brw )
 * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
 * a workaround.
 */
-   if (brw->urb.gen6_gs_previously_active && !brw->ff_gs.prog_active)
+   if (brw->urb.gs_present && !brw->ff_gs.prog_active)
   intel_batchbuffer_emit_mi_flush(brw);
-   brw->urb.gen6_gs_previously_active = brw->ff_gs.prog_active;
+   brw->urb.gs_present = brw->ff_gs.prog_active;
 }
 
 const struct brw_tracked_state gen6_urb = {
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c 
b/src/mesa/drivers/dri/i965/gen7_urb.c
index 2653e9c..3c21e1e 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -150,6 +150,19 @@ gen7_upload_urb(struct brw_context *brw)
unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
unsigned gs_entry_size_bytes = gs_size * 64;
 
+   /* If we're just switching between programs with the same URB requirements,
+* skip the rest of the logic.
+*/
+   if (!(brw->state.dirty.brw & BRW_NEW_CONTEXT) &&
+   brw->urb.vsize == vs_size &&
+   brw->urb.gs_present == gs_present &&
+   brw->urb.gsize == gs_size) {
+  return;
+   }
+   brw->urb.vsize = vs_size;
+   brw->urb.gs_present = gs_present;
+   brw->urb.gsize = gs_size;
+
/* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
 *
 * VS Number of URB Entries must be divisible by 8 if the VS URB Entry
-- 
1.9.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nv50/ir/gk110: fix set with f32 dest

2014-05-06 Thread Ilia Mirkin
Should fix SGE/SSG instructions, which were previously getting integer
0/-1 values.

Signed-off-by: Ilia Mirkin 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 5992c54..b8d0d3e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -915,6 +915,9 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i)
  modNegAbsF32_3b(i, 1);
   }
   FTZ_(3a);
+
+  if (i->dType == TYPE_F32)
+ code[1] |= 1 << 23;
}
if (i->sType == TYPE_S32)
   code[1] |= 1 << 19;
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965: Enable GL_ARB_texture_view on Broadwell.

2014-05-06 Thread Kenneth Graunke
This is a port of commit c9c08867ed07ceb10b67ffac5f0a33812710a5e8.
A tiny bit of extra work was necessary to not break stencil texturing.

Cc: "10.2" 
Signed-off-by: Kenneth Graunke 
---
 src/mesa/drivers/dri/i965/gen8_surface_state.c | 28 ++
 src/mesa/drivers/dri/i965/intel_extensions.c   |  5 +
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 564d275..d52b32e 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -134,17 +134,20 @@ gen8_update_texture_surface(struct gl_context *ctx,
struct intel_mipmap_tree *mt = intelObj->mt;
struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
+   mesa_format format = intelObj->_Format;
 
if (tObj->Target == GL_TEXTURE_BUFFER) {
   brw_update_buffer_texture_surface(ctx, unit, surf_offset);
   return;
}
 
-   if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL)
+   if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) {
   mt = mt->stencil_mt;
+  format = MESA_FORMAT_S_UINT8;
+   }
 
unsigned tiling_mode, pitch;
-   if (mt->format == MESA_FORMAT_S_UINT8) {
+   if (format == MESA_FORMAT_S_UINT8) {
   tiling_mode = GEN8_SURFACE_TILING_W;
   pitch = 2 * mt->pitch;
} else {
@@ -152,9 +155,14 @@ gen8_update_texture_surface(struct gl_context *ctx,
   pitch = mt->pitch;
}
 
-   uint32_t tex_format = translate_tex_format(brw,
-  mt->format,
-  sampler->sRGBDecode);
+   /* If this is a view with restricted NumLayers, then our effective depth
+* is not just the miptree depth.
+*/
+   uint32_t effective_depth =
+  (tObj->Immutable && tObj->Target != GL_TEXTURE_3D) ? tObj->NumLayers
+ : mt->logical_depth0;
+
+   uint32_t tex_format = translate_tex_format(brw, format, 
sampler->sRGBDecode);
 
uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
 13 * 4, 64, surf_offset);
@@ -178,11 +186,15 @@ gen8_update_texture_surface(struct gl_context *ctx,
surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) |
  SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT);
 
-   surf[3] = SET_FIELD(mt->logical_depth0 - 1, BRW_SURFACE_DEPTH) | (pitch - 
1);
+   surf[3] = SET_FIELD(effective_depth - 1, BRW_SURFACE_DEPTH) | (pitch - 1);
 
-   surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout);
+   surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout) |
+ SET_FIELD(tObj->MinLayer, GEN7_SURFACE_MIN_ARRAY_ELEMENT) |
+ SET_FIELD(effective_depth - 1,
+   GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT);
 
-   surf[5] = SET_FIELD(tObj->BaseLevel - mt->first_level, 
GEN7_SURFACE_MIN_LOD) |
+   surf[5] = SET_FIELD(tObj->MinLevel + tObj->BaseLevel - mt->first_level,
+   GEN7_SURFACE_MIN_LOD) |
  (intelObj->_MaxLevel - tObj->BaseLevel); /* mip count */
 
surf[6] = 0;
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c 
b/src/mesa/drivers/dri/i965/intel_extensions.c
index ade86a5..c6c76c2 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -284,6 +284,7 @@ intelInitExtensions(struct gl_context *ctx)
 
if (brw->gen >= 7) {
   ctx->Extensions.ARB_conservative_depth = true;
+  ctx->Extensions.ARB_texture_view = true;
   ctx->Extensions.AMD_vertex_shader_layer = true;
   if (can_do_pipelined_register_writes(brw)) {
  ctx->Extensions.ARB_transform_feedback2 = true;
@@ -302,10 +303,6 @@ intelInitExtensions(struct gl_context *ctx)
  ctx->Extensions.ARB_compute_shader = true;
}
 
-   if (brw->gen == 7) {
-  ctx->Extensions.ARB_texture_view = true;
-   }
-
if (brw->gen >= 8) {
   ctx->Extensions.ARB_stencil_texturing = true;
}
-- 
1.9.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [Mesa-stable] [PATCH] i965: Enable GL_ARB_texture_view on Broadwell.

2014-05-06 Thread Chris Forbes
This patch is:

Reviewed-by: Chris Forbes 

Spotted some other bugs (in using a view as a depth/stencil
attachment) while reading around this, which I'll take care of.

On Wed, May 7, 2014 at 12:03 PM, Kenneth Graunke  wrote:
> This is a port of commit c9c08867ed07ceb10b67ffac5f0a33812710a5e8.
> A tiny bit of extra work was necessary to not break stencil texturing.
>
> Cc: "10.2" 
> Signed-off-by: Kenneth Graunke 
> ---
>  src/mesa/drivers/dri/i965/gen8_surface_state.c | 28 
> ++
>  src/mesa/drivers/dri/i965/intel_extensions.c   |  5 +
>  2 files changed, 21 insertions(+), 12 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
> b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> index 564d275..d52b32e 100644
> --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> @@ -134,17 +134,20 @@ gen8_update_texture_surface(struct gl_context *ctx,
> struct intel_mipmap_tree *mt = intelObj->mt;
> struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
> struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
> +   mesa_format format = intelObj->_Format;
>
> if (tObj->Target == GL_TEXTURE_BUFFER) {
>brw_update_buffer_texture_surface(ctx, unit, surf_offset);
>return;
> }
>
> -   if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL)
> +   if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) 
> {
>mt = mt->stencil_mt;
> +  format = MESA_FORMAT_S_UINT8;
> +   }
>
> unsigned tiling_mode, pitch;
> -   if (mt->format == MESA_FORMAT_S_UINT8) {
> +   if (format == MESA_FORMAT_S_UINT8) {
>tiling_mode = GEN8_SURFACE_TILING_W;
>pitch = 2 * mt->pitch;
> } else {
> @@ -152,9 +155,14 @@ gen8_update_texture_surface(struct gl_context *ctx,
>pitch = mt->pitch;
> }
>
> -   uint32_t tex_format = translate_tex_format(brw,
> -  mt->format,
> -  sampler->sRGBDecode);
> +   /* If this is a view with restricted NumLayers, then our effective depth
> +* is not just the miptree depth.
> +*/
> +   uint32_t effective_depth =
> +  (tObj->Immutable && tObj->Target != GL_TEXTURE_3D) ? tObj->NumLayers
> + : 
> mt->logical_depth0;
> +
> +   uint32_t tex_format = translate_tex_format(brw, format, 
> sampler->sRGBDecode);
>
> uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
>  13 * 4, 64, surf_offset);
> @@ -178,11 +186,15 @@ gen8_update_texture_surface(struct gl_context *ctx,
> surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) |
>   SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT);
>
> -   surf[3] = SET_FIELD(mt->logical_depth0 - 1, BRW_SURFACE_DEPTH) | (pitch - 
> 1);
> +   surf[3] = SET_FIELD(effective_depth - 1, BRW_SURFACE_DEPTH) | (pitch - 1);
>
> -   surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout);
> +   surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout) |
> + SET_FIELD(tObj->MinLayer, GEN7_SURFACE_MIN_ARRAY_ELEMENT) |
> + SET_FIELD(effective_depth - 1,
> +   GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT);
>
> -   surf[5] = SET_FIELD(tObj->BaseLevel - mt->first_level, 
> GEN7_SURFACE_MIN_LOD) |
> +   surf[5] = SET_FIELD(tObj->MinLevel + tObj->BaseLevel - mt->first_level,
> +   GEN7_SURFACE_MIN_LOD) |
>   (intelObj->_MaxLevel - tObj->BaseLevel); /* mip count */
>
> surf[6] = 0;
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c 
> b/src/mesa/drivers/dri/i965/intel_extensions.c
> index ade86a5..c6c76c2 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -284,6 +284,7 @@ intelInitExtensions(struct gl_context *ctx)
>
> if (brw->gen >= 7) {
>ctx->Extensions.ARB_conservative_depth = true;
> +  ctx->Extensions.ARB_texture_view = true;
>ctx->Extensions.AMD_vertex_shader_layer = true;
>if (can_do_pipelined_register_writes(brw)) {
>   ctx->Extensions.ARB_transform_feedback2 = true;
> @@ -302,10 +303,6 @@ intelInitExtensions(struct gl_context *ctx)
>   ctx->Extensions.ARB_compute_shader = true;
> }
>
> -   if (brw->gen == 7) {
> -  ctx->Extensions.ARB_texture_view = true;
> -   }
> -
> if (brw->gen >= 8) {
>ctx->Extensions.ARB_stencil_texturing = true;
> }
> --
> 1.9.1
>
> ___
> mesa-stable mailing list
> mesa-sta...@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-stable
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] Mixing of hardware and software renderers

2014-05-06 Thread Patrick McMunn
I'm using some older hardware - an ATI Radeon 9200 - which can only handle
up to OpenGL 1.2. I was wondering if it's possible to use the hardware
renderer generally and have the driver hand off the handling of functions
which my video card can't handle (such as functions from a higher OpenGL
version) to the software renderer and then the software renderer hand control
back to the hardware renderer once it's finished. If this isn't currently
possible, is this perhaps a feature which might appear in the future?
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [Mesa-users] Problem with ARB_copy_buffer on Mesa 9.2.4

2014-05-06 Thread Jonas Kulla
Hello Benjamin,

thank you very much for testing this. If this issue is fixed in latest
Mesa, then I don't
have to issue a bug report after all. It's also not critical for me to have
this issue fixed
on my distribution as I can easily work around it; I just wanted to make
sure that the
fix for this is available in a higher Mesa version so I'll eventually get
it via upgrading.

Thanks a lot,
Jonas


2014-05-06 20:01 GMT+02:00 Benjamin Bellec :

>  Hello Jonas,
>
> I tested your program on my system (Fedora 19) with an Evergreen (Radeon
> HD 5850) and I have the same issue indeed.
> Here is my result :
>
> Mesa 9.2.4 (from F19 repo) => Data does NOT match up!
> Mesa 9.2.5 => Data does NOT match up!
> Mesa 10.0.5 => Data does NOT match up!
> Mesa 10.1.0 => Data matches.
>
> So this is fixed in newer version.
> That said, Mesa 9.2 is not supported anymore and I really don't know if
> there will be a new Mesa 10.0.x release given the imminence of Mesa 10.2.
> If yes, I can bisect and you can open a bug.
>
> Mesa-dev, is any new 10.0.x release planned?
>
> Regards.
>
> Benjamin
>
> Le 06/05/2014 13:40, Jonas Kulla a écrit :
>
> Hello list,
>
>  after about 3 days of debugging, I was able to isolate a rather weird
> behavior in Mesa GL.
> The gist of it is the following: When I create a buffer object and
> allocate uninitialized
> memory for it (glBufferData() with nullptr as 'data'),
> then glCopyBufferSubData() data into
> it from another buffer object, then subsequently fill a part of it with
> glBufferSubData(),
> this new data isn't visible to the buffer object. In fact, it seems that
> the SubData'ed bytes
> are completely lost. Any further data uploads however work as expected. I
> will attach
> a small C test case below that demonstrates this behavior.
>
>  I realize that I am working with an old Mesa release (on Fedora 19), but
> I'm afraid of
> upgrading my system to the newest distro release as I might break my
> working environment.
> That's why I would like to kindly ask if someone could verify that this
> problem still persists
> on the newest Mesa code, in which case I would go ahead and file a bug
> report. At the
> same time, maybe someone could spot a critical mistake in my code that
> would explain
> this strange behavior I'm seeing. I think the code paths I'm hitting here
> in the driver are
> sufficiently obscure though.
>
>  I should probably mention that my card is a Mobility Radeon HD 3650 (ie.
> r600).
>
>  Here's the code sample (you can replace the GL setup code with your own):
>
>  #include <GL/glew.h>
> #include <SDL.h>
>
> #include <stdio.h>
> #include <string.h>
>
> static SDL_Window *win;
> static SDL_GLContext *ctx;
>
> void setupGL()
> {
> SDL_Init(SDL_INIT_VIDEO);
> win = SDL_CreateWindow("CopyBufferBug", SDL_WINDOWPOS_UNDEFINED,
> SDL_WINDOWPOS_UNDEFINED, 64, 64, SDL_WINDOW_OPENGL);
> ctx = SDL_GL_CreateContext(win);
> glewInit();
> }
>
> static void teardownGL()
> {
> SDL_GL_DeleteContext(ctx);
> SDL_DestroyWindow(win);
>
> SDL_Quit();
> }
>
> int main(int argc, char *argv[])
> {
> setupGL();
>
> /* These don't matter I think */
>  #define BLOCK_SIZE 128
> #define BUFFER1_SIZE BLOCK_SIZE
> #define BUFFER2_SIZE BLOCK_SIZE
> #define BUFFER1_TARGET GL_COPY_READ_BUFFER
> #define BUFFER2_TARGET GL_COPY_WRITE_BUFFER
> #define BUFFER1_USAGE GL_DYNAMIC_DRAW
> #define BUFFER2_USAGE GL_DYNAMIC_DRAW
>
> GLuint buffers[2];
> glGenBuffers(2, buffers);
>
>  /* We allocate both buffers with undefined memory */
> glBindBuffer(BUFFER1_TARGET, buffers[0]);
> glBufferData(BUFFER1_TARGET, BUFFER1_SIZE, 0, BUFFER1_USAGE);
>
>  glBindBuffer(BUFFER2_TARGET, buffers[1]);
> glBufferData(BUFFER2_TARGET, BUFFER2_SIZE, 0, BUFFER2_USAGE);
>
> /* Then copy (undefined) bytes from the first into the second
> buffer */
> /* Note: If I comment this line out, everything works */
> glCopyBufferSubData(BUFFER1_TARGET, BUFFER2_TARGET, 0, 0,
> BUFFER1_SIZE);
>
> /* Generate random string */
> FILE *rand = fopen("/dev/urandom", "r");
> char data[BLOCK_SIZE];
> fread(data, 1, sizeof(data), rand);
> fclose(rand);
>
>  /* We fill the second buffer with defined data */
> /* Note: If I execute this call twice (just copy paste the line),
> everything works */
> glBufferSubData(BUFFER2_TARGET, 0, sizeof(data), data);
>
> /* Then download it again to compare its contents against our test
> string */
> char data2[BLOCK_SIZE];
> glGetBufferSubData(BUFFER2_TARGET, 0, sizeof(data2), data2);
>
> if (memcmp(data, data2, sizeof(data)))
> printf("Data does NOT match up!\n");
> else
> printf("Data matches.\n");
>
> glDeleteBuffers(2, buffers);
>
> teardownGL();
>
> return 0;
> }
>
>  Thank you very much for 

Re: [Mesa-dev] Mixing of hardware and software renderers

2014-05-06 Thread Dave Airlie
On 6 May 2014 14:51, Patrick McMunn  wrote:
> I'm using some older hardware - an ATI Radeon 9200 - which can only handle
> up to OpenGL 1.2. I was wondering if it's possible to use the hardware
> renderer generally and have the driver hand off the handling of functions
> which my video card can't handle (such as functions from a higher OpenGL
> version) to the software renderer and then the software renderer hand control
> back to the hardware renderer once it's finished. If this isn't currently
> possible, is this perhaps a feature which might appear in the future?

It generally isn't possible and isn't worth it.

Ping-ponging between hw and sw renderers, and the fact that sw renderers
are slow, make it unlikely you could do something like this practically.

Either run a sw renderer, or get a better gpu.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/8] i965/fs: Add plumbing for communicating single program flow.

2014-05-06 Thread Matt Turner
And do blorp at the same time.
---
 src/mesa/drivers/dri/i965/brw_blorp.h   |  6 ++
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp|  2 +-
 src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp | 10 +++---
 src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h   |  3 ++-
 src/mesa/drivers/dri/i965/brw_context.h |  1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp|  6 --
 src/mesa/drivers/dri/i965/brw_fs.h  |  9 ++---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp  |  8 +---
 src/mesa/drivers/dri/i965/gen8_fs_generator.cpp | 10 ++
 9 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h 
b/src/mesa/drivers/dri/i965/brw_blorp.h
index 15a7a0b..b217451 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -202,6 +202,12 @@ struct brw_blorp_prog_data
unsigned int first_curbe_grf;
 
/**
+* True if the WM program contains control flow instructions. Used to
+* enable single program flow.
+*/
+   bool has_control_flow;
+
+   /**
 * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more
 * than one sample per pixel.
 */
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp 
b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 300ff5c..3f1a7bc 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -901,7 +901,7 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
 */
render_target_write();
 
-   return get_program(program_size, dump_file);
+   return get_program(program_size, &prog_data.has_control_flow, dump_file);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp 
b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index 38969d8..4063c63 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -37,16 +37,20 @@ brw_blorp_eu_emitter::~brw_blorp_eu_emitter()
 }
 
 const unsigned *
-brw_blorp_eu_emitter::get_program(unsigned *program_size, FILE *dump_file)
+brw_blorp_eu_emitter::get_program(unsigned *program_size,
+  bool *has_control_flow,
+  FILE *dump_file)
 {
const unsigned *res;
 
if (unlikely(INTEL_DEBUG & DEBUG_BLORP)) {
   fprintf(stderr, "Native code for BLORP blit:\n");
-  res = generator.generate_assembly(NULL, &insts, program_size, dump_file);
+  res = generator.generate_assembly(NULL, &insts, program_size,
+has_control_flow, dump_file);
   fprintf(stderr, "\n");
} else {
-  res = generator.generate_assembly(NULL, &insts, program_size);
+  res = generator.generate_assembly(NULL, &insts, program_size,
+has_control_flow);
}
 
return res;
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h 
b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
index c10695e..386ddbb 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
@@ -33,7 +33,8 @@ protected:
explicit brw_blorp_eu_emitter(struct brw_context *brw);
~brw_blorp_eu_emitter();
 
-   const unsigned *get_program(unsigned *program_size, FILE *dump_file);
+   const unsigned *get_program(unsigned *program_size, bool *has_control_flow,
+   FILE *dump_file);
 
void emit_kill_if_outside_rect(const struct brw_reg &x,
   const struct brw_reg &y,
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 82b38fc..18149b5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -395,6 +395,7 @@ struct brw_wm_prog_data {
bool dual_src_blend;
bool uses_pos_offset;
bool uses_omask;
+   bool has_control_flow;
uint32_t prog_offset_16;
 
/**
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c550c41..8b7a77f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3200,11 +3200,13 @@ brw_wm_fs_emit(struct brw_context *brw, struct 
brw_wm_compile *c,
if (brw->gen >= 8) {
   gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
   assembly = g.generate_assembly(&v.instructions, simd16_instructions,
- final_assembly_size);
+ final_assembly_size,
+ &c->prog_data.has_control_flow);
} else {
   fs_generator g(brw, c, prog, fp, v.do_dual_src);
   assembly = g.generate_assembly(&v.instructions, simd16_instructions,
- final_assembly_size);
+ final_assembly_size,
+ &c->prog_data.has_control_flow);
}
 
if 

[Mesa-dev] i965: Single program flow for shaders with no control flow.

2014-05-06 Thread Matt Turner
The docs say that flipping this bit on for shaders that don't do SIMD
branching (i.e., non-uniform control flow) will save us some power.

An easy first step is turning this on when we don't see control flow.

In the future with more infrastructure in place, we can determine if
all branching conditions are uniformly constant and turn on SPF.

Hopefully this saves some power and extends battery life, but I'm not
sure how to accurately quantify this, short of printing i915_energy_uJ
before and after some workload. Even then I don't have any expectation
for how much energy the GPU would use for, say, a piglit run. Is
200 ~ 300 Joules reasonable (over 220 seconds)?

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/8] i965/fs: Set has_control_flow to true on an IF or WHILE instruction.

2014-05-06 Thread Matt Turner
All of the other control flow instructions are dependent on the
existence of an IF or WHILE instruction.
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp  | 2 ++
 src/mesa/drivers/dri/i965/gen8_fs_generator.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index ae89a50..651b708 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1606,6 +1606,7 @@ fs_generator::generate_code(exec_list *instructions, bool 
*has_control_flow,
  break;
 
   case BRW_OPCODE_IF:
+ *has_control_flow = true;
 if (inst->src[0].file != BAD_FILE) {
/* The instruction has an embedded compare (only allowed on gen6) */
assert(brw->gen == 6);
@@ -1640,6 +1641,7 @@ fs_generator::generate_code(exec_list *instructions, bool 
*has_control_flow,
 break;
 
   case BRW_OPCODE_WHILE:
+ *has_control_flow = true;
 brw_WHILE(p);
 break;
 
diff --git a/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp
index 7009c6b..086c84c 100644
--- a/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp
@@ -1057,6 +1057,7 @@ gen8_fs_generator::generate_code(exec_list *instructions,
  break;
 
   case BRW_OPCODE_IF:
+ *has_control_flow = true;
  IF(BRW_PREDICATE_NORMAL);
  break;
 
@@ -1081,6 +1082,7 @@ gen8_fs_generator::generate_code(exec_list *instructions,
  break;
 
   case BRW_OPCODE_WHILE:
+ *has_control_flow = true;
  WHILE();
  break;
 
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Matt Turner
---
 src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++-
 src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++-
 src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c 
b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 0af87d1..bdfb9b5 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw)
OUT_BATCH(floating_point_mode |
 ((ALIGN(stage_state->sampler_count, 4)/4) << 
GEN6_VS_SAMPLER_COUNT_SHIFT) |
  ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
-  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+ (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
0));
 
if (brw->vs.prog_data->base.total_scratch) {
   OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c 
b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index b5fc871..f9c9abc 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -97,7 +97,8 @@ upload_vs_state(struct brw_context *brw)
 ((ALIGN(stage_state->sampler_count, 4)/4) <<
   GEN6_VS_SAMPLER_COUNT_SHIFT) |
  ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
-  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+ (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
0));
 
if (brw->vs.prog_data->base.total_scratch) {
   OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c 
b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index 373cfe4..a83d78b 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -85,7 +85,8 @@ upload_vs_state(struct brw_context *brw)
  ((ALIGN(stage_state->sampler_count, 4) / 4) <<
GEN6_VS_SAMPLER_COUNT_SHIFT) |
  ((prog_data->base.binding_table.size_bytes / 4) <<
-   GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+   GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+ (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
0));
 
if (prog_data->total_scratch) {
   OUT_RELOC64(stage_state->scratch_bo,
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/8] i965/fs: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Matt Turner
---
 src/mesa/drivers/dri/i965/gen6_wm_state.c | 5 +
 src/mesa/drivers/dri/i965/gen7_wm_state.c | 5 +
 src/mesa/drivers/dri/i965/gen8_ps_state.c | 5 +
 3 files changed, 15 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c 
b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 22e0925..0c7e12b 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -151,6 +151,11 @@ upload_wm_state(struct brw_context *brw)
dw2 |= ((brw->wm.prog_data->base.binding_table.size_bytes / 4) <<
GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 
+   /* Enable single program flow mode to save power if the program doesn't
+* contain any control flow instructions.
+*/
+   dw2 |= !brw->wm.prog_data->has_control_flow ? GEN6_WM_SPF_MODE : 0;
+
dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
/* CACHE_NEW_WM_PROG */
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c 
b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 71535a5..575d321 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -171,6 +171,11 @@ upload_ps_state(struct brw_context *brw)
if (ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] == NULL)
   dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
 
+   /* Enable single program flow mode to save power if the program doesn't
+* contain any control flow instructions.
+*/
+   dw2 |= !brw->wm.prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0;
+
/* Haswell requires the sample mask to be set in this packet as well as
 * in 3DSTATE_SAMPLE_MASK; the values should match. */
/* _NEW_BUFFERS, _NEW_MULTISAMPLE */
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c 
b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 7d8f954..63883f8 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -145,6 +145,11 @@ upload_ps_state(struct brw_context *brw)
/* CACHE_NEW_WM_PROG */
gen8_upload_constant_state(brw, &brw->wm.base, true, _3DSTATE_CONSTANT_PS);
 
+   /* Enable single program flow mode to save power if the program doesn't
+* contain any control flow instructions.
+*/
+   dw3 |= !brw->wm.prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0;
+
/* Initialize the execution mask with VMask.  Otherwise, derivatives are
 * incorrect for subspans where some of the pixels are unlit.  We believe
 * the bit just didn't take effect in previous generations.
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/8] i965/blorp: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Matt Turner
---
 src/mesa/drivers/dri/i965/gen6_blorp.cpp | 1 +
 src/mesa/drivers/dri/i965/gen7_blorp.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp 
b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index 4222fa8..5d7be60 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -695,6 +695,7 @@ gen6_blorp_emit_wm_config(struct brw_context *brw,
dw6 |= 0 << GEN6_WM_NUM_SF_OUTPUTS_SHIFT; /* No inputs from SF */
if (params->use_wm_prog) {
   dw2 |= 1 << GEN6_WM_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */
+  dw2 |= !prog_data->has_control_flow ? GEN6_WM_SPF_MODE : 0;
   dw4 |= prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
   dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
   dw5 |= GEN6_WM_KILL_ENABLE; /* TODO: temporarily smash on */
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp 
b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 4bf9396..3dbe174 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -583,6 +583,7 @@ gen7_blorp_emit_ps_config(struct brw_context *brw,
   dw4 |= SET_FIELD(1, HSW_PS_SAMPLE_MASK); /* 1 sample for now */
if (params->use_wm_prog) {
   dw2 |= 1 << GEN7_PS_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */
+  dw2 |= !prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0;
   dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
   dw5 |= prog_data->first_curbe_grf << GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
}
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/8] i965/vec4: Set has_control_flow to true on an IF or WHILE instruction.

2014-05-06 Thread Matt Turner
All of the other control flow instructions are dependent on the
existence of an IF or WHILE instruction.
---
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp  | 3 +++
 src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index ba8d26d..89656d1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1318,6 +1318,9 @@ vec4_generator::generate_code(exec_list *instructions, 
bool *has_control_flow)
 
   unsigned pre_emit_nr_insn = p->nr_insn;
 
+  if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_WHILE)
+ *has_control_flow = true;
+
   generate_vec4_instruction(inst, dst, src);
 
   if (inst->no_dd_clear || inst->no_dd_check) {
diff --git a/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp
index 222e81a..42d025e 100644
--- a/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp
@@ -850,6 +850,9 @@ gen8_vec4_generator::generate_code(exec_list *instructions,
 
   const unsigned pre_emit_nr_inst = nr_inst;
 
+  if (ir->opcode == BRW_OPCODE_IF || ir->opcode == BRW_OPCODE_WHILE)
+ *has_control_flow = true;
+
   generate_vec4_instruction(ir, dst, src);
 
   if (ir->no_dd_clear || ir->no_dd_check) {
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/8] i965/vec4: Add plumbing for communicating single program flow.

2014-05-06 Thread Matt Turner
---
 src/mesa/drivers/dri/i965/brw_context.h   |  2 ++
 src/mesa/drivers/dri/i965/brw_vec4.cpp|  6 --
 src/mesa/drivers/dri/i965/brw_vec4.h  | 10 ++
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp  |  7 ---
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp |  6 --
 src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp |  8 +---
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 18149b5..08760de 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -598,6 +598,8 @@ struct brw_vec4_prog_data {
 * is the size of the URB entry used for output.
 */
GLuint urb_entry_size;
+
+   bool has_control_flow;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index daff364..9e68ebc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1831,11 +1831,13 @@ brw_vs_emit(struct brw_context *brw,
if (brw->gen >= 8) {
   gen8_vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
 mem_ctx, INTEL_DEBUG & DEBUG_VS);
-  assembly = g.generate_assembly(&v.instructions, final_assembly_size);
+  assembly = g.generate_assembly(&v.instructions, final_assembly_size,
+ &prog_data->base.has_control_flow);
} else {
   vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
mem_ctx, INTEL_DEBUG & DEBUG_VS);
-  assembly = g.generate_assembly(&v.instructions, final_assembly_size);
+  assembly = g.generate_assembly(&v.instructions, final_assembly_size,
+ &prog_data->base.has_control_flow);
}
 
if (unlikely(brw->perf_debug) && shader) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h
index ebe707f..e895659 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -647,10 +647,11 @@ public:
   bool debug_flag);
~vec4_generator();
 
-   const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size);
+   const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size,
+ bool *has_control_flow);
 
 private:
-   void generate_code(exec_list *instructions);
+   void generate_code(exec_list *instructions, bool *has_control_flow);
void generate_vec4_instruction(vec4_instruction *inst,
   struct brw_reg dst,
   struct brw_reg *src);
@@ -748,10 +749,11 @@ public:
bool debug_flag);
~gen8_vec4_generator();
 
-   const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size);
+   const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size,
+ bool *has_control_flow);
 
 private:
-   void generate_code(exec_list *instructions);
+   void generate_code(exec_list *instructions, bool *has_control_flow);
void generate_vec4_instruction(vec4_instruction *inst,
   struct brw_reg dst,
   struct brw_reg *src);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index bcacde9..ba8d26d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1260,7 +1260,7 @@ 
vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
 }
 
 void
-vec4_generator::generate_code(exec_list *instructions)
+vec4_generator::generate_code(exec_list *instructions, bool *has_control_flow)
 {
int last_native_insn_offset = 0;
const char *last_annotation_string = NULL;
@@ -1359,10 +1359,11 @@ vec4_generator::generate_code(exec_list *instructions)
 
 const unsigned *
 vec4_generator::generate_assembly(exec_list *instructions,
-  unsigned *assembly_size)
+  unsigned *assembly_size,
+  bool *has_control_flow)
 {
brw_set_access_mode(p, BRW_ALIGN_16);
-   generate_code(instructions);
+   generate_code(instructions, has_control_flow);
return brw_get_program(p, assembly_size);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 1321a94..428ed60 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -570,11 +570,13 @@ generate_assembly(struct brw_context *brw,
if (brw->gen >= 8) {
   gen8_vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
 INTEL_DEBUG & DEBUG_GS);
-  return g.generate_assembly(instructions, final

[Mesa-dev] [PATCH 8/8] i965/gs: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Matt Turner
---
 src/mesa/drivers/dri/i965/gen7_gs_state.c | 4 +++-
 src/mesa/drivers/dri/i965/gen8_gs_state.c | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c 
b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index d18ae15..d7ba4a0 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -96,7 +96,9 @@ upload_gs_state(struct brw_context *brw)
   OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
  GEN6_GS_SAMPLER_COUNT_SHIFT) |
 ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) <<
- GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+ GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+(!brw->gs.prog_data->base.has_control_flow ? GEN6_GS_SPF_MODE 
: 0));
+
 
   if (brw->gs.prog_data->base.total_scratch) {
  OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c 
b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 97fbf84..e5260db 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -61,7 +61,8 @@ gen8_upload_gs_state(struct brw_context *brw)
 ((ALIGN(stage_state->sampler_count, 4)/4) <<
  GEN6_GS_SAMPLER_COUNT_SHIFT) |
 ((prog_data->base.binding_table.size_bytes / 4) <<
- GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+ GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+(!brw->gs.prog_data->base.has_control_flow ? GEN6_GS_SPF_MODE 
: 0));
 
   if (brw->gs.prog_data->base.total_scratch) {
  OUT_RELOC64(stage_state->scratch_bo,
-- 
1.8.3.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Chia-I Wu
On Wed, May 7, 2014 at 9:38 AM, Matt Turner  wrote:
> ---
>  src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++-
>  src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++-
>  src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++-
>  3 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c 
> b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> index 0af87d1..bdfb9b5 100644
> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> @@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw)
> OUT_BATCH(floating_point_mode |
>  ((ALIGN(stage_state->sampler_count, 4)/4) << 
> GEN6_VS_SAMPLER_COUNT_SHIFT) |
>   ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
> -  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> +  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
> + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
> 0));
The doc says bit 31 is "Single Vertex Dispatch".  When this bit is
set, I vaguely remember VS_INVOCATION_COUNT does get doubled for the
same workload, and the performance is hurt.

>
> if (brw->vs.prog_data->base.total_scratch) {
>OUT_RELOC(stage_state->scratch_bo,
> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c 
> b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> index b5fc871..f9c9abc 100644
> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> @@ -97,7 +97,8 @@ upload_vs_state(struct brw_context *brw)
>  ((ALIGN(stage_state->sampler_count, 4)/4) <<
>GEN6_VS_SAMPLER_COUNT_SHIFT) |
>   ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
> -  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> +  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
> + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
> 0));
>
> if (brw->vs.prog_data->base.total_scratch) {
>OUT_RELOC(stage_state->scratch_bo,
> diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c 
> b/src/mesa/drivers/dri/i965/gen8_vs_state.c
> index 373cfe4..a83d78b 100644
> --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
> @@ -85,7 +85,8 @@ upload_vs_state(struct brw_context *brw)
>   ((ALIGN(stage_state->sampler_count, 4) / 4) <<
> GEN6_VS_SAMPLER_COUNT_SHIFT) |
>   ((prog_data->base.binding_table.size_bytes / 4) <<
> -   GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> +   GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
> + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 
> 0));
>
> if (prog_data->total_scratch) {
>OUT_RELOC64(stage_state->scratch_bo,
> --
> 1.8.3.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
o...@lunarg.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.

2014-05-06 Thread Matt Turner
On Tue, May 6, 2014 at 7:14 PM, Chia-I Wu  wrote:
> On Wed, May 7, 2014 at 9:38 AM, Matt Turner  wrote:
>> ---
>>  src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++-
>>  src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++-
>>  src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++-
>>  3 files changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c 
>> b/src/mesa/drivers/dri/i965/gen6_vs_state.c
>> index 0af87d1..bdfb9b5 100644
>> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
>> @@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw)
>> OUT_BATCH(floating_point_mode |
>>  ((ALIGN(stage_state->sampler_count, 4)/4) << 
>> GEN6_VS_SAMPLER_COUNT_SHIFT) |
>>   ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
>> -  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
>> +  GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
>> + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE 
>> : 0));
> The doc says bit 31 is "Single Vertex Dispatch".  When this bit is
> set, I vaguely remember VS_INVOCATION_COUNT does get doubled for the
> same workload, and the performance is hurt.

Yeah. Chris Forbes mentioned this to me too. I'll drop the last four patches.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 20/21] mesa: add support for threaded glCompileShader

2014-05-06 Thread Chia-I Wu
On Mon, May 5, 2014 at 5:37 AM, Chia-I Wu  wrote:
> On Sat, May 3, 2014 at 1:59 AM, Ian Romanick  wrote:
>> On 04/22/2014 01:58 AM, Chia-I Wu wrote:
>>> From: Chia-I Wu 
>>>
>>> Threaded glCompileShader can be enabled for a context by calling
>>> _mesa_enable_glsl_threadpool.  It will initialize the singleton GLSL thread
>>> pool and defer glCompileShader calls to the thread pool.
>>>
>>> For applications to benefit from threaded glCompileShader, they have to
>>> compile shaders in this fashion
>>>
>>>  for (i = 0; i < num_shaders; i++)
>>>glCompileShader(shaders[i]);
>>>  for (i = 0; i < num_shaders; i++)
>>>glGetShaderiv(shaders[i], GL_COMPILE_STATUS, &val);
>>
>> I think when you try this series on some real applications, you will be
>> disappointed.  Eric had a pretty similar branch
>> (http://cgit.freedesktop.org/~anholt/mesa/log/?h=compiler-threads), but
>> it saw no benefit on any applications... because everybody does
>>
>> for (i = 0; i < num_shaders; i++) {
>> glCompileShader(shaders[i]);
>> glGetShaderiv(shaders[i], GL_COMPILE_STATUS, &val);
>> }
>>
>> or
>>
>> for (i = 0; i < num_shaders; i++) {
>> glCompileShader(shaders[i]);
>> glAttachShader(prog, shaders[i]);
>> }
>>
>> glLinkProgram(prog);
> Yeah, I am aware of the situation with real-world applications.  Only
> applications that are modified to not immediately check compilation results
> will get the speed up in compile times.  That is why this feature needs to be
> enabled through drirc.  We, at LunarG, are working with major game engine
> vendors to ensure this performance benefit is realized.
>
>> I'm also curious about your test case... did you link the shaders?  As
>> far as I'm aware, the bulk of time spent in the compiler happens during
>> linking (final optimizations and register allocation).  Eric's data
>> (http://lists.freedesktop.org/archives/mesa-dev/2014-April/057494.html)
>> says we spend more than 2x time in linking than in compiling.
> No, I did not.  In my other experiment with Unigine Tropics, the
> distribution of time was more like
>
> glCompileShader: 50%
> glLinkProgram FE: 25%
> glLinkProgram BE: 25%
I've rerun the test (source attached).  The numbers from compiling and
linking Unigine Tropics shaders are

  _mesa_CompileShader: 54.8%
  link_shaders: 17.1%
  brw_link_shaders: 27.9%

The numbers from running on another set of shaders (which took about 100 seconds) are

  _mesa_CompileShader: 50.4%
  link_shaders: 5.6%
  brw_link_shaders: 43.8%


-- 
o...@lunarg.com
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/*
 * Number of entries in progs[]; compile_all_shaders() also uses it as the
 * upper bound of the shader ids (0 .. MAX_PROGRAMS - 1) it tries to load.
 */
#define MAX_PROGRAMS 4100
/*
 * Per-program timing results, indexed by the same id that names the
 * "<id>.vert" / "<id>.frag" source files.
 */
struct {
	/* Program object, stored by compile_all_shaders() after glLinkProgram(). */
	GLuint prog;
	/*
	 * Elapsed time measured with gettimeofday() around glLinkProgram().
	 * NOTE(review): compile_all_shaders() scales the seconds delta by 100
	 * before adding the microseconds delta -- confirm the intended unit.
	 */
	long long prog_time;
	/*
	 * Elapsed time measured around glCompileShader(): index 0 is the vertex
	 * shader, index 1 the fragment shader.  Same unit caveat as prog_time.
	 */
	long long shader_times[2];
} progs[MAX_PROGRAMS];

/*
 * Create a shader object of the given type and load its source from disk.
 *
 * The source is read from "<id>.vert" for GL_VERTEX_SHADER or "<id>.frag"
 * for GL_FRAGMENT_SHADER, relative to the current working directory.  The
 * shader is created and given its source, but not compiled.
 *
 * Returns the shader object name, or 0 when the type is not one of the two
 * above, the file is missing, empty or unreadable, memory cannot be
 * allocated, or glCreateShader() itself fails.
 */
static GLuint create_shader(int id, GLenum type)
{
	char filename[32];
	const char *src;
	long file_size;
	size_t size;
	FILE *fp;
	char *buf = NULL;
	GLuint sh = 0;

	switch (type) {
	case GL_VERTEX_SHADER:
		snprintf(filename, sizeof(filename), "%d.vert", id);
		break;
	case GL_FRAGMENT_SHADER:
		snprintf(filename, sizeof(filename), "%d.frag", id);
		break;
	default:
		return 0;
	}

	fp = fopen(filename, "rb");
	if (!fp)
		return 0;

	/* Determine the file size by seeking to the end. */
	if (fseek(fp, 0, SEEK_END) != 0)
		goto out;

	/*
	 * ftell() returns -1L on error; keep the result signed so that the
	 * failure is not turned into a huge size_t, and treat an empty file
	 * the same way as an unreadable one.
	 */
	file_size = ftell(fp);
	if (file_size <= 0)
		goto out;

	if (fseek(fp, 0, SEEK_SET) != 0)
		goto out;

	size = (size_t) file_size;
	buf = malloc(size + 1);
	if (!buf)
		goto out;

	if (fread(buf, 1, size, fp) != size) {
		printf("error reading %s\n", filename);
		goto out;
	}
	buf[size] = '\0';

	sh = glCreateShader(type);
	if (sh) {
		/* glShaderSource() copies the string, so buf can be freed below. */
		src = buf;
		glShaderSource(sh, 1, &src, NULL);
	}

out:
	/* Single exit path: free(NULL) is a no-op, fp is always open here. */
	free(buf);
	fclose(fp);
	return sh;
}

static void compile_all_shaders(void)
{
	struct timeval start, end;
	GLint val;
	int i;

	for (i = 0; i < MAX_PROGRAMS; i++) {
		GLuint prog;
		int num_shaders, j;

		prog = glCreateProgram();

		num_shaders = 0;
		for (j = 0; j < 2; j++) {
			GLenum type = (j == 0) ? GL_VERTEX_SHADER : GL_FRAGMENT_SHADER;
			GLuint sh;

			sh = create_shader(i, type);
			if (!sh)
continue;

			num_shaders++;
			gettimeofday(&start, NULL);
			glCompileShader(sh);
			gettimeofday(&end, NULL);

			glGetShaderiv(sh, GL_COMPILE_STATUS, &val);
			if (!val) {
char buf[1024];
GLsizei len;
glGetShaderInfoLog(sh, sizeof(buf), &len, buf);
printf("%d.%d: %*s\n", i, j, len, buf);
			}

			progs[i].shader_times[j] = 100ll * (end.tv_sec - start.tv_sec) +
((long long) end.tv_usec - start.tv_usec);

			glAttachShader(prog, sh);
			glDeleteShader(sh);
		}

		if (num_shaders < j) {
			if (num_shaders) {
;//glProgramParameteri(prog, GL_PROGRAM_SEPARABLE, GL_TRUE);
			}
			else {
glDeleteProgram(prog);
continue;
			}
		}

		gettimeofday(&start, NULL);
		glLinkProgram(prog);
		gettimeofday(&end, NULL);

		glGetProgramiv(prog, GL_LINK_STATUS, &val);
		if (!val) {
			char buf[1024];
			GLsizei len;
			glGetProgramInfoLog(prog, sizeof(buf), &len, buf);
			printf("%d: %*s\n", i, len, buf);
		}

		progs[i].prog = prog;
		progs[i].prog_time = 100ll * (end.tv_sec - start.tv_sec) +
			((long long) end.t