[Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.

2012-12-11 Thread Dave Airlie
This adds seamless sampling for cubemap boundaries if requested.

The corner case averaging is messy but seems like it should be spec
compliant.

The face direction stuff is also a bit messy, I've no idea if that could
or should be simpler, or even if all my directions are fully correct!

Signed-off-by: Dave Airlie 
---
 src/gallium/drivers/softpipe/sp_screen.c |   2 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c | 143 +--
 2 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_screen.c 
b/src/gallium/drivers/softpipe/sp_screen.c
index 909fa1c..7ca259e 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
   return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-  return 0;
+  return 1;
case PIPE_CAP_SCALED_RESOLVE:
   return 0;
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 7558ef1..9500a03 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp,
}
 }
 
+static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = {
+   /* pos X first then neg X is Z different, Y the same */
+   /* PIPE_TEX_FACE_POS_X,*/
+   { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+   /* PIPE_TEX_FACE_NEG_X */
+   { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+
+   /* pos Y first then neg Y is X different, X the same */
+   /* PIPE_TEX_FACE_POS_Y */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z },
+
+   /* PIPE_TEX_FACE_NEG_Y */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z },
+
+   /* pos Z first then neg Y is X different, X the same */
+   /* PIPE_TEX_FACE_POS_Z */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+
+   /* PIPE_TEX_FACE_NEG_Z */
+   { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }
+};
+
+static INLINE unsigned
+get_next_face(unsigned face, int x, int y)
+{
+   int idx = 0;
+
+   if (x == 0 && y == 0)
+  return face;
+   if (x == -1) idx = 0;
+   else if (x == 1) idx = 1;
+   else if (y == -1) idx = 2;
+   else if (y == 1) idx = 3;
+
+   return face_array[face][idx];
+}
+
+static INLINE const float *
+get_texel_cube_seamless(const struct sp_sampler_variant *samp,
+union tex_tile_address addr, int x, int y,
+float *corner)
+{
+   const struct pipe_resource *texture = samp->view->texture;
+   unsigned level = addr.bits.level;
+   unsigned face = addr.bits.face;
+   int new_x, new_y;
+   int max_x, max_y;
+   int c;
+
+   max_x = (int) u_minify(texture->width0, level);
+   max_y = (int) u_minify(texture->height0, level);
+   new_x = x;
+   new_y = y;
+
+   /* the corner case */
+   if ((x < 0 || x >= max_x) &&
+   (y < 0 || y >= max_y)) {
+  const float *c1, *c2, *c3;
+  int fx = x < 0 ? 0 : max_x - 1;
+  int fy = y < 0 ? 0 : max_y - 1;
+  c1 = get_texel_2d_no_border( samp, addr, fx, fy);
+  addr.bits.face = get_next_face(face, (x < 0) ? -1 : 1, 0);
+  c2 = get_texel_2d_no_border( samp, addr, (x < 0) ? max_x - 1 : 0, fy);
+  addr.bits.face = get_next_face(face, 0, (y < 0) ? -1 : 1);
+  c3 = get_texel_2d_no_border( samp, addr, fx, (y < 0) ?  max_y - 1 : 0);
+  for (c = 0; c < TGSI_QUAD_SIZE; c++)
+ corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3;
+
+  return corner;
+   }
+   /* change the face */
+   if (x < 0) {
+  new_x = max_x - 1;
+  face = get_next_face(face, -1, 0);
+   } else if (x >= max_x) {
+  new_x = 0;
+  face = get_next_face(face, 1, 0);
+   } else if (y < 0) {
+  new_y = max_y - 1;
+  face = get_next_face(face, 0, -1);
+   } else if (y >= max_y) {
+  new_y = 0;
+  face = get_next_face(face, 0, 1);
+   }
+
+   addr.bits.face = face;
+   return get_texel_2d_no_border( samp, addr, new_x, new_y );
+}
 
 /* Gather a quad of adjacent texels within a tile:
  */
@@ -1121,6 +1215,7 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
const float *out;
int c;
+   float corner0[TGSI_QUAD_SIZE];
 
width = u_minify(texture->width0, level);
height = u_minify(texture->height0, level);
@@ -1131,10 +1226,23 @@ img_filter_cube_nearest(struct tgsi_sampler 
*tgsi_sampler,
addr.value = 0;
addr.bits.level = level;
 
-   samp->nearest_texcoord_s(s, width, &x);
-   samp->nearest_texcoord_t(t, height, &y);
+   /*
+* If NEAREST filtering is

[Mesa-dev] [Bug 58137] New: [r300g, r600g] corruption on 0 A.D. game with postproc effects enabled

2012-12-11 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=58137

  Priority: medium
Bug ID: 58137
  Assignee: mesa-dev@lists.freedesktop.org
   Summary: [r300g, r600g] corruption on 0 A.D. game with postproc
effects enabled
  Severity: normal
Classification: Unclassified
OS: Linux (All)
  Reporter: fabio@libero.it
  Hardware: x86 (IA32)
Status: NEW
   Version: git
 Component: Mesa core
   Product: Mesa

Enabling postproc effects in 0 A.D. ( postproc = true
in ~/.config/0ad/config/local.cfg ) results in this rendering corruption:
http://www.wildfiregames.com/forum/index.php?app=core&module=attach§ion=attach&attach_rel_module=post&attach_id=4929

Tested by me on RV530/r300g with current git and another user on Radeon HD
4670/r600g, see:
http://www.wildfiregames.com/forum/index.php?showtopic=16839&#entry257923

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa syncobj: don't store a pointer to the set_entry

2012-12-11 Thread Stefan Dösinger
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Am 2012-12-11 00:47, schrieb Ian Romanick:
>> [  760.187261] [drm:radeon_cs_ib_chunk] *ERROR* Invalid command
>> stream ! [  760.192898] radeon :01:00.0: 
>> evergreen_cs_track_validate_stencil:602 stencil read bo base 
>> 4148500480 not aligned with 16384 [  760.192901] radeon
>> :01:00.0: evergreen_packet3_check:2098 invalid cmd stream
>> 2440
> 
> It sounds like you should rebase Jordan's original patches out and 
> bisect.  It would be nice to know when things started going wrong.
> That usually makes the fix obvious.

It was a case of PEBKAC - I had an outdated libdrm and inadvertently
made configure accept it.
-BEGIN PGP SIGNATURE-
Version: GnuPG v2.0.19 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iQIcBAEBAgAGBQJQxyP3AAoJEN0/YqbEcdMwjLcP/RwyNoCRKFFFVM46P+AwTFNP
0bQ3paJ0IluZUoeLe1Ihpl1PlmkoD//ZUFPlNamUKK85bNpE3gdDbJ7VbUJagW/b
Xc0xYfssBbL6AGWxKO/+vvPH1T2pTEeFWB7Dw24truL0ORl7bx7Ovr7cCuX4SGnC
TOvtyaX9FSSwbBnr6eyanEahlszTa5tIbtMumX9OJT+MDlNLxkgQjE5LL2zqXRZi
7moSBUNyELVKa4i0Yb9A9Tp0GXJBoZ328z5iBP7I5LKOZek245zzVWPkFQ11eTmV
Hz5HVXvoVeDfhxY4pJSi73DjPcwEUHu3YJ8iDSjvpTe+N0Uj9Wud7by75y9WE2MI
NI3nB4G0dP05dvps9MZXJnlvbejEMKinIb6jqCJ85wT+5H75CRyKUyXrZVfD5vfb
F5UXJZEaMLt5fGV+T5NnZRMjC+PjMamovsaV5aT+pyfdZg+2U3cTCJQxEDCVkBpb
hxolLm+PyR3o/fYED/82SiWkmY1wpAPCrOdp5AiPfheq6+anm+In0jThv7xygG0u
isge3WQEKu8UxwZhFo4wPQkfHoSac1VX/wO036YvWQTnLpxTvGDRWGu225KPqVCx
gxVj0Qzi90YjaiFSlHnPje+pSrvVyRUiGBIjDc/VRtLHlmy+0hrryGBGwp04U7M2
83AMyMmi/FRzB+GsVpoJ
=sulN
-END PGP SIGNATURE-
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string

2012-12-11 Thread Marek Olšák
We already have the Mesa version in the version string, isn't that enough
to detect Mesa?
---
 src/mesa/state_tracker/st_cb_strings.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_strings.c 
b/src/mesa/state_tracker/st_cb_strings.c
index 2132379..3ce9fab 100644
--- a/src/mesa/state_tracker/st_cb_strings.c
+++ b/src/mesa/state_tracker/st_cb_strings.c
@@ -39,8 +39,6 @@
 #include "st_context.h"
 #include "st_cb_strings.h"
 
-#define ST_VERSION_STRING "0.4"
-
 static const GLubyte *
 st_get_string(struct gl_context * ctx, GLenum name)
 {
@@ -55,8 +53,7 @@ st_get_string(struct gl_context * ctx, GLenum name)
}
 
case GL_RENDERER:
-  util_snprintf(st->renderer, sizeof(st->renderer), "Gallium %s on %s", 
-   ST_VERSION_STRING,
+  util_snprintf(st->renderer, sizeof(st->renderer), "%s",
   screen->get_name( screen ));
 
   return (GLubyte *) st->renderer;
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string

2012-12-11 Thread Jose Fonseca
This will break apps that expect the current to tweak their behavior.  Changing 
it now will cause pain to app developers and ourselves, and I honestly don't 
the good of it.

For good or bad we have these strings. So I'd prefer we focused on making our 
drivers rock solid so that app developers don't feel the need to detect and 
workaround issues in our drivers.

Jose

- Original Message -
> We already have the Mesa version in the version string, isn't that
> enough
> to detect Mesa?
> ---
>  src/mesa/state_tracker/st_cb_strings.c |5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/src/mesa/state_tracker/st_cb_strings.c
> b/src/mesa/state_tracker/st_cb_strings.c
> index 2132379..3ce9fab 100644
> --- a/src/mesa/state_tracker/st_cb_strings.c
> +++ b/src/mesa/state_tracker/st_cb_strings.c
> @@ -39,8 +39,6 @@
>  #include "st_context.h"
>  #include "st_cb_strings.h"
>  
> -#define ST_VERSION_STRING "0.4"
> -
>  static const GLubyte *
>  st_get_string(struct gl_context * ctx, GLenum name)
>  {
> @@ -55,8 +53,7 @@ st_get_string(struct gl_context * ctx, GLenum name)
> }
>  
> case GL_RENDERER:
> -  util_snprintf(st->renderer, sizeof(st->renderer), "Gallium %s
> on %s",
> -   ST_VERSION_STRING,
> +  util_snprintf(st->renderer, sizeof(st->renderer), "%s",
>  screen->get_name( screen ));
>  
>return (GLubyte *) st->renderer;
> --
> 1.7.10.4
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string

2012-12-11 Thread Henri Verbeet
On 11 December 2012 13:57, Marek Olšák  wrote:
> We already have the Mesa version in the version string, isn't that enough
> to detect Mesa?
In theory, although the vendor string would IMO be the expected place for that.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.

2012-12-11 Thread Roland Scheidegger
Am 11.12.2012 10:52, schrieb Dave Airlie:
> This adds seamless sampling for cubemap boundaries if requested.
> 
> The corner case averaging is messy but seems like it should be spec
> compliant.
> 
> The face direction stuff is also a bit messy, I've no idea if that could
> or should be simpler, or even if all my directions are fully correct!
> 
> Signed-off-by: Dave Airlie 
> ---
>  src/gallium/drivers/softpipe/sp_screen.c |   2 +-
>  src/gallium/drivers/softpipe/sp_tex_sample.c | 143 
> +--
>  2 files changed, 135 insertions(+), 10 deletions(-)
> 
> diff --git a/src/gallium/drivers/softpipe/sp_screen.c 
> b/src/gallium/drivers/softpipe/sp_screen.c
> index 909fa1c..7ca259e 100644
> --- a/src/gallium/drivers/softpipe/sp_screen.c
> +++ b/src/gallium/drivers/softpipe/sp_screen.c
> @@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum 
> pipe_cap param)
>return 1;
> case PIPE_CAP_SEAMLESS_CUBE_MAP:
> case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
> -  return 0;
> +  return 1;
> case PIPE_CAP_SCALED_RESOLVE:
>return 0;
> case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
> diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
> b/src/gallium/drivers/softpipe/sp_tex_sample.c
> index 7558ef1..9500a03 100644
> --- a/src/gallium/drivers/softpipe/sp_tex_sample.c
> +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
> @@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp,
> }
>  }
>  
> +static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = {
> +   /* pos X first then neg X is Z different, Y the same */
> +   /* PIPE_TEX_FACE_POS_X,*/
> +   { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z,
> + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
> +   /* PIPE_TEX_FACE_NEG_X */
> +   { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z,
> + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
> +
> +   /* pos Y first then neg Y is X different, X the same */
> +   /* PIPE_TEX_FACE_POS_Y */
> +   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
> + PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z },
> +
> +   /* PIPE_TEX_FACE_NEG_Y */
> +   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
> + PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z },
> +
> +   /* pos Z first then neg Y is X different, X the same */
> +   /* PIPE_TEX_FACE_POS_Z */
> +   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
> + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
> +
> +   /* PIPE_TEX_FACE_NEG_Z */
> +   { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X,
> + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }
> +};
> +
> +static INLINE unsigned
> +get_next_face(unsigned face, int x, int y)
> +{
> +   int idx = 0;
> +
> +   if (x == 0 && y == 0)
> +  return face;
> +   if (x == -1) idx = 0;
> +   else if (x == 1) idx = 1;
> +   else if (y == -1) idx = 2;
> +   else if (y == 1) idx = 3;
> +
> +   return face_array[face][idx];
> +}
> +
> +static INLINE const float *
> +get_texel_cube_seamless(const struct sp_sampler_variant *samp,
> +union tex_tile_address addr, int x, int y,
> +float *corner)
> +{
> +   const struct pipe_resource *texture = samp->view->texture;
> +   unsigned level = addr.bits.level;
> +   unsigned face = addr.bits.face;
> +   int new_x, new_y;
> +   int max_x, max_y;
> +   int c;
> +
> +   max_x = (int) u_minify(texture->width0, level);
> +   max_y = (int) u_minify(texture->height0, level);
> +   new_x = x;
> +   new_y = y;
> +
> +   /* the corner case */
> +   if ((x < 0 || x >= max_x) &&
> +   (y < 0 || y >= max_y)) {
> +  const float *c1, *c2, *c3;
> +  int fx = x < 0 ? 0 : max_x - 1;
> +  int fy = y < 0 ? 0 : max_y - 1;
> +  c1 = get_texel_2d_no_border( samp, addr, fx, fy);
> +  addr.bits.face = get_next_face(face, (x < 0) ? -1 : 1, 0);
> +  c2 = get_texel_2d_no_border( samp, addr, (x < 0) ? max_x - 1 : 0, fy);
> +  addr.bits.face = get_next_face(face, 0, (y < 0) ? -1 : 1);
> +  c3 = get_texel_2d_no_border( samp, addr, fx, (y < 0) ?  max_y - 1 : 0);
> +  for (c = 0; c < TGSI_QUAD_SIZE; c++)
> + corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3;
> +
> +  return corner;
I wonder if the recommended corner handling is worth it. As far as I can
tell the spec would allow you to simply drop the special handling and it
would still be conformant (same for dx10). Though since softpipe is more
about correctness rather than performance, I guess this is ok (and the
code would very rarely get hit anyway).

> +   }
> +   /* change the face */
> +   if (x < 0) {
> +  new_x = max_x - 1;
> +  face = get_next_face(face, -1, 0);
> +   } else if (x >= max_x) {
> +  new_x = 0;
> +  face = get_next_face(face, 1, 0);
> +   } else if (y < 0) {
> +  new_y = max_y - 1;
> +  face = get_next_face(face, 0, -1);
> +   } else if (y >= max_y) {
> +  new_y = 0;
> +  face = get_next_face(face, 0, 1);
> +   }
> +
> +   addr.bits.face = face;
> +   return get_texel_2d_no

Re: [Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.

2012-12-11 Thread Brian Paul

Just a few minor things below.


On 12/11/2012 02:52 AM, Dave Airlie wrote:

This adds seamless sampling for cubemap boundaries if requested.

The corner case averaging is messy but seems like it should be spec
compliant.

The face direction stuff is also a bit messy, I've no idea if that could
or should be simpler, or even if all my directions are fully correct!

Signed-off-by: Dave Airlie
---
  src/gallium/drivers/softpipe/sp_screen.c |   2 +-
  src/gallium/drivers/softpipe/sp_tex_sample.c | 143 +--
  2 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_screen.c 
b/src/gallium/drivers/softpipe/sp_screen.c
index 909fa1c..7ca259e 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
return 1;
 case PIPE_CAP_SEAMLESS_CUBE_MAP:
 case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-  return 0;
+  return 1;
 case PIPE_CAP_SCALED_RESOLVE:
return 0;
 case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 7558ef1..9500a03 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp,
 }
  }



Maybe add a comment on this array explaining what it's for.



+static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = {
+   /* pos X first then neg X is Z different, Y the same */
+   /* PIPE_TEX_FACE_POS_X,*/
+   { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+   /* PIPE_TEX_FACE_NEG_X */
+   { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+
+   /* pos Y first then neg Y is X different, X the same */
+   /* PIPE_TEX_FACE_POS_Y */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z },
+
+   /* PIPE_TEX_FACE_NEG_Y */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z },
+
+   /* pos Z first then neg Y is X different, X the same */
+   /* PIPE_TEX_FACE_POS_Z */
+   { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+
+   /* PIPE_TEX_FACE_NEG_Z */
+   { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X,
+ PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }
+};
+
+static INLINE unsigned
+get_next_face(unsigned face, int x, int y)
+{
+   int idx = 0;
+
+   if (x == 0&&  y == 0)
+  return face;
+   if (x == -1) idx = 0;
+   else if (x == 1) idx = 1;
+   else if (y == -1) idx = 2;
+   else if (y == 1) idx = 3;


Please put the "idx = ..." assignments on separate lines.  The problem 
with "if (c) expr;" is it's a PITA if you want to set a breakpoint on 
expr.




+
+   return face_array[face][idx];
+}
+
+static INLINE const float *
+get_texel_cube_seamless(const struct sp_sampler_variant *samp,
+union tex_tile_address addr, int x, int y,
+float *corner)
+{
+   const struct pipe_resource *texture = samp->view->texture;
+   unsigned level = addr.bits.level;
+   unsigned face = addr.bits.face;
+   int new_x, new_y;
+   int max_x, max_y;
+   int c;
+
+   max_x = (int) u_minify(texture->width0, level);
+   max_y = (int) u_minify(texture->height0, level);
+   new_x = x;
+   new_y = y;
+
+   /* the corner case */
+   if ((x<  0 || x>= max_x)&&
+   (y<  0 || y>= max_y)) {
+  const float *c1, *c2, *c3;
+  int fx = x<  0 ? 0 : max_x - 1;
+  int fy = y<  0 ? 0 : max_y - 1;
+  c1 = get_texel_2d_no_border( samp, addr, fx, fy);
+  addr.bits.face = get_next_face(face, (x<  0) ? -1 : 1, 0);
+  c2 = get_texel_2d_no_border( samp, addr, (x<  0) ? max_x - 1 : 0, fy);
+  addr.bits.face = get_next_face(face, 0, (y<  0) ? -1 : 1);
+  c3 = get_texel_2d_no_border( samp, addr, fx, (y<  0) ?  max_y - 1 : 0);
+  for (c = 0; c<  TGSI_QUAD_SIZE; c++)
+ corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3;
+
+  return corner;
+   }
+   /* change the face */
+   if (x<  0) {
+  new_x = max_x - 1;
+  face = get_next_face(face, -1, 0);
+   } else if (x>= max_x) {
+  new_x = 0;
+  face = get_next_face(face, 1, 0);
+   } else if (y<  0) {
+  new_y = max_y - 1;
+  face = get_next_face(face, 0, -1);
+   } else if (y>= max_y) {
+  new_y = 0;
+  face = get_next_face(face, 0, 1);
+   }
+
+   addr.bits.face = face;
+   return get_texel_2d_no_border( samp, addr, new_x, new_y );
+}

  /* Gather a quad of adjacent texels within a tile:
   */
@@ -1121,6 +1215,7 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
 union tex_tile_address addr;
 const float *out;
 int c;
+   float corner0[TGSI_QUAD_SIZE];

 width = u_minify(texture->width0, level);
   

[Mesa-dev] [PATCH 1/2] clover: Don't erase build info of devices not being built

2012-12-11 Thread Tom Stellard
From: Tom Stellard 

Every call to _cl_program::build() was erasing the binaries and logs for
every device associated with the program.  This is incorrect because
it is possible to build a program for only a subset of devices and so
any device not being build should not have this information erased.
---
 src/gallium/state_trackers/clover/core/program.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/clover/core/program.cpp 
b/src/gallium/state_trackers/clover/core/program.cpp
index 6ca8080..5fcda23 100644
--- a/src/gallium/state_trackers/clover/core/program.cpp
+++ b/src/gallium/state_trackers/clover/core/program.cpp
@@ -42,10 +42,10 @@ _cl_program::_cl_program(clover::context &ctx,
 
 void
 _cl_program::build(const std::vector &devs) {
-   __binaries.clear();
-   __logs.clear();
 
for (auto dev : devs) {
+  __binaries.erase(dev);
+  __logs.erase(dev);
   try {
  auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ?
 compile_program_tgsi(__source) :
-- 
1.7.11.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] clover: Add support for compiler flags

2012-12-11 Thread Tom Stellard
From: Tom Stellard 

---
 src/gallium/state_trackers/clover/api/program.cpp  |  7 +++-
 .../state_trackers/clover/core/compiler.hpp| 12 +-
 src/gallium/state_trackers/clover/core/program.cpp | 12 --
 src/gallium/state_trackers/clover/core/program.hpp |  3 +-
 .../state_trackers/clover/llvm/invocation.cpp  | 49 +++---
 5 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/src/gallium/state_trackers/clover/api/program.cpp 
b/src/gallium/state_trackers/clover/api/program.cpp
index 74de840..06f96f1 100644
--- a/src/gallium/state_trackers/clover/api/program.cpp
+++ b/src/gallium/state_trackers/clover/api/program.cpp
@@ -142,15 +142,18 @@ clBuildProgram(cl_program prog, cl_uint count, const 
cl_device_id *devs,
(!pfn_notify && user_data))
   throw error(CL_INVALID_VALUE);
 
+   if (!opts)
+  opts = "";
+
if (devs) {
   if (any_of([&](const cl_device_id dev) {
return !prog->ctx.has_device(dev);
 }, devs, devs + count))
  throw error(CL_INVALID_DEVICE);
 
-  prog->build({ devs, devs + count });
+  prog->build({ devs, devs + count }, opts);
} else {
-  prog->build(prog->ctx.devs);
+  prog->build(prog->ctx.devs, opts);
}
 
return CL_SUCCESS;
diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp 
b/src/gallium/state_trackers/clover/core/compiler.hpp
index a43050a..3869507 100644
--- a/src/gallium/state_trackers/clover/core/compiler.hpp
+++ b/src/gallium/state_trackers/clover/core/compiler.hpp
@@ -44,9 +44,19 @@ namespace clover {
   compat::vector log;
};
 
+   class invalid_option_error {
+   public:
+  invalid_option_error() {
+  }
+
+  virtual ~invalid_option_error() {
+  }
+   };
+
module compile_program_llvm(const compat::string &source,
enum pipe_shader_ir ir,
-   const compat::string &target);
+   const compat::string &target,
+   const compat::string &opts);
 
module compile_program_tgsi(const compat::string &source);
 }
diff --git a/src/gallium/state_trackers/clover/core/program.cpp 
b/src/gallium/state_trackers/clover/core/program.cpp
index 5fcda23..92f1d6f 100644
--- a/src/gallium/state_trackers/clover/core/program.cpp
+++ b/src/gallium/state_trackers/clover/core/program.cpp
@@ -41,21 +41,27 @@ _cl_program::_cl_program(clover::context &ctx,
 }
 
 void
-_cl_program::build(const std::vector &devs) {
+_cl_program::build(const std::vector &devs,
+   const char *opts) {
 
for (auto dev : devs) {
   __binaries.erase(dev);
   __logs.erase(dev);
+  __opts.erase(dev);
+
+  __opts.insert({ dev, opts });
   try {
  auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ?
 compile_program_tgsi(__source) :
 compile_program_llvm(__source, dev->ir_format(),
-dev->ir_target()));
+dev->ir_target(), build_opts(dev)));
  __binaries.insert({ dev, module });
 
   } catch (build_error &e) {
  __logs.insert({ dev, e.what() });
  throw error(CL_BUILD_PROGRAM_FAILURE);
+  } catch (invalid_option_error &e) {
+ throw error(CL_INVALID_BUILD_OPTIONS);
   }
}
 }
@@ -77,7 +83,7 @@ _cl_program::build_status(clover::device *dev) const {
 
 std::string
 _cl_program::build_opts(clover::device *dev) const {
-   return {};
+   return __opts.count(dev) ? __opts.find(dev)->second : "";
 }
 
 std::string
diff --git a/src/gallium/state_trackers/clover/core/program.hpp 
b/src/gallium/state_trackers/clover/core/program.hpp
index f3858f6..0cda8ee 100644
--- a/src/gallium/state_trackers/clover/core/program.hpp
+++ b/src/gallium/state_trackers/clover/core/program.hpp
@@ -41,7 +41,7 @@ public:
const std::vector &devs,
const std::vector &binaries);
 
-   void build(const std::vector &devs);
+   void build(const std::vector &devs, const char *opts);
 
const std::string &source() const;
const std::map &binaries() const;
@@ -55,6 +55,7 @@ public:
 private:
std::map __binaries;
std::map __logs;
+   std::map __opts;
std::string __source;
 };
 
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 2b07053..c997367 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -23,6 +23,7 @@
 #include "core/compiler.hpp"
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -51,6 +52,7 @@
 #include 
 #include 
 #include 
+#include 
 
 using namespace clover;
 
@@ -98,15 +100,49 @@ namespace {
 
llvm::Module *
compile(const std::string &source, const std::string &name,
-   const std::string &triple) {
+   const std::string &triple, const std::s

Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3

2012-12-11 Thread Brian Paul

On 12/10/2012 03:28 PM, Matt Turner wrote:

The ES 3 spec says that the minumum allowable value is 2^24-1, but the
GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1.

Fixes es3conform's element_index_uint_constants test.
---
  src/mesa/main/context.c  |3 +++
  src/mesa/main/get.c  |1 +
  src/mesa/main/get_hash_params.py |3 +++
  src/mesa/main/mtypes.h   |3 +++
  4 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index fc2db12..241a1f9 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx)

 /* PrimitiveRestart */
 ctx->Const.PrimitiveRestartInSoftware = GL_FALSE;
+
+   /* ES 3.0 or ARB_ES3_compatibility */
+   ctx->Const.MaxElementIndex = UINT_MAX;
  }


diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 115d3c5..c7f8ada 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -304,6 +304,7 @@ static const int 
extra_ARB_uniform_buffer_object_and_geometry_shader[] = {


  EXTRA_EXT(ARB_ES2_compatibility);
+EXTRA_EXT(ARB_ES3_compatibility);
  EXTRA_EXT(ARB_texture_cube_map);
  EXTRA_EXT(MESA_texture_array);
  EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index d0e8a76..cb58394 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -321,6 +321,9 @@ descriptor=[

  # Enums in  OpenGL and ES 3.0
  { "apis": ["GL", "GL_CORE", "GLES3"], "params": [
+# GL_ARB_ES3_compatibility
+  [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex), 
extra_ARB_ES3_compatibility"],
+
  # GL_ARB_fragment_shader
[ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB", 
"CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents), extra_ARB_fragment_shader" ],

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index bd180a5..c9bef15 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2940,6 +2940,9 @@ struct gl_constants

 /** GL_ARB_map_buffer_alignment */
 GLuint MinMapBufferAlignment;
+
+   /** ES 3.0 or GL_ARB_ES3_compatibility */
+   GLint64 MaxElementIndex;


Since the value can only be positive (and it's a Z+ type in the spec's 
state table), I think this should be GLuint64.




  };




-Brian
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] AMDGPU: remove nonsense setPrefLoopAlignment

2012-12-11 Thread Christian König
The Align parameter is a power of two, so 16 results in 64K
alignment. Additional to that even 16 byte alignment doesn't
make any sense, so just remove it.

Signed-off-by: Christian König 
---
 lib/Target/AMDGPU/AMDILISelLowering.cpp |1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Target/AMDGPU/AMDILISelLowering.cpp 
b/lib/Target/AMDGPU/AMDILISelLowering.cpp
index 6a5d841..8bfd30c 100644
--- a/lib/Target/AMDGPU/AMDILISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDILISelLowering.cpp
@@ -217,7 +217,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
 
   setSchedulingPreference(Sched::RegPressure);
   setPow2DivIsCheap(false);
-  setPrefLoopAlignment(16);
   setSelectIsExpensive(true);
   setJumpIsExpensive(true);
 
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/6] AMDGPU: BB operand support for SI

2012-12-11 Thread Christian König
Signed-off-by: Christian König 
---
 lib/Target/AMDGPU/AMDGPUMCInstLower.cpp|   10 --
 lib/Target/AMDGPU/AMDGPUMCInstLower.h  |5 -
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp   |   10 +-
 lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp |6 ++
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp 
b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index de4053e..32275a2b 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -21,11 +21,14 @@
 #include "llvm/Constants.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-AMDGPUMCInstLower::AMDGPUMCInstLower() { }
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
+  Ctx(ctx)
+{ }
 
 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
@@ -50,13 +53,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, 
MCInst &OutMI) const {
 case MachineOperand::MO_Register:
   MCOp = MCOperand::CreateReg(MO.getReg());
   break;
+case MachineOperand::MO_MachineBasicBlock:
+  MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+   MO.getMBB()->getSymbol(), Ctx));
 }
 OutMI.addOperand(MCOp);
   }
 }
 
 void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  AMDGPUMCInstLower MCInstLowering;
+  AMDGPUMCInstLower MCInstLowering(OutContext);
 
   if (MI->isBundle()) {
 const MachineBasicBlock *MBB = MI->getParent();
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h 
b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index d7bf827..d7d538e 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -14,12 +14,15 @@
 namespace llvm {
 
 class MCInst;
+class MCContext;
 class MachineInstr;
 
 class AMDGPUMCInstLower {
 
+  MCContext &Ctx;
+
 public:
-  AMDGPUMCInstLower();
+  AMDGPUMCInstLower(MCContext &ctx);
 
   /// \brief Lower a MachineInstr to an MCInst
   void lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp 
b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 3417fbc..8f41ebb 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -47,7 +47,7 @@ public:
   virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
   virtual unsigned getNumFixupKinds() const { return 0; };
   virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-  uint64_t Value) const { assert(!"Not implemented"); }
+  uint64_t Value) const;
   virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
 const MCInstFragment *DF,
 const MCAsmLayout &Layout) const {
@@ -80,3 +80,11 @@ AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
 raw_ostream &OS) const 
{
   return new AMDGPUMCObjectWriter(OS);
 }
+
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+  unsigned DataSize, uint64_t Value) const {
+
+  uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
+  assert(Fixup.getKind() == FK_PCRel_4);
+  *Dst = (Value - 4) / 4;
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp 
b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 7f271d1..c47dc99 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1))
@@ -149,6 +150,11 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst 
&MI,
 } Imm;
 Imm.F = MO.getFPImm();
 return Imm.I;
+  } else if (MO.isExpr()) {
+const MCExpr *Expr = MO.getExpr();
+MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
+Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+return 0;
   } else{
 llvm_unreachable("Encoding of this operand type is not supported yet.");
   }
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/6] AMDGPU: enable S_*N2_* instructions

2012-12-11 Thread Christian König
They seem to work fine.

Signed-off-by: Christian König 
---
 lib/Target/AMDGPU/SIInstructions.td |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInstructions.td 
b/lib/Target/AMDGPU/SIInstructions.td
index ea8de91..008652f 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -971,10 +971,10 @@ def S_OR_B32 : SOP2_32 <0x0010, "S_OR_B32", []>;
 def S_OR_B64 : SOP2_64 <0x0011, "S_OR_B64", []>;
 def S_XOR_B32 : SOP2_32 <0x0012, "S_XOR_B32", []>;
 def S_XOR_B64 : SOP2_64 <0x0013, "S_XOR_B64", []>;
-def S_ANDN2_B32 : SOP2_ANDN2 <0x0014, "S_ANDN2_B32", []>;
-def S_ANDN2_B64 : SOP2_ANDN2 <0x0015, "S_ANDN2_B64", []>;
-def S_ORN2_B32 : SOP2_ORN2 <0x0016, "S_ORN2_B32", []>;
-def S_ORN2_B64 : SOP2_ORN2 <0x0017, "S_ORN2_B64", []>;
+def S_ANDN2_B32 : SOP2_32 <0x0014, "S_ANDN2_B32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x0015, "S_ANDN2_B64", []>;
+def S_ORN2_B32 : SOP2_32 <0x0016, "S_ORN2_B32", []>;
+def S_ORN2_B64 : SOP2_64 <0x0017, "S_ORN2_B64", []>;
 def S_NAND_B32 : SOP2_32 <0x0018, "S_NAND_B32", []>;
 def S_NAND_B64 : SOP2_64 <0x0019, "S_NAND_B64", []>;
 def S_NOR_B32 : SOP2_32 <0x001a, "S_NOR_B32", []>;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/6] AMDGPU: Add control flow optimization

2012-12-11 Thread Christian König
Branch if we have enough instructions so that it makes sense.
Also remove branches if they don't make sense.

Signed-off-by: Christian König 
---
 lib/Target/AMDGPU/SILowerControlFlow.cpp |   49 ++
 1 file changed, 49 insertions(+)

diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp 
b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 1abcb88..507cb54 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -63,9 +63,13 @@ namespace {
 class SILowerControlFlowPass : public MachineFunctionPass {
 
 private:
+  static const unsigned SkipThreshold = 12;
+
   static char ID;
   const TargetInstrInfo *TII;
 
+  void Skip(MachineInstr &MI, MachineOperand &To);
+
   void If(MachineInstr &MI);
   void Else(MachineInstr &MI);
   void Break(MachineInstr &MI);
@@ -74,6 +78,8 @@ private:
   void Loop(MachineInstr &MI);
   void EndCf(MachineInstr &MI);
 
+  void Branch(MachineInstr &MI);
+
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
 MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
@@ -94,6 +100,31 @@ FunctionPass 
*llvm::createSILowerControlFlowPass(TargetMachine &tm) {
   return new SILowerControlFlowPass(tm);
 }
 
+void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+
+  unsigned NumInstr = 0;
+
+  for (MachineBasicBlock *MBB = *From.getParent()->succ_begin();
+   NumInstr < SkipThreshold && MBB != To.getMBB() && !MBB->succ_empty();
+   MBB = *MBB->succ_begin()) {
+
+for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ NumInstr < SkipThreshold && I != E; ++I) {
+
+  if (I->isBundle() || !I->isBundled())
+++NumInstr;
+}
+  }
+
+  if (NumInstr < SkipThreshold)
+return;
+
+  DebugLoc DL = From.getDebugLoc();
+  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+  .addOperand(To)
+  .addReg(AMDGPU::EXEC);
+}
+
 void SILowerControlFlowPass::If(MachineInstr &MI) {
 
   MachineBasicBlock &MBB = *MI.getParent();
@@ -108,6 +139,8 @@ void SILowerControlFlowPass::If(MachineInstr &MI) {
   .addReg(AMDGPU::EXEC)
   .addReg(Reg);
 
+  Skip(MI, MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -125,6 +158,8 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
   .addReg(AMDGPU::EXEC)
   .addReg(Dst);
 
+  Skip(MI, MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -206,6 +241,16 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void SILowerControlFlowPass::Branch(MachineInstr &MI) {
+
+  MachineBasicBlock *Next = MI.getParent()->getNextNode();
+  MachineBasicBlock *Target = MI.getOperand(0).getMBB();
+  if (Target == Next)
+MI.eraseFromParent();
+  else
+assert(0);
+}
+
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
   bool HaveCf = false;
@@ -249,6 +294,10 @@ bool 
SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
   HaveCf = true;
   EndCf(MI);
   break;
+
+case AMDGPU::S_BRANCH:
+  Branch(MI);
+  break;
   }
 }
   }
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/6] AMDGPU: Remove unecessary VREG alignment.

2012-12-11 Thread Christian König
Unlike SGPRs VGPRs doesn't need to be aligned.

Signed-off-by: Christian König 
---
 lib/Target/AMDGPU/SIRegisterInfo.td |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td 
b/lib/Target/AMDGPU/SIRegisterInfo.td
index e52311a..c3f1361 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -105,15 +105,15 @@ def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
 
 // VGPR 64-bit registers
 def VGPR_64 : RegisterTuples<[low, high],
- [(add (decimate VGPR_32, 2)),
-  (add (decimate (rotl VGPR_32, 1), 2))]>;
+ [(add VGPR_32),
+  (add (rotl VGPR_32, 1))]>;
 
 // VGPR 128-bit registers
 def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
-  [(add (decimate VGPR_32, 4)),
-   (add (decimate (rotl VGPR_32, 1), 4)),
-   (add (decimate (rotl VGPR_32, 2), 4)),
-   (add (decimate (rotl VGPR_32, 3), 4))]>;
+  [(add VGPR_32),
+   (add (rotl VGPR_32, 1)),
+   (add (rotl VGPR_32, 2)),
+   (add (rotl VGPR_32, 3))]>;
 
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/egl/drm: only unref the udev device if needed

2012-12-11 Thread Tobias Droste
Could you commit and push it to master?

Am Mi, 5. Dezember 2012, 09:31:48 schrieben Sie:
> On Tue, Dec 4, 2012 at 12:50 PM, Tobias Droste  wrote:
> > Anyone interested? ;-)
> > 
> > I would just push it, but I don't have the rights to do so.
> 
> Looks reasonable to me.
> 
> Reviewed-by: Alex Deucher 
> 
> > Am Do, 29. November 2012, 17:02:28 schrieben Sie:
> >> Fixes compiler warning:
> >> 
> >> drm/native_drm.c: In function ‘native_create_display’:
> >> drm/native_drm.c:180:21: warning: ‘device’ may be used uninitialized in
> >> this function [-Wmaybe-uninitialized] drm/native_drm.c:157:24: note:
> >> ‘device’ was declared here
> >> 
> >> Signed-off-by: Tobias Droste 
> >> ---
> >> 
> >>  src/gallium/state_trackers/egl/drm/native_drm.c |9 +
> >>  1 Datei geändert, 5 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-)
> >> 
> >> diff --git a/src/gallium/state_trackers/egl/drm/native_drm.c
> >> b/src/gallium/state_trackers/egl/drm/native_drm.c index 23fc137..f0c0f54
> >> 100644
> >> --- a/src/gallium/state_trackers/egl/drm/native_drm.c
> >> +++ b/src/gallium/state_trackers/egl/drm/native_drm.c
> >> @@ -161,23 +161,24 @@ drm_get_device_name(int fd)
> >> 
> >> udev = udev_new();
> >> if (fstat(fd, &buf) < 0) {
> >> 
> >>_eglLog(_EGL_WARNING, "failed to stat fd %d", fd);
> >> 
> >> -  goto out;
> >> +  goto outudev;
> >> 
> >> }
> >> 
> >> device = udev_device_new_from_devnum(udev, 'c', buf.st_rdev);
> >> if (device == NULL) {
> >> 
> >>_eglLog(_EGL_WARNING,
> >>
> >>"could not create udev device for fd %d", fd);
> >> 
> >> -  goto out;
> >> +  goto outdevice;
> >> 
> >> }
> >> 
> >> tmp = udev_device_get_devnode(device);
> >> if (!tmp)
> >> 
> >> -  goto out;
> >> +  goto outdevice;
> >> 
> >> device_name = strdup(tmp);
> >> 
> >> -out:
> >> 
> >> +outdevice:
> >> udev_device_unref(device);
> >> 
> >> +outudev:
> >> udev_unref(udev);
> >>  
> >>  #endif
> > 
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/egl/drm: only unref the udev device if needed

2012-12-11 Thread Alex Deucher
On Tue, Dec 11, 2012 at 12:49 PM, Tobias Droste  wrote:
> Could you commit and push it to master?

Done.  Thanks!

Alex

>
> Am Mi, 5. Dezember 2012, 09:31:48 schrieben Sie:
>> On Tue, Dec 4, 2012 at 12:50 PM, Tobias Droste  wrote:
>> > Anyone interested? ;-)
>> >
>> > I would just push it, but I don't have the rights to do so.
>>
>> Looks reasonable to me.
>>
>> Reviewed-by: Alex Deucher 
>>
>> > Am Do, 29. November 2012, 17:02:28 schrieben Sie:
>> >> Fixes compiler warning:
>> >>
>> >> drm/native_drm.c: In function ‘native_create_display’:
>> >> drm/native_drm.c:180:21: warning: ‘device’ may be used uninitialized in
>> >> this function [-Wmaybe-uninitialized] drm/native_drm.c:157:24: note:
>> >> ‘device’ was declared here
>> >>
>> >> Signed-off-by: Tobias Droste 
>> >> ---
>> >>
>> >>  src/gallium/state_trackers/egl/drm/native_drm.c |9 +
>> >>  1 Datei geändert, 5 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-)
>> >>
>> >> diff --git a/src/gallium/state_trackers/egl/drm/native_drm.c
>> >> b/src/gallium/state_trackers/egl/drm/native_drm.c index 23fc137..f0c0f54
>> >> 100644
>> >> --- a/src/gallium/state_trackers/egl/drm/native_drm.c
>> >> +++ b/src/gallium/state_trackers/egl/drm/native_drm.c
>> >> @@ -161,23 +161,24 @@ drm_get_device_name(int fd)
>> >>
>> >> udev = udev_new();
>> >> if (fstat(fd, &buf) < 0) {
>> >>
>> >>_eglLog(_EGL_WARNING, "failed to stat fd %d", fd);
>> >>
>> >> -  goto out;
>> >> +  goto outudev;
>> >>
>> >> }
>> >>
>> >> device = udev_device_new_from_devnum(udev, 'c', buf.st_rdev);
>> >> if (device == NULL) {
>> >>
>> >>_eglLog(_EGL_WARNING,
>> >>
>> >>"could not create udev device for fd %d", fd);
>> >>
>> >> -  goto out;
>> >> +  goto outdevice;
>> >>
>> >> }
>> >>
>> >> tmp = udev_device_get_devnode(device);
>> >> if (!tmp)
>> >>
>> >> -  goto out;
>> >> +  goto outdevice;
>> >>
>> >> device_name = strdup(tmp);
>> >>
>> >> -out:
>> >>
>> >> +outdevice:
>> >> udev_device_unref(device);
>> >>
>> >> +outudev:
>> >> udev_unref(udev);
>> >>
>> >>  #endif
>> >
>> > ___
>> > mesa-dev mailing list
>> > mesa-dev@lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] clover: Don't erase build info of devices not being built

2012-12-11 Thread Francisco Jerez
Tom Stellard  writes:

> From: Tom Stellard 
>
> Every call to _cl_program::build() was erasing the binaries and logs for
> every device associated with the program.  This is incorrect because
> it is possible to build a program for only a subset of devices and so
> any device not being build should not have this information erased.
> ---
>  src/gallium/state_trackers/clover/core/program.cpp | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>

Thanks.  For both patches:

Reviewed-by: Francisco Jerez 

> diff --git a/src/gallium/state_trackers/clover/core/program.cpp 
> b/src/gallium/state_trackers/clover/core/program.cpp
> index 6ca8080..5fcda23 100644
> --- a/src/gallium/state_trackers/clover/core/program.cpp
> +++ b/src/gallium/state_trackers/clover/core/program.cpp
> @@ -42,10 +42,10 @@ _cl_program::_cl_program(clover::context &ctx,
>  
>  void
>  _cl_program::build(const std::vector &devs) {
> -   __binaries.clear();
> -   __logs.clear();
>  
> for (auto dev : devs) {
> +  __binaries.erase(dev);
> +  __logs.erase(dev);
>try {
>   auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ?
>  compile_program_tgsi(__source) :


pgpC3rvXAF4YR.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] i965: Scale shader_time to compensate for resets.

2012-12-11 Thread Eric Anholt
Some shaders experience resets more than others, which skews the numbers
reported.  Attempt to correct for this by linearly scaling according to
the number of resets that happen.

Note that will not be accurate if invocations of shaders have varying
times and longer invocations are more likely to reset.  However, this
should at least be better than the previous situation.
---
 src/mesa/drivers/dri/i965/brw_context.h |6 +++
 src/mesa/drivers/dri/i965/brw_fs.cpp|   10 -
 src/mesa/drivers/dri/i965/brw_program.c |   72 ---
 src/mesa/drivers/dri/i965/brw_vec4.cpp  |4 +-
 4 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index dc25cab..eba9eb5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -655,8 +655,14 @@ struct brw_tracked_state {
 enum shader_time_shader_type {
ST_NONE,
ST_VS,
+   ST_VS_WRITTEN,
+   ST_VS_RESET,
ST_FS8,
+   ST_FS8_WRITTEN,
+   ST_FS8_RESET,
ST_FS16,
+   ST_FS16_WRITTEN,
+   ST_FS16_RESET,
 };
 
 /* Flags for brw->state.cache.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2f4c669..f428a83 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -505,12 +505,16 @@ fs_visitor::emit_shader_time_end()
 {
current_annotation = "shader time end";
 
-   enum shader_time_shader_type type;
+   enum shader_time_shader_type type, written_type, reset_type;
if (dispatch_width == 8) {
   type = ST_FS8;
+  written_type = ST_FS8_WRITTEN;
+  reset_type = ST_FS8_RESET;
} else {
   assert(dispatch_width == 16);
   type = ST_FS16;
+  written_type = ST_FS16_WRITTEN;
+  reset_type = ST_FS16_RESET;
}
 
fs_reg shader_end_time = get_timestamp();
@@ -537,7 +541,9 @@ fs_visitor::emit_shader_time_end()
emit(ADD(diff, diff, fs_reg(-2u)));
 
emit_shader_time_write(type, diff);
-
+   emit_shader_time_write(written_type, fs_reg(1u));
+   emit(BRW_OPCODE_ELSE);
+   emit_shader_time_write(reset_type, fs_reg(1u));
emit(BRW_OPCODE_ENDIF);
 
pop_force_uncompressed();
diff --git a/src/mesa/drivers/dri/i965/brw_program.c 
b/src/mesa/drivers/dri/i965/brw_program.c
index 1859041..0358241 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -222,16 +222,73 @@ compare_time(const void *a, const void *b)
 }
 
 static void
+get_written_and_reset(struct brw_context *brw, int i,
+  uint64_t *written, uint64_t *reset)
+{
+   enum shader_time_shader_type type = brw->shader_time.types[i];
+   assert(type == ST_VS || type == ST_FS8 || type == ST_FS16);
+
+   /* Find where we recorded written and reset. */
+   int wi, ri;
+
+   for (wi = i; brw->shader_time.types[wi] != type + 1; wi++)
+  ;
+
+   for (ri = i; brw->shader_time.types[ri] != type + 2; ri++)
+  ;
+
+   *written = brw->shader_time.cumulative[wi];
+   *reset = brw->shader_time.cumulative[ri];
+}
+
+static void
 brw_report_shader_time(struct brw_context *brw)
 {
if (!brw->shader_time.bo || !brw->shader_time.num_entries)
   return;
 
+   uint64_t scaled[brw->shader_time.num_entries];
uint64_t *sorted[brw->shader_time.num_entries];
double total = 0;
for (int i = 0; i < brw->shader_time.num_entries; i++) {
-  sorted[i] = &brw->shader_time.cumulative[i];
-  total += brw->shader_time.cumulative[i];
+  uint64_t written = 0, reset = 0;
+
+  sorted[i] = &scaled[i];
+
+  switch (brw->shader_time.types[i]) {
+  case ST_VS_WRITTEN:
+  case ST_VS_RESET:
+  case ST_FS8_WRITTEN:
+  case ST_FS8_RESET:
+  case ST_FS16_WRITTEN:
+  case ST_FS16_RESET:
+ /* We'll handle these when along with the time. */
+ scaled[i] = 0;
+ continue;
+
+  case ST_VS:
+  case ST_FS8:
+  case ST_FS16:
+ get_written_and_reset(brw, i, &written, &reset);
+ break;
+
+  default:
+ /* I sometimes want to print things that aren't the 3 shader times.
+  * Just print the sum in that case.
+  */
+ written = 1;
+ reset = 0;
+ break;
+  }
+
+  uint64_t time = brw->shader_time.cumulative[i];
+  if (written) {
+ scaled[i] = time / written * (written + reset);
+  } else {
+ scaled[i] = time;
+  }
+
+  total += scaled[i];
}
 
if (total == 0) {
@@ -245,7 +302,10 @@ brw_report_shader_time(struct brw_context *brw)
printf("type   ID  cycles spent   %% of total\n");
for (int s = 0; s < brw->shader_time.num_entries; s++) {
   /* Work back from the sorted pointers times to a time to print. */
-  int i = sorted[s] - brw->shader_time.cumulative;
+  int i = sorted[s] - scaled;
+
+  if (scaled[i] == 0)
+ continue;
 
   int shader_num = -1;
   if 

[Mesa-dev] [PATCH 3/3] i965: Print a total time for the different shader stages.

2012-12-11 Thread Eric Anholt
Sometimes I've got a patch for a performance optimization that's not
showing a statistically significant performance difference on reported
FPS, but still seems like a good idea because it ought to reduce time
spent in the shader.  If I can see the total number of cycles spent in
the shader stage being optimized, it may show that the patch is still
worthwhile (or point out that it's actually broken in some way).
---
 src/mesa/drivers/dri/i965/brw_program.c |   48 ---
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_program.c 
b/src/mesa/drivers/dri/i965/brw_program.c
index 0358241..d7b240a 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -242,6 +242,21 @@ get_written_and_reset(struct brw_context *brw, int i,
 }
 
 static void
+print_shader_time_line(const char *name, int shader_num,
+   uint64_t time, uint64_t total)
+{
+   printf("%s", name);
+   for (int i = strlen(name); i < 10; i++)
+  printf(" ");
+   printf("%4d: ", shader_num);
+
+   printf("%16lld (%7.2f Gcycles)  %4.1f%%\n",
+  (long long)time,
+  (double)time / 10.0,
+  (double)time / total * 100.0);
+}
+
+static void
 brw_report_shader_time(struct brw_context *brw)
 {
if (!brw->shader_time.bo || !brw->shader_time.num_entries)
@@ -249,13 +264,16 @@ brw_report_shader_time(struct brw_context *brw)
 
uint64_t scaled[brw->shader_time.num_entries];
uint64_t *sorted[brw->shader_time.num_entries];
+   uint64_t total_by_type[ST_FS16 + 1];
+   memset(total_by_type, 0, sizeof(total_by_type));
double total = 0;
for (int i = 0; i < brw->shader_time.num_entries; i++) {
   uint64_t written = 0, reset = 0;
+  enum shader_time_shader_type type = brw->shader_time.types[i];
 
   sorted[i] = &scaled[i];
 
-  switch (brw->shader_time.types[i]) {
+  switch (type) {
   case ST_VS_WRITTEN:
   case ST_VS_RESET:
   case ST_FS8_WRITTEN:
@@ -288,6 +306,16 @@ brw_report_shader_time(struct brw_context *brw)
  scaled[i] = time;
   }
 
+  switch (type) {
+  case ST_VS:
+  case ST_FS8:
+  case ST_FS16:
+ total_by_type[type] += scaled[i];
+ break;
+  default:
+ break;
+  }
+
   total += scaled[i];
}
 
@@ -314,24 +342,24 @@ brw_report_shader_time(struct brw_context *brw)
 
   switch (brw->shader_time.types[i]) {
   case ST_VS:
- printf("vs   %4d: ", shader_num);
+ print_shader_time_line("vs", shader_num, scaled[i], total);
  break;
   case ST_FS8:
- printf("fs8  %4d: ", shader_num);
+ print_shader_time_line("fs8", shader_num, scaled[i], total);
  break;
   case ST_FS16:
- printf("fs16 %4d: ", shader_num);
+ print_shader_time_line("fs16", shader_num, scaled[i], total);
  break;
   default:
- printf("other: ");
+ print_shader_time_line("other", shader_num, scaled[i], total);
  break;
   }
-
-  printf("%16lld (%7.2f Gcycles)  %4.1f%%\n",
- (long long)scaled[i],
- (double)scaled[i] / 10.0,
- (double)scaled[i] / total * 100.0);
}
+
+   printf("\n");
+   print_shader_time_line("total vs", -1, total_by_type[ST_VS], total);
+   print_shader_time_line("total fs8", -1, total_by_type[ST_FS8], total);
+   print_shader_time_line("total fs16", -1, total_by_type[ST_FS16], total);
 }
 
 static void
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] i965: Adjust the split between shader_time_end() and shader_time_write().

2012-12-11 Thread Eric Anholt
I'm about to emit other kinds of writes besides time deltas, and it
turns out with the frequency of resets, we couldn't really use the old
time delta write() function more than once in a shader.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   |   53 +---
 src/mesa/drivers/dri/i965/brw_fs.h |2 +-
 src/mesa/drivers/dri/i965/brw_vec4.cpp |   49 ++---
 src/mesa/drivers/dri/i965/brw_vec4.h   |2 +-
 4 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ac0bb56..2f4c669 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -513,38 +513,22 @@ fs_visitor::emit_shader_time_end()
   type = ST_FS16;
}
 
-   emit_shader_time_write(type, shader_start_time, get_timestamp());
-}
-
-void
-fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-   fs_reg start, fs_reg end)
-{
-   /* Choose an index in the buffer and set up tracking information for our
-* printouts.
-*/
-   int shader_time_index = brw->shader_time.num_entries++;
-   assert(shader_time_index <= brw->shader_time.max_entries);
-   brw->shader_time.types[shader_time_index] = type;
-   if (prog) {
-  _mesa_reference_shader_program(ctx,
- 
&brw->shader_time.programs[shader_time_index],
- prog);
-   }
+   fs_reg shader_end_time = get_timestamp();
 
/* Check that there weren't any timestamp reset events (assuming these
 * were the only two timestamp reads that happened).
 */
-   fs_reg reset = end;
+   fs_reg reset = shader_end_time;
reset.smear = 2;
fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
test->conditional_mod = BRW_CONDITIONAL_Z;
emit(IF(BRW_PREDICATE_NORMAL));
 
push_force_uncompressed();
+   fs_reg start = shader_start_time;
start.negate = true;
fs_reg diff = fs_reg(this, glsl_type::uint_type);
-   emit(ADD(diff, start, end));
+   emit(ADD(diff, start, shader_end_time));
 
/* If there were no instructions between the two timestamp gets, the diff
 * is 2 cycles.  Remove that overhead, so I can forget about that when
@@ -552,6 +536,29 @@ fs_visitor::emit_shader_time_write(enum 
shader_time_shader_type type,
 */
emit(ADD(diff, diff, fs_reg(-2u)));
 
+   emit_shader_time_write(type, diff);
+
+   emit(BRW_OPCODE_ENDIF);
+
+   pop_force_uncompressed();
+}
+
+void
+fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
+   fs_reg value)
+{
+   /* Choose an index in the buffer and set up tracking information for our
+* printouts.
+*/
+   int shader_time_index = brw->shader_time.num_entries++;
+   assert(shader_time_index <= brw->shader_time.max_entries);
+   brw->shader_time.types[shader_time_index] = type;
+   if (prog) {
+  _mesa_reference_shader_program(ctx,
+ 
&brw->shader_time.programs[shader_time_index],
+ prog);
+   }
+
int base_mrf = 6;
 
fs_reg offset_mrf = fs_reg(MRF, base_mrf);
@@ -560,15 +567,11 @@ fs_visitor::emit_shader_time_write(enum 
shader_time_shader_type type,
 
fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
time_mrf.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(time_mrf, diff));
+   emit(MOV(time_mrf, value));
 
fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
inst->base_mrf = base_mrf;
inst->mlen = 2;
-
-   pop_force_uncompressed();
-
-   emit(BRW_OPCODE_ENDIF);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index d4ddb47..6caf7c3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -392,7 +392,7 @@ public:
void emit_shader_time_begin();
void emit_shader_time_end();
void emit_shader_time_write(enum shader_time_shader_type type,
-   fs_reg start, fs_reg end);
+   fs_reg value);
 
bool try_rewrite_rhs_to_dst(ir_assignment *ir,
   fs_reg dst,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index dc9d9d5..28199dd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1081,29 +1081,11 @@ vec4_visitor::emit_shader_time_end()
current_annotation = "shader time end";
src_reg shader_end_time = get_timestamp();
 
-   emit_shader_time_write(ST_VS, shader_start_time, shader_end_time);
-}
-
-void
-vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
- src_reg start, src_reg end)
-{
-   /* Choose an index in the buffer and set up tracking information for our
-* printouts.
-*/
-   int shader_time_index = brw->shader_time.num_entries++;
-   assert(shader_t

Re: [Mesa-dev] [PATCH 3/7] i965/fs: Before reg alloc, schedule instructions to reduce live ranges.

2012-12-11 Thread Eric Anholt
Kenneth Graunke  writes:

> On 12/07/2012 02:58 PM, Eric Anholt wrote:
>> This came from an idea by Ben Segovia.  16-wide pixel shaders are very
>> important for latency hiding on i965, so we want to try really hard to
>> get them.  If scheduling an instruction makes some set of instructions
>> available, those are probably the ones that make the instruction's
>> result dead.  By choosing those first, we'll have a tendency to reduce
>> the amount of live data as opposed to creating more.
>>
>> Previously, we were sometimes getting this behavior out of the
>> scheduler, which was what produced the scheduler's original performance
>> wins on lightsmark.  Unfortunately, that was mostly an accident of the
>> lame instruction latency information that I had, which made it
>> impossible to fix the actual scheduling for performance.  Now that we've
>> fixed the scheduling for setup for register allocation, we can safely
>> update the latency parameters for the final schedule.
>>
>> In shader-db, we lose 37 16-wide shaders, but gain 90 new ones.  4
>> shaders that were spilling change how many registers spill, for a
>> reduction of 70/3899 instructions.
>> ---
>>   .../dri/i965/brw_fs_schedule_instructions.cpp  |   49 
>> +---
>>   1 file changed, 43 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
>> index d48ad1e..3941eac 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
>> + for (schedule_node *node = (schedule_node 
>> *)instructions.get_tail();
>> +  node != instructions.get_head()->prev;
>> +  node = (schedule_node *)node->prev) {
>> +schedule_node *n = (schedule_node *)node;
>> +
>> +if (!chosen || chosen->inst->regs_written() > 1) {
>> +   chosen = n;
>> +   if (chosen->inst->regs_written() <= 1)
>> +  break;
>> +}
>
> I don't think the if condition is necessary here.  Just doing
>
> for (...) {
>   chosen = (schedule_node *) node;
>   if (chosen->inst->regs_written() <= 1)
>   break;
> }
>
> should accomplish the same thing.

Yeah, it was a leftover of trying a bunch of more complicated
heuristics.

>> - if (!chosen || n->unblocked_time < chosen_time) {
>> -chosen = n;
>> -chosen_time = n->unblocked_time;
>> - }
>> + chosen_time = chosen->unblocked_time;
>
> It seems plausible that there could be no nodes to schedule...which 
> means chosen would be NULL here.  Perhaps just move chosen_time = 
> chosen->unblocked_time into the if...break above.

This is all in a loop while (!instructions.is_empty()), and these loops
have to pick one of those.


pgpqLqB62uIjJ.pgp
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/dri: add a way to force MSAA on with an environment variable

2012-12-11 Thread Ian Romanick

On 12/10/2012 03:51 PM, Eric Anholt wrote:

Marek Olšák  writes:


There are 2 ways. I prefer the former:
   GALLIUM_MSAA=n
   __GL_FSAA_MODE=n

Tested with ETQW, which doesn't support MSAA on Linux. This is
the only way to get MSAA there.


This sounds like something that would be nice to add as a driconf
knob.


I like the idea of also accepting __GL_FSAA_MODE... since that's what 
the NVIDIA closed-source driver uses.  The other name probably should 
use a driconf option.


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3

2012-12-11 Thread Ian Romanick

On 12/10/2012 02:28 PM, Matt Turner wrote:

The ES 3 spec says that the minumum allowable value is 2^24-1, but the
GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1.

Fixes es3conform's element_index_uint_constants test.
---
  src/mesa/main/context.c  |3 +++
  src/mesa/main/get.c  |1 +
  src/mesa/main/get_hash_params.py |3 +++
  src/mesa/main/mtypes.h   |3 +++
  4 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index fc2db12..241a1f9 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx)

 /* PrimitiveRestart */
 ctx->Const.PrimitiveRestartInSoftware = GL_FALSE;
+
+   /* ES 3.0 or ARB_ES3_compatibility */
+   ctx->Const.MaxElementIndex = UINT_MAX;


This should explicitly be 0x0.  It is possible for UINT_MAX to 
have some other value.  It seems like madness these days, but stranger 
things have happened.



  }


diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 115d3c5..c7f8ada 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -304,6 +304,7 @@ static const int 
extra_ARB_uniform_buffer_object_and_geometry_shader[] = {


  EXTRA_EXT(ARB_ES2_compatibility);
+EXTRA_EXT(ARB_ES3_compatibility);
  EXTRA_EXT(ARB_texture_cube_map);
  EXTRA_EXT(MESA_texture_array);
  EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index d0e8a76..cb58394 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -321,6 +321,9 @@ descriptor=[

  # Enums in  OpenGL and ES 3.0
  { "apis": ["GL", "GL_CORE", "GLES3"], "params": [
+# GL_ARB_ES3_compatibility
+  [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex), 
extra_ARB_ES3_compatibility"],
+
  # GL_ARB_fragment_shader
[ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB", 
"CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents), extra_ARB_fragment_shader" ],

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index bd180a5..c9bef15 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2940,6 +2940,9 @@ struct gl_constants

 /** GL_ARB_map_buffer_alignment */
 GLuint MinMapBufferAlignment;
+
+   /** ES 3.0 or GL_ARB_ES3_compatibility */
+   GLint64 MaxElementIndex;


In addition to Brian's comment, I'm not a huge fan of the way we comment 
structure fields like this.  I'd much rather see some description of 
what the field means than what extension it is for.  Something like:


   /**
* Maximum value supported for an index in DrawElements and friends.
*
* This must be at least (1ull<<24)-1.  The default value is
* (1ull<<32)-1.
*
* \since ES 3.0 or GL_ARB_ES3_compatibility
* \sa _mesa_init_constants
*/


  };





___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3

2012-12-11 Thread Matt Turner
On Tue, Dec 11, 2012 at 11:00 AM, Ian Romanick  wrote:
> On 12/10/2012 02:28 PM, Matt Turner wrote:
>>
>> The ES 3 spec says that the minumum allowable value is 2^24-1, but the
>> GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1.
>>
>> Fixes es3conform's element_index_uint_constants test.
>> ---
>>   src/mesa/main/context.c  |3 +++
>>   src/mesa/main/get.c  |1 +
>>   src/mesa/main/get_hash_params.py |3 +++
>>   src/mesa/main/mtypes.h   |3 +++
>>   4 files changed, 10 insertions(+), 0 deletions(-)
>>
>> diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
>> index fc2db12..241a1f9 100644
>> --- a/src/mesa/main/context.c
>> +++ b/src/mesa/main/context.c
>> @@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx)
>>
>>  /* PrimitiveRestart */
>>  ctx->Const.PrimitiveRestartInSoftware = GL_FALSE;
>> +
>> +   /* ES 3.0 or ARB_ES3_compatibility */
>> +   ctx->Const.MaxElementIndex = UINT_MAX;
>
>
> This should explicitly be 0x0.  It is possible for UINT_MAX to have
> some other value.  It seems like madness these days, but stranger things
> have happened.
>
>
>>   }
>>
>>
>> diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
>> index 115d3c5..c7f8ada 100644
>> --- a/src/mesa/main/get.c
>> +++ b/src/mesa/main/get.c
>> @@ -304,6 +304,7 @@ static const int
>> extra_ARB_uniform_buffer_object_and_geometry_shader[] = {
>>
>>
>>   EXTRA_EXT(ARB_ES2_compatibility);
>> +EXTRA_EXT(ARB_ES3_compatibility);
>>   EXTRA_EXT(ARB_texture_cube_map);
>>   EXTRA_EXT(MESA_texture_array);
>>   EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program);
>> diff --git a/src/mesa/main/get_hash_params.py
>> b/src/mesa/main/get_hash_params.py
>> index d0e8a76..cb58394 100644
>> --- a/src/mesa/main/get_hash_params.py
>> +++ b/src/mesa/main/get_hash_params.py
>> @@ -321,6 +321,9 @@ descriptor=[
>>
>>   # Enums in  OpenGL and ES 3.0
>>   { "apis": ["GL", "GL_CORE", "GLES3"], "params": [
>> +# GL_ARB_ES3_compatibility
>> +  [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex),
>> extra_ARB_ES3_compatibility"],
>> +
>>   # GL_ARB_fragment_shader
>> [ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB",
>> "CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents),
>> extra_ARB_fragment_shader" ],
>>
>> diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
>> index bd180a5..c9bef15 100644
>> --- a/src/mesa/main/mtypes.h
>> +++ b/src/mesa/main/mtypes.h
>> @@ -2940,6 +2940,9 @@ struct gl_constants
>>
>>  /** GL_ARB_map_buffer_alignment */
>>  GLuint MinMapBufferAlignment;
>> +
>> +   /** ES 3.0 or GL_ARB_ES3_compatibility */
>> +   GLint64 MaxElementIndex;
>
>
> In addition to Brian's comment, I'm not a huge fan of the way we comment
> structure fields like this.  I'd much rather see some description of what
> the field means than what extension it is for.  Something like:
>
>/**
> * Maximum value supported for an index in DrawElements and friends.
> *
> * This must be at least (1ull<<24)-1.  The default value is
> * (1ull<<32)-1.
> *
> * \since ES 3.0 or GL_ARB_ES3_compatibility
> * \sa _mesa_init_constants
> */
>
>>   };

Both comments sound good to me.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py

2012-12-11 Thread Ian Romanick

On 12/10/2012 04:06 PM, Jordan Justen wrote:

On Mon, Dec 10, 2012 at 2:28 PM, Matt Turner  wrote:

@@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void **p, union 
value *v)
 int api;

 api = ctx->API;
+   /* We index into the table_set[] list of per-API hash tables using the API's
+* value in the gl_api enum. Since GLES 3 doesn't have an API_OPENGL* enum
+* value since it's compatible with GLES2 its entry in table_set[] is at the
+* end.
+*/
+   STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2);
+   if (_mesa_is_gles3(ctx)) {
+  api = API_OPENGL_LAST + 1;
+   }


This seems somewhat unexpected. How do we keep track of the fact that
API_OPENGL_LAST + 1 is used for GLES3 in this case?


Having API_OPENGL_LAST and the STATIC_ASSERT should be sufficient.


Are we sure GLES3 isn't a separate API from GLES2? :) I guess I don't
know the motivation for keeping GLES2/3 under a combined API_GLES2
brings. Wasn't there the idea that since GLES3 was backward
compatible, we'd just upgrade GLES2 to add the GLES3 features? But,
now it seems we keep separating them in various ways.


The problem is that we have a zillion places that check of GLES2.  I 
don't want to have to find and update all of those to check for separate 
GLES2 and GLES3.  Every single place that checks for GLES2 would have to 
be updated.



diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 57dddf8..bd180a5 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3335,6 +3335,7 @@ typedef enum
 API_OPENGLES,
 API_OPENGLES2,
 API_OPENGL_CORE,
+   API_OPENGL_LAST = API_OPENGL_CORE,


We could instead add:
API_COUNT

Then we wouldn't need to update the API_OPENGL_LAST enum if newer
API's are added.


The problem is we'll get compiler warnings in switch-statements that 
don't include either a default case or a case for API_COUNT.  Since the 
warning are potentially useful, I'd rather not be forced to hide them.



-Jordan
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3

2012-12-11 Thread Ian Romanick

On 12/10/2012 02:28 PM, Matt Turner wrote:

Fixes the transform_feedback2_init_defaults test from es3conform.

The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and
TRANSFORM_FEEDBACK_ACTIVE.
---
  src/mesa/main/get.c  |8 +++-
  src/mesa/main/get_hash_params.py |   10 +-
  2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 146612c..115d3c5 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = {
 EXTRA_END
  };

+/* FIXME: Remove this when i965 exposes transform_feedback2 */


This comment should be removed.  These get enums are valid if either 
ARB_transformfeedback2 or ES3, so the check is correct.  XFB2 is a 
superset of ES3, so I don't think we should assume that every driver 
that implements ES3 also supports XFB2... even if they all happen to do so.



+static const int extra_ARB_transform_feedback2_api_es3[] = {
+   EXT(ARB_transform_feedback2),
+   EXTRA_API_ES3,
+   EXTRA_END
+};
+
  static const int extra_ARB_uniform_buffer_object_and_geometry_shader[] = {
 EXT(ARB_uniform_buffer_object),
 EXT(ARB_geometry_shader4),
@@ -321,7 +328,6 @@ EXTRA_EXT(ARB_seamless_cube_map);
  EXTRA_EXT(ARB_sync);
  EXTRA_EXT(ARB_vertex_shader);
  EXTRA_EXT(EXT_transform_feedback);
-EXTRA_EXT(ARB_transform_feedback2);
  EXTRA_EXT(ARB_transform_feedback3);
  EXTRA_EXT(EXT_pixel_buffer_object);
  EXTRA_EXT(ARB_vertex_program);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index d9fa693..ffdf96a 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -324,6 +324,11 @@ descriptor=[
  # GL_ARB_sync
[ "MAX_SERVER_WAIT_TIMEOUT", "CONTEXT_INT64(Const.MaxServerWaitTimeout), 
extra_ARB_sync" ],

+# GL_ARB_transform_feedback2
+  [ "TRANSFORM_FEEDBACK_BUFFER_PAUSED", "LOC_CUSTOM, TYPE_BOOLEAN, 0, 
extra_ARB_transform_feedback2_api_es3" ],
+  [ "TRANSFORM_FEEDBACK_BUFFER_ACTIVE", "LOC_CUSTOM, TYPE_BOOLEAN, 0, 
extra_ARB_transform_feedback2_api_es3" ],
+  [ "TRANSFORM_FEEDBACK_BINDING", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_transform_feedback2_api_es3" ],
+
  # GL_ARB_uniform_buffer_object
[ "MAX_VERTEX_UNIFORM_BLOCKS", 
"CONTEXT_INT(Const.VertexProgram.MaxUniformBlocks), extra_ARB_uniform_buffer_object" ],
[ "MAX_FRAGMENT_UNIFORM_BLOCKS", 
"CONTEXT_INT(Const.FragmentProgram.MaxUniformBlocks), extra_ARB_uniform_buffer_object" ],
@@ -621,11 +626,6 @@ descriptor=[
  # GL_EXT_texture_integer
[ "RGBA_INTEGER_MODE_EXT", "BUFFER_BOOL(_IntegerColor), 
extra_EXT_texture_integer" ],

-# GL_ARB_transform_feedback2
-  [ "TRANSFORM_FEEDBACK_BUFFER_PAUSED", "LOC_CUSTOM, TYPE_BOOLEAN, 0, 
extra_ARB_transform_feedback2" ],
-  [ "TRANSFORM_FEEDBACK_BUFFER_ACTIVE", "LOC_CUSTOM, TYPE_BOOLEAN, 0, 
extra_ARB_transform_feedback2" ],
-  [ "TRANSFORM_FEEDBACK_BINDING", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_transform_feedback2" ],
-
  # GL_ARB_transform_feedback3
[ "MAX_TRANSFORM_FEEDBACK_BUFFERS", 
"CONTEXT_INT(Const.MaxTransformFeedbackBuffers), extra_ARB_transform_feedback3" ],
[ "MAX_VERTEX_STREAMS", "CONTEXT_INT(Const.MaxVertexStreams), 
extra_ARB_transform_feedback3" ],



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 11/13] mesa: De-duplicate ES2 queries

2012-12-11 Thread Ian Romanick

On 12/10/2012 02:28 PM, Matt Turner wrote:

 From GL/GLES/GL_CORE and GLES2 -> GL/GL_CORE/GLES2.

Yes, we really were exposing ES2_compatibility queries on ES 1.
---
  src/mesa/main/get_hash_params.py |   16 ++--
  1 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 650fb38..d0e8a76 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -204,12 +204,6 @@ descriptor=[
[ "TEXTURE_COORD_ARRAY_TYPE", "LOC_CUSTOM, TYPE_ENUM, offsetof(struct 
gl_client_array, Type), NO_EXTRA" ],
[ "TEXTURE_COORD_ARRAY_STRIDE", "LOC_CUSTOM, TYPE_INT, offsetof(struct 
gl_client_array, Stride), NO_EXTRA" ],

-# GL_ARB_ES2_compatibility
-  [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ],
-  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), 
extra_ARB_ES2_compatibility" ],
-  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_ES2_compatibility" ],
-  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_ES2_compatibility" ],
-
  # GL_ARB_multitexture
[ "MAX_TEXTURE_UNITS", "CONTEXT_INT(Const.MaxTextureUnits), NO_EXTRA" ],
[ "CLIENT_ACTIVE_TEXTURE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
@@ -310,6 +304,12 @@ descriptor=[

  # GL_NV_read_buffer
[ "READ_BUFFER", "LOC_CUSTOM, TYPE_ENUM, NO_OFFSET, 
extra_NV_read_buffer_api_gl" ],
+
+# GL_ARB_ES2_compatibility
+  [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ],
+  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), 
extra_ARB_ES2_compatibility" ],
+  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_ES2_compatibility" ],
+  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, 
extra_ARB_ES2_compatibility" ],


Are there any drivers in Mesa that support ES2 but do not advertise 
ARB_ES2_compatibility?  I think this will break those drivers.



  ]},

  # GLES3 is not a typo.
@@ -373,10 +373,6 @@ descriptor=[

  # Enums unique to OpenGL ES 2.0
  { "apis": ["GLES2"], "params": [
-  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
-  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), NO_EXTRA" ],
-  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
-  [ "SHADER_COMPILER", "CONST(1), NO_EXTRA" ],
  # OES_get_program_binary
[ "NUM_SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ],
[ "SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ],



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3

2012-12-11 Thread Matt Turner
On Tue, Dec 11, 2012 at 11:08 AM, Ian Romanick  wrote:
> On 12/10/2012 02:28 PM, Matt Turner wrote:
>>
>> Fixes the transform_feedback2_init_defaults test from es3conform.
>>
>> The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and
>> TRANSFORM_FEEDBACK_ACTIVE.
>> ---
>>   src/mesa/main/get.c  |8 +++-
>>   src/mesa/main/get_hash_params.py |   10 +-
>>   2 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
>> index 146612c..115d3c5 100644
>> --- a/src/mesa/main/get.c
>> +++ b/src/mesa/main/get.c
>> @@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = {
>>  EXTRA_END
>>   };
>>
>> +/* FIXME: Remove this when i965 exposes transform_feedback2 */
>
>
> This comment should be removed.  These get enums are valid if either
> ARB_transformfeedback2 or ES3, so the check is correct.  XFB2 is a superset
> of ES3, so I don't think we should assume that every driver that implements
> ES3 also supports XFB2... even if they all happen to do so.

This is the only case of an extension that  i965 doesn't expose whose
enums are in ES 3. I suppose some of the other extensions could not be
exposed but a driver support ES 3, although it seems unlikely.

Should all of the other patches (for extensions that i965 exposes) be
modified to add EXTRA_API_ES3 to their extra_* blocks? I guess that's
technically more correct.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py

2012-12-11 Thread Ian Romanick

On 12/10/2012 02:28 PM, Matt Turner wrote:

Other than the comments on 6, 11, and 12, the series is

Reviewed-by: Ian Romanick 


---
  src/mesa/main/get.c |   16 
  src/mesa/main/get_hash_generator.py |8 +++-
  src/mesa/main/mtypes.h  |1 +
  3 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index cd239a6..146612c 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -130,6 +130,7 @@ enum value_extra {
 EXTRA_VERSION_32,
 EXTRA_API_GL,
 EXTRA_API_ES2,
+   EXTRA_API_ES3,
 EXTRA_NEW_BUFFERS,
 EXTRA_NEW_FRAG_CLAMP,
 EXTRA_VALID_DRAW_BUFFER,
@@ -873,6 +874,12 @@ check_extra(struct gl_context *ctx, const char *func, 
const struct value_desc *d
enabled++;
 }
 break;
+  case EXTRA_API_ES3:
+if (_mesa_is_gles3(ctx)) {
+   total++;
+   enabled++;
+}
+break;
case EXTRA_API_GL:
 if (_mesa_is_desktop_gl(ctx)) {
total++;
@@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void **p, union 
value *v)
 int api;

 api = ctx->API;
+   /* We index into the table_set[] list of per-API hash tables using the API's
+* value in the gl_api enum. Since GLES 3 doesn't have an API_OPENGL* enum
+* value since it's compatible with GLES2 its entry in table_set[] is at the
+* end.
+*/
+   STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2);
+   if (_mesa_is_gles3(ctx)) {
+  api = API_OPENGL_LAST + 1;
+   }
 mask = Elements(table(api)) - 1;
 hash = (pname * prime_factor);
 while (1) {
diff --git a/src/mesa/main/get_hash_generator.py 
b/src/mesa/main/get_hash_generator.py
index 4b3f5f4..04bf9ff 100644
--- a/src/mesa/main/get_hash_generator.py
+++ b/src/mesa/main/get_hash_generator.py
@@ -44,7 +44,7 @@ prime_factor = 89
  prime_step = 281
  hash_table_size = 1024

-gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2"])
+gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3"])

  def print_header():
 print "typedef const unsigned short table_t[%d];\n" % (hash_table_size)
@@ -67,6 +67,7 @@ api_enum = [
 'GLES',
 'GLES2',
 'GL_CORE',
+   'GLES3', # Not in gl_api enum in mtypes.h
  ]

  def api_index(api):
@@ -166,6 +167,9 @@ def generate_hash_tables(enum_list, enabled_apis, 
param_descriptors):

   for api in valid_apis:
  add_to_hash_table(tables[api], hash_val, len(params))
+# Also add GLES2 items to the GLES3 hash table
+if api == "GLES2":
+   add_to_hash_table(tables["GLES3"], hash_val, len(params))

   params.append(["GL_" + enum_name, param[1]])

@@ -183,6 +187,8 @@ def opt_to_apis(feature):
 apis = set([_map[feature]])
 if "GL" in apis:
apis.add("GL_CORE")
+   if "GLES2" in apis:
+  apis.add("GLES3")

 return apis

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 57dddf8..bd180a5 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3335,6 +3335,7 @@ typedef enum
 API_OPENGLES,
 API_OPENGLES2,
 API_OPENGL_CORE,
+   API_OPENGL_LAST = API_OPENGL_CORE,
  } gl_api;

  /**



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 11/13] mesa: De-duplicate ES2 queries

2012-12-11 Thread Matt Turner
On Tue, Dec 11, 2012 at 11:12 AM, Ian Romanick  wrote:
> On 12/10/2012 02:28 PM, Matt Turner wrote:
>>
>>  From GL/GLES/GL_CORE and GLES2 -> GL/GL_CORE/GLES2.
>>
>> Yes, we really were exposing ES2_compatibility queries on ES 1.
>> ---
>>   src/mesa/main/get_hash_params.py |   16 ++--
>>   1 files changed, 6 insertions(+), 10 deletions(-)
>>
>> diff --git a/src/mesa/main/get_hash_params.py
>> b/src/mesa/main/get_hash_params.py
>> index 650fb38..d0e8a76 100644
>> --- a/src/mesa/main/get_hash_params.py
>> +++ b/src/mesa/main/get_hash_params.py
>> @@ -204,12 +204,6 @@ descriptor=[
>> [ "TEXTURE_COORD_ARRAY_TYPE", "LOC_CUSTOM, TYPE_ENUM, offsetof(struct
>> gl_client_array, Type), NO_EXTRA" ],
>> [ "TEXTURE_COORD_ARRAY_STRIDE", "LOC_CUSTOM, TYPE_INT, offsetof(struct
>> gl_client_array, Stride), NO_EXTRA" ],
>>
>> -# GL_ARB_ES2_compatibility
>> -  [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ],
>> -  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying),
>> extra_ARB_ES2_compatibility" ],
>> -  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0,
>> extra_ARB_ES2_compatibility" ],
>> -  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0,
>> extra_ARB_ES2_compatibility" ],
>> -
>>   # GL_ARB_multitexture
>> [ "MAX_TEXTURE_UNITS", "CONTEXT_INT(Const.MaxTextureUnits), NO_EXTRA"
>> ],
>> [ "CLIENT_ACTIVE_TEXTURE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
>> @@ -310,6 +304,12 @@ descriptor=[
>>
>>   # GL_NV_read_buffer
>> [ "READ_BUFFER", "LOC_CUSTOM, TYPE_ENUM, NO_OFFSET,
>> extra_NV_read_buffer_api_gl" ],
>> +
>> +# GL_ARB_ES2_compatibility
>> +  [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ],
>> +  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying),
>> extra_ARB_ES2_compatibility" ],
>> +  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0,
>> extra_ARB_ES2_compatibility" ],
>> +  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0,
>> extra_ARB_ES2_compatibility" ],
>
>
> Are there any drivers in Mesa that support ES2 but do not advertise
> ARB_ES2_compatibility?  I think this will break those drivers.
>
>
>>   ]},
>>
>>   # GLES3 is not a typo.
>> @@ -373,10 +373,6 @@ descriptor=[
>>
>>   # Enums unique to OpenGL ES 2.0
>>   { "apis": ["GLES2"], "params": [
>> -  [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA"
>> ],
>> -  [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), NO_EXTRA" ],
>> -  [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ],
>> -  [ "SHADER_COMPILER", "CONST(1), NO_EXTRA" ],
>>   # OES_get_program_binary
>> [ "NUM_SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ],
>> [ "SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ],
>>
>

Oh, the original version of this patch added EXTRA_API_ES2 to the
extra_* block. Consider that fixed.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3

2012-12-11 Thread Ian Romanick

On 12/11/2012 11:14 AM, Matt Turner wrote:

On Tue, Dec 11, 2012 at 11:08 AM, Ian Romanick  wrote:

On 12/10/2012 02:28 PM, Matt Turner wrote:


Fixes the transform_feedback2_init_defaults test from es3conform.

The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and
TRANSFORM_FEEDBACK_ACTIVE.
---
   src/mesa/main/get.c  |8 +++-
   src/mesa/main/get_hash_params.py |   10 +-
   2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 146612c..115d3c5 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = {
  EXTRA_END
   };

+/* FIXME: Remove this when i965 exposes transform_feedback2 */



This comment should be removed.  These get enums are valid if either
ARB_transformfeedback2 or ES3, so the check is correct.  XFB2 is a superset
of ES3, so I don't think we should assume that every driver that implements
ES3 also supports XFB2... even if they all happen to do so.


This is the only case of an extension that  i965 doesn't expose whose
enums are in ES 3. I suppose some of the other extensions could not be
exposed but a driver support ES 3, although it seems unlikely.

Should all of the other patches (for extensions that i965 exposes) be
modified to add EXTRA_API_ES3 to their extra_* blocks? I guess that's
technically more correct.


In all of the other cases, the driver must expose the extensions in 
order to have ES3.  Note the checks in compute_version_es2.  This is 
because ES3 took the whole extension.  The parts of XFB2 that a need to 
be supported are either already required by meta or are implemented 
completely in core Mesa.  XFB2 is kind of weird in this respect.


So, I don't think any of the other checks need to be modified.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers

2012-12-11 Thread Aaron Watry
I'm not familiar enough with the existing code to feel comfortable 
reviewing it, but I've run it through a full piglit test run (using 
tests/all.tests w/ GL/GLX enabled) without noticing any issues.


Also, Reaction Quake 3 performance went up by ~25% as a result of this 
series on my Radeon 6850.

http://openbenchmarking.org/result/1212102-SU-SUBALLOCT33

For my part:
Tested-by: Aaron Watry 

--Aaron Watry


u_upload_mgr suballocates memory from a large buffer and maps the allocated
range (unsychronized), which is perfect for short-lived staging buffers.

This reduces the number of relocations sent to the kernel.
---
  src/gallium/drivers/r600/r600_buffer.c |   30 +++---
  1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_buffer.c 
b/src/gallium/drivers/r600/r600_buffer.c
index 9e2cf66..e674e13 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -66,7 +66,8 @@ static void *r600_buffer_get_transfer(struct pipe_context 
*ctx,
unsigned usage,
const struct pipe_box *box,
  struct pipe_transfer **ptransfer,
- void *data, struct r600_resource *staging)
+ void *data, struct r600_resource *staging,
+ unsigned offset)
  {
struct r600_context *rctx = (struct r600_context*)ctx;
struct r600_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
@@ -77,8 +78,7 @@ static void *r600_buffer_get_transfer(struct pipe_context 
*ctx,
transfer->transfer.box = *box;
transfer->transfer.stride = 0;
transfer->transfer.layer_stride = 0;
-   transfer->staging = NULL;
-   transfer->offset = 0;
+   transfer->offset = offset;
transfer->staging = staging;
*ptransfer =&transfer->transfer;
return data;
@@ -147,18 +147,17 @@ static void *r600_buffer_transfer_map(struct pipe_context 
*ctx,
if (rctx->ws->cs_is_buffer_referenced(rctx->cs, 
rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
rctx->ws->buffer_is_busy(rbuffer->buf, 
RADEON_USAGE_READWRITE)) {
/* Do a wait-free write-only transfer using a temporary 
buffer. */
-   struct r600_resource *staging = (struct r600_resource*)
-   pipe_buffer_create(ctx->screen, 
PIPE_BIND_VERTEX_BUFFER,
-  PIPE_USAGE_STAGING,
-  box->width + (box->x % 
R600_MAP_BUFFER_ALIGNMENT));
-   data = rctx->ws->buffer_map(staging->cs_buf, rctx->cs, 
PIPE_TRANSFER_WRITE);
+   unsigned offset;
+   struct r600_resource *staging = NULL;

-   if (!data)
-   return NULL;
+   u_upload_alloc(rctx->uploader, 0, box->width + (box->x 
% R600_MAP_BUFFER_ALIGNMENT),
+   &offset, (struct pipe_resource**)&staging, 
(void**)&data);

-   data += box->x % R600_MAP_BUFFER_ALIGNMENT;
-   return r600_buffer_get_transfer(ctx, resource, level, 
usage, box,
-   ptransfer, data, 
staging);
+   if (staging) {
+   data += box->x % R600_MAP_BUFFER_ALIGNMENT;
+   return r600_buffer_get_transfer(ctx, resource, 
level, usage, box,
+   ptransfer, 
data, staging, offset);
+   }
}
}

@@ -169,7 +168,7 @@ static void *r600_buffer_transfer_map(struct pipe_context 
*ctx,
data += box->x;

return r600_buffer_get_transfer(ctx, resource, level, usage, box,
-   ptransfer, data, NULL);
+   ptransfer, data, NULL, 0);
  }

  static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
@@ -180,7 +179,8 @@ static void r600_buffer_transfer_unmap(struct pipe_context 
*pipe,

if (rtransfer->staging) {
struct pipe_box box;
-   u_box_1d(transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, 
transfer->box.width,&box);
+   u_box_1d(rtransfer->offset + transfer->box.x % 
R600_MAP_BUFFER_ALIGNMENT,
+transfer->box.width,&box);

/* Copy the staging buffer into the original one. */
r600_copy_buffer(pipe, transfer->resource, transfer->box.x,
--
1.7.10.4
   

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers

2012-12-11 Thread Marek Olšák
On Tue, Dec 11, 2012 at 8:24 PM, Aaron Watry  wrote:
> I'm not familiar enough with the existing code to feel comfortable reviewing
> it, but I've run it through a full piglit test run (using tests/all.tests w/
> GL/GLX enabled) without noticing any issues.
>
> Also, Reaction Quake 3 performance went up by ~25% as a result of this
> series on my Radeon 6850.
> http://openbenchmarking.org/result/1212102-SU-SUBALLOCT33
>
> For my part:
> Tested-by: Aaron Watry 

Awesome, thanks for testing!

Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers

2012-12-11 Thread Alex Deucher
On Mon, Dec 10, 2012 at 3:47 PM, Marek Olšák  wrote:
> u_upload_mgr suballocates memory from a large buffer and maps the allocated
> range (unsychronized), which is perfect for short-lived staging buffers.
>
> This reduces the number of relocations sent to the kernel.

Series looks good to me.

Reviewed-by: Alex Deucher 


> ---
>  src/gallium/drivers/r600/r600_buffer.c |   30 +++---
>  1 file changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_buffer.c 
> b/src/gallium/drivers/r600/r600_buffer.c
> index 9e2cf66..e674e13 100644
> --- a/src/gallium/drivers/r600/r600_buffer.c
> +++ b/src/gallium/drivers/r600/r600_buffer.c
> @@ -66,7 +66,8 @@ static void *r600_buffer_get_transfer(struct pipe_context 
> *ctx,
>unsigned usage,
>const struct pipe_box *box,
>   struct pipe_transfer **ptransfer,
> - void *data, struct r600_resource 
> *staging)
> + void *data, struct r600_resource 
> *staging,
> + unsigned offset)
>  {
> struct r600_context *rctx = (struct r600_context*)ctx;
> struct r600_transfer *transfer = 
> util_slab_alloc(&rctx->pool_transfers);
> @@ -77,8 +78,7 @@ static void *r600_buffer_get_transfer(struct pipe_context 
> *ctx,
> transfer->transfer.box = *box;
> transfer->transfer.stride = 0;
> transfer->transfer.layer_stride = 0;
> -   transfer->staging = NULL;
> -   transfer->offset = 0;
> +   transfer->offset = offset;
> transfer->staging = staging;
> *ptransfer = &transfer->transfer;
> return data;
> @@ -147,18 +147,17 @@ static void *r600_buffer_transfer_map(struct 
> pipe_context *ctx,
> if (rctx->ws->cs_is_buffer_referenced(rctx->cs, 
> rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
> rctx->ws->buffer_is_busy(rbuffer->buf, 
> RADEON_USAGE_READWRITE)) {
> /* Do a wait-free write-only transfer using a 
> temporary buffer. */
> -   struct r600_resource *staging = (struct 
> r600_resource*)
> -   pipe_buffer_create(ctx->screen, 
> PIPE_BIND_VERTEX_BUFFER,
> -  PIPE_USAGE_STAGING,
> -  box->width + (box->x % 
> R600_MAP_BUFFER_ALIGNMENT));
> -   data = rctx->ws->buffer_map(staging->cs_buf, 
> rctx->cs, PIPE_TRANSFER_WRITE);
> +   unsigned offset;
> +   struct r600_resource *staging = NULL;
>
> -   if (!data)
> -   return NULL;
> +   u_upload_alloc(rctx->uploader, 0, box->width + 
> (box->x % R600_MAP_BUFFER_ALIGNMENT),
> +  &offset, (struct 
> pipe_resource**)&staging, (void**)&data);
>
> -   data += box->x % R600_MAP_BUFFER_ALIGNMENT;
> -   return r600_buffer_get_transfer(ctx, resource, level, 
> usage, box,
> -   ptransfer, data, 
> staging);
> +   if (staging) {
> +   data += box->x % R600_MAP_BUFFER_ALIGNMENT;
> +   return r600_buffer_get_transfer(ctx, 
> resource, level, usage, box,
> +   ptransfer, 
> data, staging, offset);
> +   }
> }
> }
>
> @@ -169,7 +168,7 @@ static void *r600_buffer_transfer_map(struct pipe_context 
> *ctx,
> data += box->x;
>
> return r600_buffer_get_transfer(ctx, resource, level, usage, box,
> -   ptransfer, data, NULL);
> +   ptransfer, data, NULL, 0);
>  }
>
>  static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
> @@ -180,7 +179,8 @@ static void r600_buffer_transfer_unmap(struct 
> pipe_context *pipe,
>
> if (rtransfer->staging) {
> struct pipe_box box;
> -   u_box_1d(transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, 
> transfer->box.width, &box);
> +   u_box_1d(rtransfer->offset + transfer->box.x % 
> R600_MAP_BUFFER_ALIGNMENT,
> +transfer->box.width, &box);
>
> /* Copy the staging buffer into the original one. */
> r600_copy_buffer(pipe, transfer->resource, transfer->box.x,
> --
> 1.7.10.4
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/list

[Mesa-dev] [PATCH] llvmpipe: remove unneeded draw_flush() call

2012-12-11 Thread Brian Paul
This is redundant since we're calling draw_bind_fragment_shader()
which already does a flush.

v2: the redundant flush in llvmpipe_set_constant_buffer() has
already been removed by commit 3427466e6dbbb8db7c1ecda6b3859ca1cc5827a3
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index bf59a43..5a8351b 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -2312,8 +2312,6 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void 
*fs)
if (llvmpipe->fs == fs)
   return;
 
-   draw_flush(llvmpipe->draw);
-
llvmpipe->fs = (struct lp_fragment_shader *) fs;
 
draw_bind_fragment_shader(llvmpipe->draw,
-- 
1.7.3.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py

2012-12-11 Thread Kenneth Graunke

On 12/11/2012 11:04 AM, Ian Romanick wrote:

On 12/10/2012 04:06 PM, Jordan Justen wrote:

On Mon, Dec 10, 2012 at 2:28 PM, Matt Turner  wrote:

@@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void
**p, union value *v)
 int api;

 api = ctx->API;
+   /* We index into the table_set[] list of per-API hash tables
using the API's
+* value in the gl_api enum. Since GLES 3 doesn't have an
API_OPENGL* enum
+* value since it's compatible with GLES2 its entry in
table_set[] is at the
+* end.
+*/
+   STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2);
+   if (_mesa_is_gles3(ctx)) {
+  api = API_OPENGL_LAST + 1;
+   }


This seems somewhat unexpected. How do we keep track of the fact that
API_OPENGL_LAST + 1 is used for GLES3 in this case?


Having API_OPENGL_LAST and the STATIC_ASSERT should be sufficient.


Are we sure GLES3 isn't a separate API from GLES2? :) I guess I don't
know the motivation for keeping GLES2/3 under a combined API_GLES2
brings. Wasn't there the idea that since GLES3 was backward
compatible, we'd just upgrade GLES2 to add the GLES3 features? But,
now it seems we keep separating them in various ways.


The problem is that we have a zillion places that check of GLES2.  I
don't want to have to find and update all of those to check for separate
GLES2 and GLES3.  Every single place that checks for GLES2 would have to
be updated.


Where "zillion" means 50 (measured by git grep API_OPENGLES2 | wc -l). 
In contrast, there are 72 occurrances of API_OPENGLES3.


I'm not taking a position one way or another, just saying that 50 isn't 
that much work.


--Ken
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/5] i965/fs: Add a note explaining a detail of register_coalesce_2().

2012-12-11 Thread Eric Anholt
---
 src/mesa/drivers/dri/i965/brw_fs.cpp |   21 +
 1 file changed, 21 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f428a83..c520364 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1846,7 +1846,28 @@ fs_visitor::register_coalesce_2()
   }
 
   inst->remove();
+
+  /* We don't need to recalculate live intervals inside the loop despite
+   * flagging live_intervals_valid because we only use live intervals for
+   * the interferes test, and we must have had a situation where the
+   * intervals were:
+   *
+   *  from  to
+   *  ^
+   *  |
+   *  v
+   *^
+   *|
+   *v
+   *
+   * Some register R that might get coalesced with one of these two could
+   * only be referencing "to", otherwise "from"'s range would have been
+   * longer.  R's range could also only start at the end of "to" or later,
+   * otherwise it will conflict with "to" when we try to coalesce "to"
+   * into Rw anyway.
+   */
   live_intervals_valid = false;
+
   progress = true;
   continue;
}
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] i965/fs: Drop an unnecessary _safe on a list walk.

2012-12-11 Thread Eric Anholt
---
 src/mesa/drivers/dri/i965/brw_fs.cpp |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c520364..62800b1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1828,7 +1828,7 @@ fs_visitor::register_coalesce_2()
   int reg_to = inst->dst.reg;
   int reg_to_offset = inst->dst.reg_offset;
 
-  foreach_list_safe(node, &this->instructions) {
+  foreach_list(node, &this->instructions) {
 fs_inst *scan_inst = (fs_inst *)node;
 
 if (scan_inst->dst.file == GRF &&
-- 
1.7.10.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/5] i965/vs: Add a unit test for opt_compute_to_mrf().

2012-12-11 Thread Eric Anholt
The compute-to-mrf code is really twitchy, and it's hard to construct
GLSL testcases for it.  This unit test is also really hard to work with
(for example, if your instruction is removed by dead code elimination,
you end up inspecting something irrelevant), but I did use it for
debugging some of the commits to follow.

I called it test_vec4_register_coalesce because the compute-to-mrf code
is about to morph into that.
---
 src/mesa/drivers/dri/i965/.gitignore   |1 +
 src/mesa/drivers/dri/i965/Makefile.am  |   10 +-
 .../dri/i965/test_vec4_register_coalesce.cpp   |  124 
 3 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp

diff --git a/src/mesa/drivers/dri/i965/.gitignore 
b/src/mesa/drivers/dri/i965/.gitignore
index c6ea403..8208295 100644
--- a/src/mesa/drivers/dri/i965/.gitignore
+++ b/src/mesa/drivers/dri/i965/.gitignore
@@ -2,3 +2,4 @@ Makefile
 i965_symbols_test
 libi965_dri.la
 test_eu_compact
+test_vec4_register_coalesce
diff --git a/src/mesa/drivers/dri/i965/Makefile.am 
b/src/mesa/drivers/dri/i965/Makefile.am
index f5ca12b..8e96467 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -66,8 +66,14 @@ i965_dri_la_SOURCES =
 i965_dri_la_LIBADD = $(COMMON_LIBS)
 i965_dri_la_LDFLAGS = -module -avoid-version -shared
 
-TESTS = test_eu_compact
-check_PROGRAMS = test_eu_compact
+TESTS = \
+test_eu_compact \
+test_vec4_register_coalesce
+check_PROGRAMS = $(TESTS)
+
+test_vec4_register_coalesce_LDADD = \
+$(TEST_LIBS) \
+$(top_builddir)/src/gtest/libgtest.la
 
 test_eu_compact_SOURCES = \
test_eu_compact.c
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp 
b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
new file mode 100644
index 000..c79b0fd
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include 
+#include "brw_vec4.h"
+
+using namespace brw;
+
+int ret = 0;
+
+#define register_coalesce(v) _register_coalesce(v, __FUNCTION__)
+
+class register_coalesce_test : public ::testing::Test {
+   virtual void SetUp();
+
+public:
+   struct brw_context *brw;
+   struct intel_context *intel;
+   struct gl_context *ctx;
+   struct gl_shader_program *shader_prog;
+   struct brw_vs_compile *c;
+   vec4_visitor *v;
+};
+
+void register_coalesce_test::SetUp()
+{
+   brw = (struct brw_context *)calloc(1, sizeof(*brw));
+   intel = &brw->intel;
+   ctx = &intel->ctx;
+
+   c = ralloc(NULL, struct brw_vs_compile);
+   c->vp = ralloc(NULL, struct brw_vertex_program);
+
+   shader_prog = ralloc(NULL, struct gl_shader_program);
+
+   v = new vec4_visitor(brw, c, shader_prog, NULL, NULL);
+
+   _mesa_init_vertex_program(ctx, &c->vp->program, GL_VERTEX_SHADER, 0);
+
+   intel->gen = 4;
+}
+
+static void
+_register_coalesce(vec4_visitor *v, const char *func)
+{
+   bool print = false;
+
+   if (print) {
+  printf("%s: instructions before:\n", func);
+  v->dump_instructions();
+   }
+
+   v->opt_compute_to_mrf();
+
+   if (print) {
+  printf("%s: instructions after:\n", func);
+  v->dump_instructions();
+   }
+}
+
+TEST_F(register_coalesce_test, test_easy_success)
+{
+   src_reg something = src_reg(v, glsl_type::float_type);
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+   dst_reg init;
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_X;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   vec4_instruction *mul = v->emit(v->MUL(temp, something, src_reg(1.0f)));
+   v->emit(v->MOV(m0, src_reg(temp)));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(mul->dst.file, MRF);
+}
+
+
+TEST_F(register_coalesce_test, test_multiple_use)
+{
+   src_reg something = src

[Mesa-dev] [PATCH 4/5] i965/vs: Extend opt_compute_to_mrf to handle limited "reswizzling"

2012-12-11 Thread Eric Anholt
The way our visitor works, scalar expression/swizzle results that get
stored in channels other than .x will have an intermediate MOV from
their result in the .x channel to the real .y (or whatever) channel, and
similarly for vec2/vec3 results.

By knowing how to adjust DP4-type instructions for optimizing out a
swizzled MOV, we can reduce instructions in common matrix multiplication
cases.
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp |   99 ++--
 src/mesa/drivers/dri/i965/brw_vec4.h   |2 +
 .../dri/i965/test_vec4_register_coalesce.cpp   |   21 +
 3 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 436ba97..7ab37e7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -600,6 +600,85 @@ vec4_visitor::move_push_constants_to_pull_constants()
pack_uniform_registers();
 }
 
+bool
+vec4_instruction::can_reswizzle_dst(int dst_writemask,
+int swizzle,
+int swizzle_mask)
+{
+   /* If this instruction sets anything not referenced by swizzle, then we'd
+* totally break it when we reswizzle.
+*/
+   if (dst.writemask & ~swizzle_mask)
+  return false;
+
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+  return true;
+   default:
+  /* Check if there happens to be no reswizzling required. */
+  for (int c = 0; c < 4; c++) {
+ int bit = 1 << BRW_GET_SWZ(swizzle, c);
+ /* Skip components of the swizzle not used by the dst. */
+ if (!(dst_writemask & (1 << c)))
+continue;
+
+ /* We don't do the reswizzling yet, so just sanity check that we
+  * don't have to.
+  */
+ if (bit != (1 << c))
+return false;
+  }
+  return true;
+   }
+}
+
+/**
+ * For any channels in the swizzle's source that were populated by this
+ * instruction, rewrite the instruction to put the appropriate result directly
+ * in those channels.
+ *
+ * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
+ */
+void
+vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
+{
+   int new_writemask = 0;
+
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+  for (int c = 0; c < 4; c++) {
+ int bit = 1 << BRW_GET_SWZ(swizzle, c);
+ /* Skip components of the swizzle not used by the dst. */
+ if (!(dst_writemask & (1 << c)))
+continue;
+ /* If we were populating this component, then populate the
+  * corresponding channel of the new dst.
+  */
+ if (dst.writemask & bit)
+new_writemask |= (1 << c);
+  }
+  dst.writemask = new_writemask;
+  break;
+   default:
+  for (int c = 0; c < 4; c++) {
+ int bit = 1 << BRW_GET_SWZ(swizzle, c);
+ /* Skip components of the swizzle not used by the dst. */
+ if (!(dst_writemask & (1 << c)))
+continue;
+
+ /* We don't do the reswizzling yet, so just sanity check that we
+  * don't have to.
+  */
+ assert(bit == (1 << c));
+  }
+  break;
+   }
+}
+
 /*
  * Tries to reduce extra MOV instructions by taking GRFs that get just
  * written and then MOVed into an MRF and making the original write of
@@ -641,26 +720,20 @@ vec4_visitor::opt_compute_to_mrf()
*/
   bool chans_needed[4] = {false, false, false, false};
   int chans_remaining = 0;
+  int swizzle_mask = 0;
   for (int i = 0; i < 4; i++) {
 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);
 
 if (!(inst->dst.writemask & (1 << i)))
continue;
 
-/* We don't handle compute-to-MRF across a swizzle.  We would
- * need to be able to rewrite instructions above to output
- * results to different channels.
- */
-if (chan != i)
-   chans_remaining = 5;
+ swizzle_mask |= (1 << chan);
 
 if (!chans_needed[chan]) {
chans_needed[chan] = true;
chans_remaining++;
 }
   }
-  if (chans_remaining > 4)
-continue;
 
   /* Now walk up the instruction stream trying to see if we can
* rewrite everything writing to the GRF into the MRF instead.
@@ -689,6 +762,13 @@ vec4_visitor::opt_compute_to_mrf()
   }
}
 
+/* If we can't handle the swizzle, bail. */
+if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
+  inst->src[0].swizzle,
+  swizzle_mask)) {
+   break;
+}
+
/* Mark which channels we found unconditional writes for. */
if (!scan_inst->predicate) {
   for (int i = 0; i < 4

[Mesa-dev] [PATCH 5/5] i965: Generalize VS compute-to-MRF for compute-to-another-GRF, too.

2012-12-11 Thread Eric Anholt
No statistically significant performance difference on glbenchmark 2.7
(n=60).  It reduces cycles spent in the vertex shader by 3.3% +/- 0.8%
(n=5), but that's only about .3% of all cycles spent according to the
fixed shader_time.
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp |  129 +++-
 src/mesa/drivers/dri/i965/brw_vec4.h   |2 +-
 .../dri/i965/test_vec4_register_coalesce.cpp   |   58 -
 3 files changed, 128 insertions(+), 61 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 7ab37e7..079bbab 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -680,12 +680,12 @@ vec4_instruction::reswizzle_dst(int dst_writemask, int 
swizzle)
 }
 
 /*
- * Tries to reduce extra MOV instructions by taking GRFs that get just
- * written and then MOVed into an MRF and making the original write of
- * the GRF write directly to the MRF instead.
+ * Tries to reduce extra MOV instructions by taking temporary GRFs that get
+ * just written and then MOVed into another reg and making the original write
+ * of the GRF write directly to the final destination instead.
  */
 bool
-vec4_visitor::opt_compute_to_mrf()
+vec4_visitor::opt_register_coalesce()
 {
bool progress = false;
int next_ip = 0;
@@ -699,24 +699,25 @@ vec4_visitor::opt_compute_to_mrf()
   next_ip++;
 
   if (inst->opcode != BRW_OPCODE_MOV ||
+  (inst->dst.file != GRF && inst->dst.file != MRF) ||
  inst->predicate ||
- inst->dst.file != MRF || inst->src[0].file != GRF ||
+ inst->src[0].file != GRF ||
  inst->dst.type != inst->src[0].type ||
  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
 continue;
 
-  int mrf = inst->dst.reg;
+  bool to_mrf = (inst->dst.file == MRF);
 
-  /* Can't compute-to-MRF this GRF if someone else was going to
+  /* Can't coalesce this GRF if someone else was going to
* read it later.
*/
   if (this->virtual_grf_use[inst->src[0].reg] > ip)
 continue;
 
-  /* We need to check interference with the MRF between this
-   * instruction and the earliest instruction involved in writing
-   * the GRF we're eliminating.  To do that, keep track of which
-   * of our source channels we've seen initialized.
+  /* We need to check interference with the final destination between this
+   * instruction and the earliest instruction involved in writing the GRF
+   * we're eliminating.  To do that, keep track of which of our source
+   * channels we've seen initialized.
*/
   bool chans_needed[4] = {false, false, false, false};
   int chans_remaining = 0;
@@ -735,8 +736,9 @@ vec4_visitor::opt_compute_to_mrf()
 }
   }
 
-  /* Now walk up the instruction stream trying to see if we can
-   * rewrite everything writing to the GRF into the MRF instead.
+  /* Now walk up the instruction stream trying to see if we can rewrite
+   * everything writing to the temporary to write into the destination
+   * instead.
*/
   vec4_instruction *scan_inst;
   for (scan_inst = (vec4_instruction *)inst->prev;
@@ -745,22 +747,21 @@ vec4_visitor::opt_compute_to_mrf()
 if (scan_inst->dst.file == GRF &&
 scan_inst->dst.reg == inst->src[0].reg &&
 scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
-   /* Found something writing to the reg we want to turn into
-* a compute-to-MRF.
-*/
-
-   /* SEND instructions can't have MRF as a destination. */
-   if (scan_inst->mlen)
-  break;
-
-   if (intel->gen >= 6) {
-  /* gen6 math instructions must have the destination be
-   * GRF, so no compute-to-MRF for them.
-   */
-  if (scan_inst->is_math()) {
- break;
-  }
-   }
+/* Found something writing to the reg we want to coalesce away. */
+if (to_mrf) {
+   /* SEND instructions can't have MRF as a destination. */
+   if (scan_inst->mlen)
+  break;
+
+   if (intel->gen >= 6) {
+  /* gen6 math instructions must have the destination be
+   * GRF, so no compute-to-MRF for them.
+   */
+  if (scan_inst->is_math()) {
+ break;
+  }
+   }
+}
 
 /* If we can't handle the swizzle, bail. */
 if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
@@ -784,9 +785,8 @@ vec4_visitor::opt_compute_to_mrf()
   break;
 }
 
-/* We don't handle flow control here.  Most computation of
- * values that end up in MRFs are shortly before the MRF
- * write anyway.
+/* We don't han

[Mesa-dev] [PATCH 00/10] glsl: Implement varying packing.

2012-12-11 Thread Paul Berry
This patch series adds varying packing to Mesa, so that we can handle
varyings composed of things other than vec4's without using up extra
varying components.

For the initial implementation I've chosen a strategy that operates
exclusively at the GLSL IR level, so that it doesn't require the
cooperation of the driver back-ends.  This means that varying packing
should be immediately useful for all drivers.  However, there are some
types of varying packing that can't be done using GLSL IR alone (for
example, packing a "noperspective" varying and a "smooth" varying
together), but should be possible on some drivers with a small amount
of back-end work.  I'm deferring that work for a later patch series.
Also, packing of floats and ints together into the same "flat varying"
should be possible for drivers that implement
ARB_shader_bit_encoding--I'm also deferring that for a later patch
series.

The strategy is as follows:

- Before assigning locations to varyings, we sort them into "packing
  classes" based on base type and interpolation mode (this is to
  ensure that we don't try to pack floats with ints, or smooth with
  flat, for example).

- Within each packing class, we sort the varyings based on the number
  of vector elements.  Vec4's (as well as matrices and arrays composed
  of vec4's) are packed first, then vec2's, then scalars, since this
  allows us to align them all to their natural alignment boundary, so
  we avoid the performance penalty of "double parking" a varying
  across two varying slots.  Vec3's are packed last, double parking
  them if necessary.

- For any varying slot that doesn't contain exactly one vec4, we
  generate GLSL IR to manually pack/unpack the varying in the shader.
  For instance, the following fragment shader:

  varying vec2 a;
  varying vec2 b;
  varying vec3 c;
  varying vec3 d;
  main()
  {
...
  }

  would get rewritten as follows:

  varying vec4 packed0;
  varying vec4 packed1;
  varying vec4 packed2;
  vec2 a;
  vec2 b;
  vec3 c;
  vec3 d;
  main()
  {
a = packed0.xy;
b = packed0.zw;
c = packed1.xyz;
d.x = packed1.w; // d is "double parked" across slots 1 and 2
d.yz = packed2.xy;
...
  }

  This GLSL IR is generated by a lowering pass, so that in the future
  we will have the option of disabling it for driver back-ends that
  are capable of natively understanding the packed varying format.

- Finally, the linker code to handle transform feedback is modified to
  account for varying packing (e.g. by feeding back just a subset of
  the components of a varying slot rather than the entire varying
  slot).  Fortunately transform feedback already has the
  infrastructure necessary to do this, since it was needed in order to
  implement glClipDistance.


I believe this is enough to be useful for the vast majority of
programs, and to get us passing the GLES3 conformance tests.


Additional improvements, which I'm planning to defer to later patch
series, include:

- Allow uints and ints to be packed together in the same varying slot.
  This should be possible on all back-ends, since ints and uints may
  be interconverted without losing information.

- On back-ends that support ARB_shader_bit_encoding, allow floats and
  ints to be packed together in the same varying slot, since
  ARB_shader_bit_encoding allows floating-point values to be encoded
  into ints without losing information.

- On back-ends that can mix interpolation modes within a single
  varying slot, allow additional packing, with help from the driver
  back-end.  For instance, i965 gen6 and above can in principle mix
  together all interpolation modes except for "flat" within a single
  varying slot, if we do a hopefully small amount of back-end work.

- Allow a driver back-end to advertise a larger number of varying
  components to the linker than it advertises to the client
  program--this will allow us to ensure that varying packing *never*
  fails.  For example, on i965 gen6 and above, after the above
  improvements are made, we should be able to pack any possible
  combination of varyings with a maximum waste of 3 varying
  components.  That means, for example, that if the i965 driver
  advertises 17 varying slots to the linker (== 68 varying
  components), but advertises only 64 varying components to the the
  client program, then varying packing will always succeed.

Note: I also have a new piglit test that exercises this code; I'll be
publishing that to the Piglit list ASAP.

[PATCH 01/10] glsl/lower_clip_distance: Update symbol table.
[PATCH 02/10] glsl/linker: Always invalidate shader ins/outs, even in corner 
cases.
[PATCH 03/10] glsl/linker: Make separate ir_variable field to mean "unmatched".
[PATCH 04/10] glsl: Create a field to store fractional varying locations.
[PATCH 05/10] glsl/linker: Defer recording transform feedback locations.
[PATCH 06/10] glsl/linker: Subdivide the first phase of varying assignment.
[PATCH 07/10] glsl/linker: Sort varyings by packing class, t

[Mesa-dev] [PATCH 01/10] glsl/lower_clip_distance: Update symbol table.

2012-12-11 Thread Paul Berry
This patch modifies the clip distance lowering pass so that the new
symbol it generates (glClipDistanceMESA) is added to the shader's
symbol table.

This will allow a later patch to modify the linker so that it finds
transform feedback varyings using the symbol table rather than having
to iterate through all the declarations in the shader.
---
 src/glsl/ir_optimization.h   | 2 +-
 src/glsl/linker.cpp  | 5 +++--
 src/glsl/lower_clip_distance.cpp | 8 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 2220d51..628096e 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -72,7 +72,7 @@ bool lower_noise(exec_list *instructions);
 bool lower_variable_index_to_cond_assign(exec_list *instructions,
 bool lower_input, bool lower_output, bool lower_temp, bool lower_uniform);
 bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
-bool lower_clip_distance(exec_list *instructions);
+bool lower_clip_distance(gl_shader *shader);
 void lower_output_reads(exec_list *instructions);
 void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
 bool optimize_redundant_jumps(exec_list *instructions);
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 29fc5d8..802323e 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2568,8 +2568,9 @@ link_shaders(struct gl_context *ctx, struct 
gl_shader_program *prog)
   if (!prog->LinkStatus)
 goto done;
 
-  if (ctx->ShaderCompilerOptions[i].LowerClipDistance)
- lower_clip_distance(prog->_LinkedShaders[i]->ir);
+  if (ctx->ShaderCompilerOptions[i].LowerClipDistance) {
+ lower_clip_distance(prog->_LinkedShaders[i]);
+  }
 
   unsigned max_unroll = ctx->ShaderCompilerOptions[i].MaxUnrollIterations;
 
diff --git a/src/glsl/lower_clip_distance.cpp b/src/glsl/lower_clip_distance.cpp
index 0316914..09bdc36 100644
--- a/src/glsl/lower_clip_distance.cpp
+++ b/src/glsl/lower_clip_distance.cpp
@@ -45,6 +45,7 @@
  * LowerClipDistance flag in gl_shader_compiler_options to true.
  */
 
+#include "glsl_symbol_table.h"
 #include "ir_hierarchical_visitor.h"
 #include "ir.h"
 
@@ -334,11 +335,14 @@ lower_clip_distance_visitor::visit_leave(ir_call *ir)
 
 
 bool
-lower_clip_distance(exec_list *instructions)
+lower_clip_distance(gl_shader *shader)
 {
lower_clip_distance_visitor v;
 
-   visit_list_elements(&v, instructions);
+   visit_list_elements(&v, shader->ir);
+
+   if (v.new_clip_distance_var)
+  shader->symbols->add_variable(v.new_clip_distance_var);
 
return v.progress;
 }
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/10] glsl/linker: Always invalidate shader ins/outs, even in corner cases.

2012-12-11 Thread Paul Berry
Previously, link_invalidate_variable_locations() was only called
during assign_attribute_or_color_locations() and
assign_varying_locations().  This meant that in the corner case when
there was only a vertex shader, and varyings were being captured by
transform feedback, link_invalidate_variable_locations() wasn't being
called for the varyings.

This patch migrates the calls to link_invalidate_variable_locations()
to link_shaders(), so that they will be called in all circumstances.
In addition, it modifies the call semantics so that
link_invalidate_variable_locations() need only be called once per
shader stage (rather than once for inputs and once for outputs).
---
 src/glsl/linker.cpp | 43 +++
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 802323e..2523dc9 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -200,19 +200,31 @@ linker_warning(gl_shader_program *prog, const char *fmt, 
...)
 
 
 void
-link_invalidate_variable_locations(gl_shader *sh, enum ir_variable_mode mode,
-  int generic_base)
+link_invalidate_variable_locations(gl_shader *sh, int input_base,
+   int output_base)
 {
foreach_list(node, sh->ir) {
   ir_variable *const var = ((ir_instruction *) node)->as_variable();
 
-  if ((var == NULL) || (var->mode != (unsigned) mode))
-continue;
+  if (var == NULL)
+ continue;
+
+  int base;
+  switch (var->mode) {
+  case ir_var_in:
+ base = input_base;
+ break;
+  case ir_var_out:
+ base = output_base;
+ break;
+  default:
+ continue;
+  }
 
   /* Only assign locations for generic attributes / varyings / etc.
*/
-  if ((var->location >= generic_base) && !var->explicit_location)
- var->location = -1;
+  if ((var->location >= base) && !var->explicit_location)
+ var->location = -1;
}
 }
 
@@ -1309,8 +1321,6 @@ assign_attribute_or_color_locations(gl_shader_program 
*prog,
   (target_index == MESA_SHADER_VERTEX) ? ir_var_in : ir_var_out;
 
 
-   link_invalidate_variable_locations(sh, direction, generic_base);
-
/* Temporary storage for the set of attributes that need locations assigned.
 */
struct temp_attr {
@@ -2072,10 +2082,6 @@ assign_varying_locations(struct gl_context *ctx,
 *not being inputs.  This lets the optimizer eliminate them.
 */
 
-   link_invalidate_variable_locations(producer, ir_var_out, VERT_RESULT_VAR0);
-   if (consumer)
-  link_invalidate_variable_locations(consumer, ir_var_in, 
FRAG_ATTRIB_VAR0);
-
foreach_list(node, producer->ir) {
   ir_variable *const output_var = ((ir_instruction *) node)->as_variable();
 
@@ -2578,6 +2584,19 @@ link_shaders(struct gl_context *ctx, struct 
gl_shader_program *prog)
 ;
}
 
+   /* Mark all generic shader inputs and outputs as unpaired. */
+   if (prog->_LinkedShaders[MESA_SHADER_VERTEX] != NULL) {
+  link_invalidate_variable_locations(
+prog->_LinkedShaders[MESA_SHADER_VERTEX],
+VERT_ATTRIB_GENERIC0, VERT_RESULT_VAR0);
+   }
+   /* FINISHME: Geometry shaders not implemented yet */
+   if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT] != NULL) {
+  link_invalidate_variable_locations(
+prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
+FRAG_ATTRIB_VAR0, FRAG_RESULT_DATA0);
+   }
+
/* FINISHME: The value of the max_attribute_index parameter is
 * FINISHME: implementation dependent based on the value of
 * FINISHME: GL_MAX_VERTEX_ATTRIBS.  GL_MAX_VERTEX_ATTRIBS must be
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/10] glsl/linker: Make separate ir_variable field to mean "unmatched".

2012-12-11 Thread Paul Berry
Previously, the linker used a value of -1 in ir_variable::location to
denote a generic input or output of the shader that had not yet been
matched up to a variable in another pipeline stage.

This patch introduces a new ir_variable field,
is_unmatched_generic_inout, for that purpose.

In future patches, this will allow us to separate the process of
matching varyings between shader stages from the processes of
assigning locations to those varying.  That will in turn pave the way
for packing varyings.
---
 src/glsl/ir.h   |  9 +
 src/glsl/linker.cpp | 18 ++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 89c516c..e2ecb3d 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -437,6 +437,15 @@ public:
unsigned has_initializer:1;
 
/**
+* Is this variable a generic output or input that has not yet been matched
+* up to a variable in another stage of the pipeline?
+*
+* This is used by the linker as scratch storage while assigning locations
+* to generic inputs and outputs.
+*/
+   unsigned is_unmatched_generic_inout:1;
+
+   /**
 * \brief Layout qualifier for gl_FragDepth.
 *
 * This is not equal to \c ir_depth_layout_none if and only if this
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 2523dc9..ee6dc25 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -225,6 +225,11 @@ link_invalidate_variable_locations(gl_shader *sh, int 
input_base,
*/
   if ((var->location >= base) && !var->explicit_location)
  var->location = -1;
+
+  if ((var->location == -1) && !var->explicit_location)
+ var->is_unmatched_generic_inout = 1;
+  else
+ var->is_unmatched_generic_inout = 0;
}
 }
 
@@ -1362,6 +1367,7 @@ assign_attribute_or_color_locations(gl_shader_program 
*prog,
 if (prog->AttributeBindings->get(binding, var->name)) {
assert(binding >= VERT_ATTRIB_GENERIC0);
var->location = binding;
+var->is_unmatched_generic_inout = 0;
 }
   } else if (target_index == MESA_SHADER_FRAGMENT) {
 unsigned binding;
@@ -1370,6 +1376,7 @@ assign_attribute_or_color_locations(gl_shader_program 
*prog,
 if (prog->FragDataBindings->get(binding, var->name)) {
assert(binding >= FRAG_RESULT_DATA0);
var->location = binding;
+var->is_unmatched_generic_inout = 0;
 
if (prog->FragDataIndexBindings->get(index, var->name)) {
   var->index = index;
@@ -1485,6 +1492,7 @@ assign_attribute_or_color_locations(gl_shader_program 
*prog,
   }
 
   to_assign[i].var->location = generic_base + location;
+  to_assign[i].var->is_unmatched_generic_inout = 0;
   used_locations |= (use_mask << location);
}
 
@@ -1508,7 +1516,7 @@ demote_shader_inputs_and_outputs(gl_shader *sh, enum 
ir_variable_mode mode)
* its value is used by other shader stages.  This will cause the 
variable
* to have a location assigned.
*/
-  if (var->location == -1) {
+  if (var->is_unmatched_generic_inout) {
 var->mode = ir_var_auto;
   }
}
@@ -1985,7 +1993,7 @@ void
 assign_varying_location(ir_variable *input_var, ir_variable *output_var,
 unsigned *input_index, unsigned *output_index)
 {
-   if (output_var->location != -1) {
+   if (!output_var->is_unmatched_generic_inout) {
   /* Location already assigned. */
   return;
}
@@ -1993,9 +2001,11 @@ assign_varying_location(ir_variable *input_var, 
ir_variable *output_var,
if (input_var) {
   assert(input_var->location == -1);
   input_var->location = *input_index;
+  input_var->is_unmatched_generic_inout = 0;
}
 
output_var->location = *output_index;
+   output_var->is_unmatched_generic_inout = 0;
 
/* FINISHME: Support for "varying" records in GLSL 1.50. */
assert(!output_var->type->is_record());
@@ -2105,7 +2115,7 @@ assign_varying_locations(struct gl_context *ctx,
 
  if (!tfeedback_decls[i].is_assigned() &&
  tfeedback_decls[i].matches_var(output_var)) {
-if (output_var->location == -1) {
+if (output_var->is_unmatched_generic_inout) {
assign_varying_location(input_var, output_var, &input_index,
&output_index);
 }
@@ -2124,7 +2134,7 @@ assign_varying_locations(struct gl_context *ctx,
  if ((var == NULL) || (var->mode != ir_var_in))
 continue;
 
- if (var->location == -1) {
+ if (var->is_unmatched_generic_inout) {
 if (prog->Version <= 120) {
/* On page 25 (page 31 of the PDF) of the GLSL 1.20 spec:
 *
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/10] glsl: Create a field to store fractional varying locations.

2012-12-11 Thread Paul Berry
Currently, the location of each varying is recorded in ir_variable as
a multiple of the size of a vec4.  In order to pack varyings, we need
to be able to record, e.g. that a vec2 is stored in the second half of
a varying slot rather than the first half.

This patch introduces a field ir_variable::location_frac, which
represents the offset within a vec4 where a varying's value is stored.
Varyings that are not subject to packing will always have a
location_frac value of zero.
---
 src/glsl/ir.cpp | 1 +
 src/glsl/ir.h   | 9 +
 src/glsl/linker.cpp | 6 --
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 7b0a487..703f5ec 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -1492,6 +1492,7 @@ ir_variable::ir_variable(const struct glsl_type *type, 
const char *name,
this->explicit_location = false;
this->has_initializer = false;
this->location = -1;
+   this->location_frac = 0;
this->uniform_block = -1;
this->warn_extension = NULL;
this->constant_value = NULL;
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index e2ecb3d..85fc5ce 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -446,6 +446,15 @@ public:
unsigned is_unmatched_generic_inout:1;
 
/**
+* If non-zero, then this variable may be packed along with other variables
+* into a single varying slot, so this offset should be applied when
+* accessing components.  For example, an offset of 1 means that the x
+* component of this variable is actually stored in component y of the
+* location specified by \c location.
+*/
+   unsigned location_frac:2;
+
+   /**
 * \brief Layout qualifier for gl_FragDepth.
 *
 * This is not equal to \c ir_depth_layout_none if and only if this
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index ee6dc25..b13a6aa 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -226,10 +226,12 @@ link_invalidate_variable_locations(gl_shader *sh, int 
input_base,
   if ((var->location >= base) && !var->explicit_location)
  var->location = -1;
 
-  if ((var->location == -1) && !var->explicit_location)
+  if ((var->location == -1) && !var->explicit_location) {
  var->is_unmatched_generic_inout = 1;
-  else
+ var->location_frac = 0;
+  } else {
  var->is_unmatched_generic_inout = 0;
+  }
}
 }
 
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 05/10] glsl/linker: Defer recording transform feedback locations.

2012-12-11 Thread Paul Berry
This patch subdivides the loop that assigns varying locations into two
phases: one phase to match up varyings between shader stages (and
assign them varying locations), and a second phase to record the
varying assignments for use by transform feedback.

This paves the way for varying packing, which will require us to
further subdivide the first phase.

In addition, it lets us avoid a clumsy O(n^2) algorithm, since we can
now record the locations of all transform feedback varyings in a
single pass through the tfeedback_decls array, rather than have to
iterate through the array after assigning each varying.
---
 src/glsl/linker.cpp | 103 
 1 file changed, 48 insertions(+), 55 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index b13a6aa..dd77010 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1537,18 +1537,12 @@ public:
static bool is_same(const tfeedback_decl &x, const tfeedback_decl &y);
bool assign_location(struct gl_context *ctx, struct gl_shader_program *prog,
 ir_variable *output_var);
-   bool accumulate_num_outputs(struct gl_shader_program *prog, unsigned 
*count);
+   unsigned accumulate_num_outputs(struct gl_shader_program *prog);
bool store(struct gl_context *ctx, struct gl_shader_program *prog,
   struct gl_transform_feedback_info *info, unsigned buffer,
   const unsigned max_outputs) const;
-
-   /**
-* True if assign_location() has been called for this object.
-*/
-   bool is_assigned() const
-   {
-  return this->location != -1;
-   }
+   ir_variable *find_output_var(gl_shader_program *prog,
+gl_shader *producer) const;
 
bool is_next_buffer_separator() const
{
@@ -1561,19 +1555,8 @@ public:
}
 
/**
-* Determine whether this object refers to the variable var.
-*/
-   bool matches_var(ir_variable *var) const
-   {
-  if (this->is_clip_distance_mesa)
- return strcmp(var->name, "gl_ClipDistanceMESA") == 0;
-  else
- return strcmp(var->name, this->var_name) == 0;
-   }
-
-   /**
 * The total number of varying components taken up by this variable.  Only
-* valid if is_assigned() is true.
+* valid if assign_location() has been called.
 */
unsigned num_components() const
{
@@ -1822,34 +1805,18 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
 }
 
 
-bool
-tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog,
-   unsigned *count)
+unsigned
+tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog)
 {
if (!this->is_varying()) {
-  return true;
-   }
-
-   if (!this->is_assigned()) {
-  /* From GL_EXT_transform_feedback:
-   *   A program will fail to link if:
-   *
-   *   * any variable name specified in the  array is not
-   * declared as an output in the geometry shader (if present) or
-   * the vertex shader (if no geometry shader is present);
-   */
-  linker_error(prog, "Transform feedback varying %s undeclared.",
-   this->orig_name);
-  return false;
+  return 0;
}
 
unsigned translated_size = this->size;
if (this->is_clip_distance_mesa)
   translated_size = (translated_size + 3) / 4;
 
-   *count += translated_size * this->matrix_columns;
-
-   return true;
+   return translated_size * this->matrix_columns;
 }
 
 
@@ -1926,6 +1893,29 @@ tfeedback_decl::store(struct gl_context *ctx, struct 
gl_shader_program *prog,
 }
 
 
+ir_variable *
+tfeedback_decl::find_output_var(gl_shader_program *prog,
+gl_shader *producer) const
+{
+   const char *name = this->is_clip_distance_mesa
+  ? "gl_ClipDistanceMESA" : this->var_name;
+   ir_variable *var = producer->symbols->get_variable(name);
+   if (var && var->mode == ir_var_out)
+  return var;
+
+   /* From GL_EXT_transform_feedback:
+*   A program will fail to link if:
+*
+*   * any variable name specified in the  array is not
+* declared as an output in the geometry shader (if present) or
+* the vertex shader (if no geometry shader is present);
+*/
+   linker_error(prog, "Transform feedback varying %s undeclared.",
+this->orig_name);
+   return NULL;
+}
+
+
 /**
  * Parse all the transform feedback declarations that were passed to
  * glTransformFeedbackVaryings() and store them in tfeedback_decl objects.
@@ -2110,21 +2100,25 @@ assign_varying_locations(struct gl_context *ctx,
  assign_varying_location(input_var, output_var, &input_index,
  &output_index);
   }
+   }
 
-  for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
- if (!tfeedback_decls[i].is_varying())
-continue;
+   for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+  if (!tfeedback_decls[i].is_varying())
+ 

[Mesa-dev] [PATCH 06/10] glsl/linker: Subdivide the first phase of varying assignment.

2012-12-11 Thread Paul Berry
This patch further subdivides the loop that assigns varying locations
into two phases: one phase to match up the varyings between shader
stages, and one phase to assign them varying locations.

In between the two phases the matched varyings are stored in a new
data structure called varying_matches.  This will free us to be able
to assign varying locations in any order, which will pave the way for
packing varyings.

Note that the new varying_matches::assign_locations() function returns
the number of varying slots that were used; this return value will be
used in a future patch.
---
 src/glsl/linker.cpp | 207 +---
 1 file changed, 163 insertions(+), 44 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index dd77010..7fa980f 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1961,58 +1961,167 @@ parse_tfeedback_decls(struct gl_context *ctx, struct 
gl_shader_program *prog,
 
 
 /**
- * Assign a location for a variable that is produced in one pipeline stage
- * (the "producer") and consumed in the next stage (the "consumer").
- *
- * \param input_var is the input variable declaration in the consumer.
- *
- * \param output_var is the output variable declaration in the producer.
- *
- * \param input_index is the counter that keeps track of assigned input
- *locations in the consumer.
- *
- * \param output_index is the counter that keeps track of assigned output
- *locations in the producer.
+ * Data structure recording the relationship between outputs of one shader
+ * stage (the "producer") and inputs of another (the "consumer").
+ */
+class varying_matches
+{
+public:
+   varying_matches();
+   ~varying_matches();
+   void record(ir_variable *producer_var, ir_variable *consumer_var);
+   unsigned assign_locations();
+   void store_locations(unsigned producer_base, unsigned consumer_base) const;
+
+private:
+   /**
+* Structure recording the relationship between a single producer output
+* and a single consumer input.
+*/
+   struct match {
+  /**
+   * The output variable in the producer stage.
+   */
+  ir_variable *producer_var;
+
+  /**
+   * The input variable in the consumer stage.
+   */
+  ir_variable *consumer_var;
+
+  /**
+   * The location which has been assigned for this varying.  This is
+   * expressed in multiples of a float, with the first generic varying
+   * (i.e. the one referred to by VERT_RESULT_VAR0 or FRAG_ATTRIB_VAR0)
+   * represented by the value 0.
+   */
+  unsigned generic_location;
+   } *matches;
+
+   /**
+* The number of elements in the \c matches array that are currently in
+* use.
+*/
+   unsigned num_matches;
+
+   /**
+* The number of elements that were set aside for the \c matches array when
+* it was allocated.
+*/
+   unsigned matches_capacity;
+};
+
+
+varying_matches::varying_matches()
+{
+   /* Note: this initial capacity is rather arbitrarily chosen to be large
+* enough for many cases without wasting an unreasonable amount of space.
+* varying_matches::record() will resize the array if there are more than
+* this number of varyings.
+*/
+   this->matches_capacity = 8;
+   this->matches = (match *)
+  malloc(sizeof(*this->matches) * this->matches_capacity);
+   this->num_matches = 0;
+}
+
+
+varying_matches::~varying_matches()
+{
+   free(this->matches);
+}
+
+
+/**
+ * Record the given producer/consumer variable pair in the list of variables
+ * that should later be assigned locations.
  *
- * It is permissible for \c input_var to be NULL (this happens if a variable
- * is output by the producer and consumed by transform feedback, but not
- * consumed by the consumer).
+ * It is permissible for \c consumer_var to be NULL (this happens if a
+ * variable is output by the producer and consumed by transform feedback, but
+ * not consumed by the consumer).
  *
- * If the variable has already been assigned a location, this function has no
- * effect.
+ * If \c producer_var has already been paired up with a consumer_var, or
+ * producer_var is part of fixed pipeline functionality (and hence already has
+ * a location assigned), this function has no effect.
  */
 void
-assign_varying_location(ir_variable *input_var, ir_variable *output_var,
-unsigned *input_index, unsigned *output_index)
+varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
 {
-   if (!output_var->is_unmatched_generic_inout) {
-  /* Location already assigned. */
+   if (!producer_var->is_unmatched_generic_inout) {
+  /* Either a location already exists for this variable (since it is part
+   * of fixed functionality), or it has already been recorded as part of a
+   * previous match.
+   */
   return;
}
 
-   if (input_var) {
-  assert(input_var->location == -1);
-  input_var->location = *input_index;
-  input_v

[Mesa-dev] [PATCH 07/10] glsl/linker: Sort varyings by packing class, then vector size.

2012-12-11 Thread Paul Berry
This patch paves the way for varying packing by adding a sorting step
before varying assignment, which sorts the varyings into an order that
increases the likelihood of being able to find an efficient packing.

First, varyings are sorted into "packing classes" by considering
attributes that can't be mixed during varying packing--at the moment
this includes base type (float/int/uint/bool) and interpolation mode
(smooth/noperspective/flat/centroid), though later we will hopefully
be able to relax some of these restrictions.  The number of packing
classes places an upper limit on the amount of space that must be
wasted by varying packing, since in theory a shader might nave 4n+1
components worth of varyings in each of m packing classes, resulting
in 3m components worth of wasted space.

Then, within each packing class, varyings are sorted by vector size,
with vec4's coming first, then vec2's, then scalars, and then finally
vec3's.  The motivation for this order is that it ensures that the
only vectors that might be "double parked" (with part of the vector in
one varying slot and the remainder in another) are vec3's.

Note that the varyings aren't actually packed yet, merely placed in an
order that will facilitate packing.
---
 src/glsl/linker.cpp | 104 
 1 file changed, 104 insertions(+)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 7fa980f..cee7386 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1975,11 +1975,41 @@ public:
 
 private:
/**
+* Enum representing the order in which varyings are packed within a
+* packing class.
+*
+* Currently we pack vec4's first, then vec2's, then scalar values, then
+* vec3's.  This order ensures that the only vectors that are at risk of
+* having to be "double parked" (split between two adjacent varying slots)
+* are the vec3's.
+*/
+   enum packing_order_enum {
+  PACKING_ORDER_VEC4,
+  PACKING_ORDER_VEC2,
+  PACKING_ORDER_SCALAR,
+  PACKING_ORDER_VEC3,
+   };
+
+   static unsigned compute_packing_class(ir_variable *var);
+   static packing_order_enum compute_packing_order(ir_variable *var);
+   static int match_comparator(const void *x_generic, const void *y_generic);
+
+   /**
 * Structure recording the relationship between a single producer output
 * and a single consumer input.
 */
struct match {
   /**
+   * Packing class for this varying, computed by compute_packing_class().
+   */
+  unsigned packing_class;
+
+  /**
+   * Packing order for this varying, computed by compute_packing_order().
+   */
+  packing_order_enum packing_order;
+
+  /**
* The output variable in the producer stage.
*/
   ir_variable *producer_var;
@@ -2061,6 +2091,10 @@ varying_matches::record(ir_variable *producer_var, 
ir_variable *consumer_var)
  realloc(this->matches,
  sizeof(*this->matches) * this->matches_capacity);
}
+   this->matches[this->num_matches].packing_class
+  = this->compute_packing_class(producer_var);
+   this->matches[this->num_matches].packing_order
+  = this->compute_packing_order(producer_var);
this->matches[this->num_matches].producer_var = producer_var;
this->matches[this->num_matches].consumer_var = consumer_var;
this->num_matches++;
@@ -2077,6 +2111,10 @@ varying_matches::record(ir_variable *producer_var, 
ir_variable *consumer_var)
 unsigned
 varying_matches::assign_locations()
 {
+   /* Sort varying matches into an order that makes them easy to pack. */
+   qsort(this->matches, this->num_matches, sizeof(*this->matches),
+ &varying_matches::match_comparator);
+
unsigned generic_location = 0;
 
for (unsigned i = 0; i < this->num_matches; i++) {
@@ -2127,6 +2165,72 @@ varying_matches::store_locations(unsigned producer_base,
 
 
 /**
+ * Compute the "packing class" of the given varying.  This is an unsigned
+ * integer with the property that two variables in the same packing class can
+ * be safely backed into the same vec4.
+ */
+unsigned
+varying_matches::compute_packing_class(ir_variable *var)
+{
+   /* In this initial implementation we conservatively assume that variables
+* can only be packed if their base type (float/int/uint/bool) matches and
+* their interpolation and centroid qualifiers match.
+*
+* TODO: relax these restrictions when the driver back-end permits.
+*/
+   unsigned packing_class = var->centroid ? 1 : 0;
+   packing_class *= 4;
+   packing_class += var->interpolation;
+   packing_class *= GLSL_TYPE_ERROR;
+   packing_class += var->type->get_scalar_type()->base_type;
+   return packing_class;
+}
+
+
+/**
+ * Compute the "packing order" of the given varying.  This is a sort key we
+ * use to determine when to attempt to pack the given varying relative to
+ * other varyings in the same packing class.
+ */
+varying_matches::packing_order_enum
+varying_matches::compu

[Mesa-dev] [PATCH 08/10] glsl: Add a lowering pass for packing varyings.

2012-12-11 Thread Paul Berry
This lowering pass generates GLSL code that manually packs varyings
into vec4 slots, for the benefit of back-ends that don't support
packed varyings natively.

No functional change--the lowering pass is not yet used.
---
 src/glsl/Makefile.sources  |   1 +
 src/glsl/ir_optimization.h |   3 +
 src/glsl/lower_packed_varyings.cpp | 371 +
 3 files changed, 375 insertions(+)
 create mode 100644 src/glsl/lower_packed_varyings.cpp

diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 5e098fc..d984c5c 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -57,6 +57,7 @@ LIBGLSL_FILES = \
$(GLSL_SRCDIR)/lower_jumps.cpp \
$(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \
$(GLSL_SRCDIR)/lower_noise.cpp \
+   $(GLSL_SRCDIR)/lower_packed_varyings.cpp \
$(GLSL_SRCDIR)/lower_texture_projection.cpp \
$(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \
$(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 628096e..6b95191 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -75,6 +75,9 @@ bool lower_quadop_vector(exec_list *instructions, bool 
dont_lower_swz);
 bool lower_clip_distance(gl_shader *shader);
 void lower_output_reads(exec_list *instructions);
 void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
+void lower_packed_varyings(void *mem_ctx, unsigned location_base,
+   unsigned locations_used, ir_variable_mode mode,
+   gl_shader *shader);
 bool optimize_redundant_jumps(exec_list *instructions);
 bool optimize_split_arrays(exec_list *instructions, bool linked);
 
diff --git a/src/glsl/lower_packed_varyings.cpp 
b/src/glsl/lower_packed_varyings.cpp
new file mode 100644
index 000..4cb9066
--- /dev/null
+++ b/src/glsl/lower_packed_varyings.cpp
@@ -0,0 +1,371 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file lower_varyings_to_packed.cpp
+ *
+ * This lowering pass generates GLSL code that manually packs varyings into
+ * vec4 slots, for the benefit of back-ends that don't support packed varyings
+ * natively.
+ *
+ * For example, the following shader:
+ *
+ *   out mat3x2 foo;  // location=4, location_frac=0
+ *   out vec3 bar[2]; // location=5, location_frac=2
+ *
+ *   main()
+ *   {
+ * ...
+ *   }
+ *
+ * Is rewritten to:
+ *
+ *   mat3x2 foo;
+ *   vec3 bar[2];
+ *   out vec4 packed4; // location=4, location_frac=0
+ *   out vec4 packed5; // location=5, location_frac=0
+ *   out vec4 packed6; // location=6, location_frac=0
+ *
+ *   main()
+ *   {
+ * ...
+ * packed4.xy = foo[0];
+ * packed4.zw = foo[1];
+ * packed5.xy = foo[2];
+ * packed5.zw = bar[0].xy;
+ * packed6.x = bar[0].z;
+ * packed6.yzw = bar[1];
+ *   }
+ *
+ * This lowering pass properly handles "double parking" of a varying vector
+ * across two varying slots.  For example, in the code above, two of the
+ * components of bar[0] are stored in packed5, and the remaining component is
+ * stored in packed6.
+ *
+ * Note that in theory, the extra instructions may cause some loss of
+ * performance.  However, hopefully in most cases the performance loss will
+ * either be absorbed by a later optimization pass, or it will be offset by
+ * memory bandwidth savings (because fewer varyings are used).
+ */
+
+#include "glsl_symbol_table.h"
+#include "ir.h"
+#include "ir_optimization.h"
+
+/**
+ * Visitor that performs lowering packing.  For each varying declared in the
+ * shader, this visitor determines whether it needs to be packed.  If so, it
+ * demotes it to an ordinary global, creates new packed varyings, and
+ * generates assignments to convert between the original varying and the
+ * packed v

[Mesa-dev] [PATCH 09/10] glsl/linker: Pack within compound varyings.

2012-12-11 Thread Paul Berry
This patch implements varying packing within varyings that are
composed of multiple vectors of size less than 4 (e.g. arrays of
vec2's, or matrices with height less than 4).

Previously, such varyings used up a full 4-wide varying slot for each
constituent vector, meaning that some of the components of each
varying slot went unused.  For example, a mat4x3 would be stored as
follows:

      slots
  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
 <-column1->  x  <-column2->  x  <-column3->  x  <-column4->  x   matrix

(Each * represents a varying component, and the "x"s represent wasted
space).  In addition to wasting precious varying components, this
layout complicated transform feedback, since the constituents of the
varying are expected to be output to the transform feedback buffer
contiguously (e.g. without gaps between the columns, in the case of a
matrix).

This change packs the constituents of each varying together so that
all wasted space is at the end.  For the mat4x3 example, this looks
like so:

      slots
  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
 <-column1-> <-column2-> <-column3-> <-column4->  x   x   x   x   matrix

Note that matrix columns 2 and 3 now cross a boundary between varying
slots (a characteristic I call "double parking" of a varying).

We don't bother trying to eliminate the wasted space at the end of the
varying, since the patch that follows will take care of that.

Since compiler back-ends don't (yet) support this packed layout, the
lower_packed_varyings function is used to rewrite the shader into a
form where each varying occupies a full varying slot.  Later, if we
add native back-end support for varying packing, we can make this
lowering pass optional.
---
 src/glsl/linker.cpp | 85 ++---
 1 file changed, 48 insertions(+), 37 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index cee7386..d6f11a5 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1601,6 +1601,17 @@ private:
int location;
 
/**
+* If non-zero, then this variable may be packed along with other variables
+* into a single varying slot, so this offset should be applied when
+* accessing components.  For example, an offset of 1 means that the x
+* component of this variable is actually stored in component y of the
+* location specified by \c location.
+*
+* Only valid if location != -1.
+*/
+   unsigned location_frac;
+
+   /**
 * If location != -1, the number of vector elements in this variable, or 1
 * if this variable is a scalar.
 */
@@ -1739,6 +1750,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
   /* Array variable */
   const unsigned matrix_cols =
  output_var->type->fields.array->matrix_columns;
+  const unsigned vector_elements =
+ output_var->type->fields.array->vector_elements;
   unsigned actual_array_size = this->is_clip_distance_mesa ?
  prog->Vert.ClipDistanceArraySize : output_var->type->array_size();
 
@@ -1754,16 +1767,22 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
  if (this->is_clip_distance_mesa) {
 this->location =
output_var->location + this->array_subscript / 4;
+this->location_frac = this->array_subscript % 4;
  } else {
-this->location =
-   output_var->location + this->array_subscript * matrix_cols;
+unsigned fine_location
+   = output_var->location * 4 + output_var->location_frac;
+unsigned array_elem_size = vector_elements * matrix_cols;
+fine_location += array_elem_size * this->array_subscript;
+this->location = fine_location / 4;
+this->location_frac = fine_location % 4;
  }
  this->size = 1;
   } else {
  this->location = output_var->location;
+ this->location_frac = output_var->location_frac;
  this->size = actual_array_size;
   }
-  this->vector_elements = output_var->type->fields.array->vector_elements;
+  this->vector_elements = vector_elements;
   this->matrix_columns = matrix_cols;
   if (this->is_clip_distance_mesa)
  this->type = GL_FLOAT;
@@ -1778,6 +1797,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
  return false;
   }
   this->location = output_var->location;
+  this->location_frac = output_var->location_frac;
   this->size = 1;
   this->vector_elements = output_var->type->vector_elements;
   this->matrix_columns = output_var->type->matrix_columns;
@@ -1812,11 +1832,7 @@ tfeedback_decl::accumulate_num_outputs(struct 
gl_shader_program *prog)
   return 0;
}
 
-   unsigned translated_size = this->size;
-   if (this->is_clip_distance_mesa)
-  tran

[Mesa-dev] [PATCH 10/10] glsl/linker: Pack between varyings.

2012-12-11 Thread Paul Berry
This patch implements varying packing between varyings.

Previously, each varying occupied components 0 through N-1 of its
assigned varying slot, so there was no way to pack two varyings into
the same slot.  For example, if the varyings were a float, a vec2, a
vec3, and another vec2, they would be stored as follows:

      slots
  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
 flt  x   x   xx   x  <--vec3--->  xx   x   varyings

(Each * represents a varying component, and the "x"s represent wasted
space).

This change packs the varyings together to eliminate wasted space
between varyings, like so:

      slots
  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
   flt <--vec3--->  x   x   x   x   x   x   x   x   varyings

Note that we take advantage of the sort order introduced in previous
patches (vec4's first, then vec2's, then scalars, then vec3's) to
minimize how often a varying is "double parked" (split across varying
slots).
---
 src/glsl/linker.cpp | 49 +
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index d6f11a5..80aa260 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1996,6 +1996,7 @@ private:
 
static unsigned compute_packing_class(ir_variable *var);
static packing_order_enum compute_packing_order(ir_variable *var);
+   static unsigned compute_num_components(ir_variable *var);
static int match_comparator(const void *x_generic, const void *y_generic);
 
/**
@@ -2012,6 +2013,7 @@ private:
* Packing order for this varying, computed by compute_packing_order().
*/
   packing_order_enum packing_order;
+  unsigned num_components;
 
   /**
* The output variable in the producer stage.
@@ -2099,6 +2101,8 @@ varying_matches::record(ir_variable *producer_var, 
ir_variable *consumer_var)
   = this->compute_packing_class(producer_var);
this->matches[this->num_matches].packing_order
   = this->compute_packing_order(producer_var);
+   this->matches[this->num_matches].num_components
+  = this->compute_num_components(producer_var);
this->matches[this->num_matches].producer_var = producer_var;
this->matches[this->num_matches].consumer_var = consumer_var;
this->num_matches++;
@@ -2122,20 +2126,19 @@ varying_matches::assign_locations()
unsigned generic_location = 0;
 
for (unsigned i = 0; i < this->num_matches; i++) {
-  this->matches[i].generic_location = generic_location;
-
-  ir_variable *producer_var = this->matches[i].producer_var;
-
-  if (producer_var->type->is_array()) {
- const unsigned slots = producer_var->type->length
-* producer_var->type->fields.array->matrix_columns;
+  /* Advance to the next slot if this varying has a different packing
+   * class than the previous one, and we're not already on a slot
+   * boundary.
+   */
+  if (i > 0 && generic_location % 4 != 0 &&
+  this->matches[i - 1].packing_class
+  != this->matches[i].packing_class) {
+ generic_location += 4 - generic_location % 4;
+  }
 
- generic_location += 4 * slots;
-  } else {
- const unsigned slots = producer_var->type->matrix_columns;
+  this->matches[i].generic_location = generic_location;
 
- generic_location += 4 * slots;
-  }
+  generic_location += this->matches[i].num_components;
}
 
return (generic_location + 3) / 4;
@@ -2219,6 +,28 @@ varying_matches::compute_packing_order(ir_variable *var)
 
 
 /**
+ * Compute the number of components that this variable will occupy when
+ * properly packed.
+ */
+unsigned
+varying_matches::compute_num_components(ir_variable *var)
+{
+   const glsl_type *type = var->type;
+   unsigned multipiler = 1;
+
+   if (type->is_array()) {
+  multipiler *= type->length;
+  type = type->fields.array;
+   }
+
+   /* FINISHME: Support for "varying" records in GLSL 1.50. */
+   assert(!type->is_record());
+
+   return multipiler * type->components();
+}
+
+
+/**
  * Comparison function passed to qsort() to sort varyings by packing_class and
  * then by packing_order.
  */
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 42516] Assertion `src.File != TGSI_FILE_NULL' failed with llvmpipe renderer

2012-12-11 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=42516

Brian Paul  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |WORKSFORME

--- Comment #4 from Brian Paul  ---
There hasn't been any follow-up about testing something newer than Mesa 7.11
and I don't see the problem with later versions.  Closing.

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: Fix computation of default vertex attrib stride for 2_10_10_10 formats.

2012-12-11 Thread Paul Berry
Previously, if the client program didn't specify a stride when setting
up a vertex attribute, we used _mesa_sizeof_type() to compute the size
of the type, and multiplied it by the number of components.

This didn't work for the 2_10_10_10 formats, since _mesa_sizeof_type()
returns -1 for those types, resulting in all kinds of havoc, since it
was causing the hardware to be programmed with a negative stride
value.

This patch adds a new function _mesa_bytes_per_vertex_attrib(), which
is similar to the existing function _mesa_bytes_per_pixel(), but which
computes the size of a vertex attribute based on the type and the
number of formats.  For packed formats (currently only the 2_10_10_10
formats), it verifies that the number of components is correct and
returns the size of the packed format.  For unpacked formats, it
returns the size of the type times the number of components.

In addition, this patch adds an assertion so that if we ever forget to
update _mesa_bytes_per_vertex_attrib() when adding a new vertex
format, we'll see the problem quickly rather than having to debug a
subtle conformance test failure.

Fixes GLES3 conformance tests
vertex_type_2_10_10_10_rev_{conversion,divisor,stride_pointer}.test.
---
 src/mesa/main/glformats.c | 42 ++
 src/mesa/main/glformats.h |  3 +++
 src/mesa/main/varray.c|  3 ++-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index fefa9c4..b6b16ca 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -308,6 +308,48 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type)
 
 
 /**
+ * Get the number of bytes for a vertex attrib with the given number of
+ * components ant type.
+ *
+ * \param comps number of components.
+ * \param type data type.
+ *
+ * \return bytes per attribute, or -1 if a bad comps/type combination was 
given.
+ */
+GLint
+_mesa_bytes_per_vertex_attrib(GLint comps, GLenum type)
+{
+   switch (type) {
+   case GL_BYTE:
+   case GL_UNSIGNED_BYTE:
+  return comps * sizeof(GLubyte);
+   case GL_SHORT:
+   case GL_UNSIGNED_SHORT:
+  return comps * sizeof(GLshort);
+   case GL_INT:
+   case GL_UNSIGNED_INT:
+  return comps * sizeof(GLint);
+   case GL_FLOAT:
+  return comps * sizeof(GLfloat);
+   case GL_HALF_FLOAT_ARB:
+  return comps * sizeof(GLhalfARB);
+   case GL_DOUBLE:
+  return comps * sizeof(GLdouble);
+   case GL_FIXED:
+  return comps * sizeof(GLfixed);
+   case GL_INT_2_10_10_10_REV:
+   case GL_UNSIGNED_INT_2_10_10_10_REV:
+  if (comps == 4)
+ return sizeof(GLuint);
+  else
+ return -1;
+   default:
+  return -1;
+   }
+}
+
+
+/**
  * Test if the given format is an integer (non-normalized) format.
  */
 GLboolean
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 5d09951..ccfb5e1 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -49,6 +49,9 @@ _mesa_components_in_format( GLenum format );
 extern GLint
 _mesa_bytes_per_pixel( GLenum format, GLenum type );
 
+extern GLint
+_mesa_bytes_per_vertex_attrib(GLint comps, GLenum type);
+
 extern GLboolean
 _mesa_is_type_integer(GLenum type);
 
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index f770143..5e4d6c3 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -251,7 +251,8 @@ update_array(struct gl_context *ctx,
   return;
}
 
-   elementSize = _mesa_sizeof_type(type) * size;
+   elementSize = _mesa_bytes_per_vertex_attrib(size, type);
+   assert(elementSize != -1);
 
array = &ctx->Array.ArrayObj->VertexAttrib[attrib];
array->Size = size;
-- 
1.8.0.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 00/10] glsl: Implement varying packing.

2012-12-11 Thread Aras Pranckevicius
> For the initial implementation I've chosen a strategy that operates
> exclusively at the GLSL IR level, so that it doesn't require the
> cooperation of the driver back-ends.


Wouldn't this negatively affect performance of some GPUs?

Not sure if relevant for Mesa, but e.g. on PowerVR SGX it's really bad to
pack two vec2 texture coordinates into a single vec4. That's because var.xy
texture read can be "prefetched", whereas var.zw texture read is not
prefetched (essentially treated as a dependent texture read), and often
causes stalls in the shader execution.



-- 
Aras Pranckevičius
work: http://unity3d.com
home: http://aras-p.info
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev