[Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.
This adds seamless sampling for cubemap boundaries if requested. The corner case averaging is messy but seems like it should be spec compliant. The face direction stuff is also a bit messy, I've no idea if that could or should be simpler, or even if all my directions are fully correct! Signed-off-by: Dave Airlie --- src/gallium/drivers/softpipe/sp_screen.c | 2 +- src/gallium/drivers/softpipe/sp_tex_sample.c | 143 +-- 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 909fa1c..7ca259e 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - return 0; + return 1; case PIPE_CAP_SCALED_RESOLVE: return 0; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 7558ef1..9500a03 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp, } } +static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = { + /* pos X first then neg X is Z different, Y the same */ + /* PIPE_TEX_FACE_POS_X,*/ + { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + /* PIPE_TEX_FACE_NEG_X */ + { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + + /* pos Y first then neg Y is X different, X the same */ + /* PIPE_TEX_FACE_POS_Y */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z }, + + /* PIPE_TEX_FACE_NEG_Y */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z }, + + /* pos Z first then neg Y is X different, X the same */ + /* PIPE_TEX_FACE_POS_Z */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + + /* PIPE_TEX_FACE_NEG_Z */ + { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y } +}; + +static INLINE unsigned +get_next_face(unsigned face, int x, int y) +{ + int idx = 0; + + if (x == 0 && y == 0) + return face; + if (x == -1) idx = 0; + else if (x == 1) idx = 1; + else if (y == -1) idx = 2; + else if (y == 1) idx = 3; + + return face_array[face][idx]; +} + +static INLINE const float * +get_texel_cube_seamless(const struct sp_sampler_variant *samp, +union tex_tile_address addr, int x, int y, +float *corner) +{ + const struct pipe_resource *texture = samp->view->texture; + unsigned level = addr.bits.level; + unsigned face = addr.bits.face; + int new_x, new_y; + int max_x, max_y; + int c; + + max_x = (int) u_minify(texture->width0, level); + max_y = (int) u_minify(texture->height0, level); + new_x = x; + new_y = y; + + /* the corner case */ + if ((x < 0 || x >= max_x) && + (y < 0 || y >= max_y)) { + const float *c1, *c2, *c3; + int fx = x < 0 ? 0 : max_x - 1; + int fy = y < 0 ? 0 : max_y - 1; + c1 = get_texel_2d_no_border( samp, addr, fx, fy); + addr.bits.face = get_next_face(face, (x < 0) ? -1 : 1, 0); + c2 = get_texel_2d_no_border( samp, addr, (x < 0) ? max_x - 1 : 0, fy); + addr.bits.face = get_next_face(face, 0, (y < 0) ? -1 : 1); + c3 = get_texel_2d_no_border( samp, addr, fx, (y < 0) ? max_y - 1 : 0); + for (c = 0; c < TGSI_QUAD_SIZE; c++) + corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3; + + return corner; + } + /* change the face */ + if (x < 0) { + new_x = max_x - 1; + face = get_next_face(face, -1, 0); + } else if (x >= max_x) { + new_x = 0; + face = get_next_face(face, 1, 0); + } else if (y < 0) { + new_y = max_y - 1; + face = get_next_face(face, 0, -1); + } else if (y >= max_y) { + new_y = 0; + face = get_next_face(face, 0, 1); + } + + addr.bits.face = face; + return get_texel_2d_no_border( samp, addr, new_x, new_y ); +} /* Gather a quad of adjacent texels within a tile: */ @@ -1121,6 +1215,7 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler, union tex_tile_address addr; const float *out; int c; + float corner0[TGSI_QUAD_SIZE]; width = u_minify(texture->width0, level); height = u_minify(texture->height0, level); @@ -1131,10 +1226,23 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler, addr.value = 0; addr.bits.level = level; - samp->nearest_texcoord_s(s, width, &x); - samp->nearest_texcoord_t(t, height, &y); + /* +* If NEAREST filtering is
[Mesa-dev] [Bug 58137] New: [r300g, r600g] corruption on 0 A.D. game with postproc effects enabled
https://bugs.freedesktop.org/show_bug.cgi?id=58137 Priority: medium Bug ID: 58137 Assignee: mesa-dev@lists.freedesktop.org Summary: [r300g, r600g] corruption on 0 A.D. game with postproc effects enabled Severity: normal Classification: Unclassified OS: Linux (All) Reporter: fabio@libero.it Hardware: x86 (IA32) Status: NEW Version: git Component: Mesa core Product: Mesa Enabling postproc effects in 0 A.D. ( postproc = true in ~/.config/0ad/config/local.cfg ) results in this rendering corruption: http://www.wildfiregames.com/forum/index.php?app=core&module=attach§ion=attach&attach_rel_module=post&attach_id=4929 Tested by me on RV530/r300g with current git and another user on Radeon HD 4670/r600g, see: http://www.wildfiregames.com/forum/index.php?showtopic=16839entry257923 -- You are receiving this mail because: You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] mesa syncobj: don't store a pointer to the set_entry
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Am 2012-12-11 00:47, schrieb Ian Romanick: >> [ 760.187261] [drm:radeon_cs_ib_chunk] *ERROR* Invalid command >> stream ! [ 760.192898] radeon :01:00.0: >> evergreen_cs_track_validate_stencil:602 stencil read bo base >> 4148500480 not aligned with 16384 [ 760.192901] radeon >> :01:00.0: evergreen_packet3_check:2098 invalid cmd stream >> 2440 > > It sounds like you should rebase Jordan's original patches out and > bisect. It would be nice to know when things started going wrong. > That usually makes the fix obvious. It was a case of PEBKAC - I had an outdated libdrm and inadvertently made configure accept it. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.19 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQIcBAEBAgAGBQJQxyP3AAoJEN0/YqbEcdMwjLcP/RwyNoCRKFFFVM46P+AwTFNP 0bQ3paJ0IluZUoeLe1Ihpl1PlmkoD//ZUFPlNamUKK85bNpE3gdDbJ7VbUJagW/b Xc0xYfssBbL6AGWxKO/+vvPH1T2pTEeFWB7Dw24truL0ORl7bx7Ovr7cCuX4SGnC TOvtyaX9FSSwbBnr6eyanEahlszTa5tIbtMumX9OJT+MDlNLxkgQjE5LL2zqXRZi 7moSBUNyELVKa4i0Yb9A9Tp0GXJBoZ328z5iBP7I5LKOZek245zzVWPkFQ11eTmV Hz5HVXvoVeDfhxY4pJSi73DjPcwEUHu3YJ8iDSjvpTe+N0Uj9Wud7by75y9WE2MI NI3nB4G0dP05dvps9MZXJnlvbejEMKinIb6jqCJ85wT+5H75CRyKUyXrZVfD5vfb F5UXJZEaMLt5fGV+T5NnZRMjC+PjMamovsaV5aT+pyfdZg+2U3cTCJQxEDCVkBpb hxolLm+PyR3o/fYED/82SiWkmY1wpAPCrOdp5AiPfheq6+anm+In0jThv7xygG0u isge3WQEKu8UxwZhFo4wPQkfHoSac1VX/wO036YvWQTnLpxTvGDRWGu225KPqVCx gxVj0Qzi90YjaiFSlHnPje+pSrvVyRUiGBIjDc/VRtLHlmy+0hrryGBGwp04U7M2 83AMyMmi/FRzB+GsVpoJ =sulN -END PGP SIGNATURE- ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string
We already have the Mesa version in the version string, isn't that enough to detect Mesa? --- src/mesa/state_tracker/st_cb_strings.c |5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_strings.c b/src/mesa/state_tracker/st_cb_strings.c index 2132379..3ce9fab 100644 --- a/src/mesa/state_tracker/st_cb_strings.c +++ b/src/mesa/state_tracker/st_cb_strings.c @@ -39,8 +39,6 @@ #include "st_context.h" #include "st_cb_strings.h" -#define ST_VERSION_STRING "0.4" - static const GLubyte * st_get_string(struct gl_context * ctx, GLenum name) { @@ -55,8 +53,7 @@ st_get_string(struct gl_context * ctx, GLenum name) } case GL_RENDERER: - util_snprintf(st->renderer, sizeof(st->renderer), "Gallium %s on %s", - ST_VERSION_STRING, + util_snprintf(st->renderer, sizeof(st->renderer), "%s", screen->get_name( screen )); return (GLubyte *) st->renderer; -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string
This will break apps that expect the current to tweak their behavior. Changing it now will cause pain to app developers and ourselves, and I honestly don't the good of it. For good or bad we have these strings. So I'd prefer we focused on making our drivers rock solid so that app developers don't feel the need to detect and workaround issues in our drivers. Jose - Original Message - > We already have the Mesa version in the version string, isn't that > enough > to detect Mesa? > --- > src/mesa/state_tracker/st_cb_strings.c |5 + > 1 file changed, 1 insertion(+), 4 deletions(-) > > diff --git a/src/mesa/state_tracker/st_cb_strings.c > b/src/mesa/state_tracker/st_cb_strings.c > index 2132379..3ce9fab 100644 > --- a/src/mesa/state_tracker/st_cb_strings.c > +++ b/src/mesa/state_tracker/st_cb_strings.c > @@ -39,8 +39,6 @@ > #include "st_context.h" > #include "st_cb_strings.h" > > -#define ST_VERSION_STRING "0.4" > - > static const GLubyte * > st_get_string(struct gl_context * ctx, GLenum name) > { > @@ -55,8 +53,7 @@ st_get_string(struct gl_context * ctx, GLenum name) > } > > case GL_RENDERER: > - util_snprintf(st->renderer, sizeof(st->renderer), "Gallium %s > on %s", > - ST_VERSION_STRING, > + util_snprintf(st->renderer, sizeof(st->renderer), "%s", > screen->get_name( screen )); > >return (GLubyte *) st->renderer; > -- > 1.7.10.4 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/mesa: remove the prefix 'Gallium 0.4 on' from the renderer string
On 11 December 2012 13:57, Marek Olšák wrote: > We already have the Mesa version in the version string, isn't that enough > to detect Mesa? In theory, although the vendor string would IMO be the expected place for that. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.
Am 11.12.2012 10:52, schrieb Dave Airlie: > This adds seamless sampling for cubemap boundaries if requested. > > The corner case averaging is messy but seems like it should be spec > compliant. > > The face direction stuff is also a bit messy, I've no idea if that could > or should be simpler, or even if all my directions are fully correct! > > Signed-off-by: Dave Airlie > --- > src/gallium/drivers/softpipe/sp_screen.c | 2 +- > src/gallium/drivers/softpipe/sp_tex_sample.c | 143 > +-- > 2 files changed, 135 insertions(+), 10 deletions(-) > > diff --git a/src/gallium/drivers/softpipe/sp_screen.c > b/src/gallium/drivers/softpipe/sp_screen.c > index 909fa1c..7ca259e 100644 > --- a/src/gallium/drivers/softpipe/sp_screen.c > +++ b/src/gallium/drivers/softpipe/sp_screen.c > @@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum > pipe_cap param) >return 1; > case PIPE_CAP_SEAMLESS_CUBE_MAP: > case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: > - return 0; > + return 1; > case PIPE_CAP_SCALED_RESOLVE: >return 0; > case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: > diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c > b/src/gallium/drivers/softpipe/sp_tex_sample.c > index 7558ef1..9500a03 100644 > --- a/src/gallium/drivers/softpipe/sp_tex_sample.c > +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c > @@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp, > } > } > > +static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = { > + /* pos X first then neg X is Z different, Y the same */ > + /* PIPE_TEX_FACE_POS_X,*/ > + { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z, > + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, > + /* PIPE_TEX_FACE_NEG_X */ > + { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z, > + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, > + > + /* pos Y first then neg Y is X different, X the same */ > + /* PIPE_TEX_FACE_POS_Y */ > + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, > + PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z }, > + > + /* PIPE_TEX_FACE_NEG_Y */ > + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, > + PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z }, > + > + /* pos Z first then neg Y is X different, X the same */ > + /* PIPE_TEX_FACE_POS_Z */ > + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, > + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, > + > + /* PIPE_TEX_FACE_NEG_Z */ > + { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X, > + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y } > +}; > + > +static INLINE unsigned > +get_next_face(unsigned face, int x, int y) > +{ > + int idx = 0; > + > + if (x == 0 && y == 0) > + return face; > + if (x == -1) idx = 0; > + else if (x == 1) idx = 1; > + else if (y == -1) idx = 2; > + else if (y == 1) idx = 3; > + > + return face_array[face][idx]; > +} > + > +static INLINE const float * > +get_texel_cube_seamless(const struct sp_sampler_variant *samp, > +union tex_tile_address addr, int x, int y, > +float *corner) > +{ > + const struct pipe_resource *texture = samp->view->texture; > + unsigned level = addr.bits.level; > + unsigned face = addr.bits.face; > + int new_x, new_y; > + int max_x, max_y; > + int c; > + > + max_x = (int) u_minify(texture->width0, level); > + max_y = (int) u_minify(texture->height0, level); > + new_x = x; > + new_y = y; > + > + /* the corner case */ > + if ((x < 0 || x >= max_x) && > + (y < 0 || y >= max_y)) { > + const float *c1, *c2, *c3; > + int fx = x < 0 ? 0 : max_x - 1; > + int fy = y < 0 ? 0 : max_y - 1; > + c1 = get_texel_2d_no_border( samp, addr, fx, fy); > + addr.bits.face = get_next_face(face, (x < 0) ? -1 : 1, 0); > + c2 = get_texel_2d_no_border( samp, addr, (x < 0) ? max_x - 1 : 0, fy); > + addr.bits.face = get_next_face(face, 0, (y < 0) ? -1 : 1); > + c3 = get_texel_2d_no_border( samp, addr, fx, (y < 0) ? max_y - 1 : 0); > + for (c = 0; c < TGSI_QUAD_SIZE; c++) > + corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3; > + > + return corner; I wonder if the recommended corner handling is worth it. As far as I can tell the spec would allow you to simply drop the special handling and it would still be conformant (same for dx10). Though since softpipe is more about correctness rather than performance, I guess this is ok (and the code would very rarely get hit anyway). > + } > + /* change the face */ > + if (x < 0) { > + new_x = max_x - 1; > + face = get_next_face(face, -1, 0); > + } else if (x >= max_x) { > + new_x = 0; > + face = get_next_face(face, 1, 0); > + } else if (y < 0) { > + new_y = max_y - 1; > + face = get_next_face(face, 0, -1); > + } else if (y >= max_y) { > + new_y = 0; > + face = get_next_face(face, 0, 1); > + } > + > + addr.bits.face = face; > + return get_texel_2d_no
Re: [Mesa-dev] [PATCH] softpipe: implement seamless cubemap support.
Just a few minor things below. On 12/11/2012 02:52 AM, Dave Airlie wrote: This adds seamless sampling for cubemap boundaries if requested. The corner case averaging is messy but seems like it should be spec compliant. The face direction stuff is also a bit messy, I've no idea if that could or should be simpler, or even if all my directions are fully correct! Signed-off-by: Dave Airlie --- src/gallium/drivers/softpipe/sp_screen.c | 2 +- src/gallium/drivers/softpipe/sp_tex_sample.c | 143 +-- 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 909fa1c..7ca259e 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -127,7 +127,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - return 0; + return 1; case PIPE_CAP_SCALED_RESOLVE: return 0; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 7558ef1..9500a03 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -607,6 +607,100 @@ get_texel_2d(const struct sp_sampler_variant *samp, } } Maybe add a comment on this array explaining what it's for. +static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = { + /* pos X first then neg X is Z different, Y the same */ + /* PIPE_TEX_FACE_POS_X,*/ + { PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + /* PIPE_TEX_FACE_NEG_X */ + { PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + + /* pos Y first then neg Y is X different, X the same */ + /* PIPE_TEX_FACE_POS_Y */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z }, + + /* PIPE_TEX_FACE_NEG_Y */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z }, + + /* pos Z first then neg Y is X different, X the same */ + /* PIPE_TEX_FACE_POS_Z */ + { PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }, + + /* PIPE_TEX_FACE_NEG_Z */ + { PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X, + PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y } +}; + +static INLINE unsigned +get_next_face(unsigned face, int x, int y) +{ + int idx = 0; + + if (x == 0&& y == 0) + return face; + if (x == -1) idx = 0; + else if (x == 1) idx = 1; + else if (y == -1) idx = 2; + else if (y == 1) idx = 3; Please put the "idx = ..." assignments on separate lines. The problem with "if (c) expr;" is it's a PITA if you want to set a breakpoint on expr. + + return face_array[face][idx]; +} + +static INLINE const float * +get_texel_cube_seamless(const struct sp_sampler_variant *samp, +union tex_tile_address addr, int x, int y, +float *corner) +{ + const struct pipe_resource *texture = samp->view->texture; + unsigned level = addr.bits.level; + unsigned face = addr.bits.face; + int new_x, new_y; + int max_x, max_y; + int c; + + max_x = (int) u_minify(texture->width0, level); + max_y = (int) u_minify(texture->height0, level); + new_x = x; + new_y = y; + + /* the corner case */ + if ((x< 0 || x>= max_x)&& + (y< 0 || y>= max_y)) { + const float *c1, *c2, *c3; + int fx = x< 0 ? 0 : max_x - 1; + int fy = y< 0 ? 0 : max_y - 1; + c1 = get_texel_2d_no_border( samp, addr, fx, fy); + addr.bits.face = get_next_face(face, (x< 0) ? -1 : 1, 0); + c2 = get_texel_2d_no_border( samp, addr, (x< 0) ? max_x - 1 : 0, fy); + addr.bits.face = get_next_face(face, 0, (y< 0) ? -1 : 1); + c3 = get_texel_2d_no_border( samp, addr, fx, (y< 0) ? max_y - 1 : 0); + for (c = 0; c< TGSI_QUAD_SIZE; c++) + corner[c] = CLAMP((c1[c] + c2[c] + c3[c]), 0.0F, 1.0F) / 3; + + return corner; + } + /* change the face */ + if (x< 0) { + new_x = max_x - 1; + face = get_next_face(face, -1, 0); + } else if (x>= max_x) { + new_x = 0; + face = get_next_face(face, 1, 0); + } else if (y< 0) { + new_y = max_y - 1; + face = get_next_face(face, 0, -1); + } else if (y>= max_y) { + new_y = 0; + face = get_next_face(face, 0, 1); + } + + addr.bits.face = face; + return get_texel_2d_no_border( samp, addr, new_x, new_y ); +} /* Gather a quad of adjacent texels within a tile: */ @@ -1121,6 +1215,7 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler, union tex_tile_address addr; const float *out; int c; + float corner0[TGSI_QUAD_SIZE]; width = u_minify(texture->width0, level);
[Mesa-dev] [PATCH 1/2] clover: Don't erase build info of devices not being built
From: Tom Stellard Every call to _cl_program::build() was erasing the binaries and logs for every device associated with the program. This is incorrect because it is possible to build a program for only a subset of devices and so any device not being build should not have this information erased. --- src/gallium/state_trackers/clover/core/program.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/state_trackers/clover/core/program.cpp b/src/gallium/state_trackers/clover/core/program.cpp index 6ca8080..5fcda23 100644 --- a/src/gallium/state_trackers/clover/core/program.cpp +++ b/src/gallium/state_trackers/clover/core/program.cpp @@ -42,10 +42,10 @@ _cl_program::_cl_program(clover::context &ctx, void _cl_program::build(const std::vector &devs) { - __binaries.clear(); - __logs.clear(); for (auto dev : devs) { + __binaries.erase(dev); + __logs.erase(dev); try { auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ? compile_program_tgsi(__source) : -- 1.7.11.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] clover: Add support for compiler flags
From: Tom Stellard --- src/gallium/state_trackers/clover/api/program.cpp | 7 +++- .../state_trackers/clover/core/compiler.hpp| 12 +- src/gallium/state_trackers/clover/core/program.cpp | 12 -- src/gallium/state_trackers/clover/core/program.hpp | 3 +- .../state_trackers/clover/llvm/invocation.cpp | 49 +++--- 5 files changed, 71 insertions(+), 12 deletions(-) diff --git a/src/gallium/state_trackers/clover/api/program.cpp b/src/gallium/state_trackers/clover/api/program.cpp index 74de840..06f96f1 100644 --- a/src/gallium/state_trackers/clover/api/program.cpp +++ b/src/gallium/state_trackers/clover/api/program.cpp @@ -142,15 +142,18 @@ clBuildProgram(cl_program prog, cl_uint count, const cl_device_id *devs, (!pfn_notify && user_data)) throw error(CL_INVALID_VALUE); + if (!opts) + opts = ""; + if (devs) { if (any_of([&](const cl_device_id dev) { return !prog->ctx.has_device(dev); }, devs, devs + count)) throw error(CL_INVALID_DEVICE); - prog->build({ devs, devs + count }); + prog->build({ devs, devs + count }, opts); } else { - prog->build(prog->ctx.devs); + prog->build(prog->ctx.devs, opts); } return CL_SUCCESS; diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp b/src/gallium/state_trackers/clover/core/compiler.hpp index a43050a..3869507 100644 --- a/src/gallium/state_trackers/clover/core/compiler.hpp +++ b/src/gallium/state_trackers/clover/core/compiler.hpp @@ -44,9 +44,19 @@ namespace clover { compat::vector log; }; + class invalid_option_error { + public: + invalid_option_error() { + } + + virtual ~invalid_option_error() { + } + }; + module compile_program_llvm(const compat::string &source, enum pipe_shader_ir ir, - const compat::string &target); + const compat::string &target, + const compat::string &opts); module compile_program_tgsi(const compat::string &source); } diff --git a/src/gallium/state_trackers/clover/core/program.cpp b/src/gallium/state_trackers/clover/core/program.cpp index 5fcda23..92f1d6f 100644 --- a/src/gallium/state_trackers/clover/core/program.cpp +++ b/src/gallium/state_trackers/clover/core/program.cpp @@ -41,21 +41,27 @@ _cl_program::_cl_program(clover::context &ctx, } void -_cl_program::build(const std::vector &devs) { +_cl_program::build(const std::vector &devs, + const char *opts) { for (auto dev : devs) { __binaries.erase(dev); __logs.erase(dev); + __opts.erase(dev); + + __opts.insert({ dev, opts }); try { auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ? compile_program_tgsi(__source) : compile_program_llvm(__source, dev->ir_format(), -dev->ir_target())); +dev->ir_target(), build_opts(dev))); __binaries.insert({ dev, module }); } catch (build_error &e) { __logs.insert({ dev, e.what() }); throw error(CL_BUILD_PROGRAM_FAILURE); + } catch (invalid_option_error &e) { + throw error(CL_INVALID_BUILD_OPTIONS); } } } @@ -77,7 +83,7 @@ _cl_program::build_status(clover::device *dev) const { std::string _cl_program::build_opts(clover::device *dev) const { - return {}; + return __opts.count(dev) ? __opts.find(dev)->second : ""; } std::string diff --git a/src/gallium/state_trackers/clover/core/program.hpp b/src/gallium/state_trackers/clover/core/program.hpp index f3858f6..0cda8ee 100644 --- a/src/gallium/state_trackers/clover/core/program.hpp +++ b/src/gallium/state_trackers/clover/core/program.hpp @@ -41,7 +41,7 @@ public: const std::vector &devs, const std::vector &binaries); - void build(const std::vector &devs); + void build(const std::vector &devs, const char *opts); const std::string &source() const; const std::map &binaries() const; @@ -55,6 +55,7 @@ public: private: std::map __binaries; std::map __logs; + std::map __opts; std::string __source; }; diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp index 2b07053..c997367 100644 --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp @@ -23,6 +23,7 @@ #include "core/compiler.hpp" #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include using namespace clover; @@ -98,15 +100,49 @@ namespace { llvm::Module * compile(const std::string &source, const std::string &name, - const std::string &triple) { + const std::string &triple, const std::s
Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3
On 12/10/2012 03:28 PM, Matt Turner wrote: The ES 3 spec says that the minumum allowable value is 2^24-1, but the GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1. Fixes es3conform's element_index_uint_constants test. --- src/mesa/main/context.c |3 +++ src/mesa/main/get.c |1 + src/mesa/main/get_hash_params.py |3 +++ src/mesa/main/mtypes.h |3 +++ 4 files changed, 10 insertions(+), 0 deletions(-) diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index fc2db12..241a1f9 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx) /* PrimitiveRestart */ ctx->Const.PrimitiveRestartInSoftware = GL_FALSE; + + /* ES 3.0 or ARB_ES3_compatibility */ + ctx->Const.MaxElementIndex = UINT_MAX; } diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index 115d3c5..c7f8ada 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -304,6 +304,7 @@ static const int extra_ARB_uniform_buffer_object_and_geometry_shader[] = { EXTRA_EXT(ARB_ES2_compatibility); +EXTRA_EXT(ARB_ES3_compatibility); EXTRA_EXT(ARB_texture_cube_map); EXTRA_EXT(MESA_texture_array); EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program); diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index d0e8a76..cb58394 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -321,6 +321,9 @@ descriptor=[ # Enums in OpenGL and ES 3.0 { "apis": ["GL", "GL_CORE", "GLES3"], "params": [ +# GL_ARB_ES3_compatibility + [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex), extra_ARB_ES3_compatibility"], + # GL_ARB_fragment_shader [ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB", "CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents), extra_ARB_fragment_shader" ], diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index bd180a5..c9bef15 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2940,6 +2940,9 @@ struct gl_constants /** GL_ARB_map_buffer_alignment */ GLuint MinMapBufferAlignment; + + /** ES 3.0 or GL_ARB_ES3_compatibility */ + GLint64 MaxElementIndex; Since the value can only be positive (and it's a Z+ type in the spec's state table), I think this should be GLuint64. }; -Brian ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/6] AMDGPU: remove nonsense setPrefLoopAlignment
The Align parameter is a power of two, so 16 results in 64K alignment. Additional to that even 16 byte alignment doesn't make any sense, so just remove it. Signed-off-by: Christian König --- lib/Target/AMDGPU/AMDILISelLowering.cpp |1 - 1 file changed, 1 deletion(-) diff --git a/lib/Target/AMDGPU/AMDILISelLowering.cpp b/lib/Target/AMDGPU/AMDILISelLowering.cpp index 6a5d841..8bfd30c 100644 --- a/lib/Target/AMDGPU/AMDILISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDILISelLowering.cpp @@ -217,7 +217,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setSchedulingPreference(Sched::RegPressure); setPow2DivIsCheap(false); - setPrefLoopAlignment(16); setSelectIsExpensive(true); setJumpIsExpensive(true); -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/6] AMDGPU: BB operand support for SI
Signed-off-by: Christian König --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp| 10 -- lib/Target/AMDGPU/AMDGPUMCInstLower.h |5 - .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 10 +- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp |6 ++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index de4053e..32275a2b 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -21,11 +21,14 @@ #include "llvm/Constants.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; -AMDGPUMCInstLower::AMDGPUMCInstLower() { } +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): + Ctx(ctx) +{ } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -50,13 +53,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_Register: MCOp = MCOperand::CreateReg(MO.getReg()); break; +case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); } OutMI.addOperand(MCOp); } } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering; + AMDGPUMCInstLower MCInstLowering(OutContext); if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h index d7bf827..d7d538e 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -14,12 +14,15 @@ namespace llvm { class MCInst; +class MCContext; class MachineInstr; class AMDGPUMCInstLower { + MCContext &Ctx; + public: - AMDGPUMCInstLower(); + AMDGPUMCInstLower(MCContext &ctx); /// \brief Lower a MachineInstr to an MCInst void lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 3417fbc..8f41ebb 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -47,7 +47,7 @@ public: virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; virtual unsigned getNumFixupKinds() const { return 0; }; virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value) const { assert(!"Not implemented"); } + uint64_t Value) const; virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCInstFragment *DF, const MCAsmLayout &Layout) const { @@ -80,3 +80,11 @@ AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( raw_ostream &OS) const { return new AMDGPUMCObjectWriter(OS); } + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + assert(Fixup.getKind() == FK_PCRel_4); + *Dst = (Value - 4) / 4; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 7f271d1..c47dc99 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCFixup.h" #include "llvm/Support/raw_ostream.h" #define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1)) @@ -149,6 +150,11 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, } Imm; Imm.F = MO.getFPImm(); return Imm.I; + } else if (MO.isExpr()) { +const MCExpr *Expr = MO.getExpr(); +MCFixupKind Kind = MCFixupKind(FK_PCRel_4); +Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); +return 0; } else{ llvm_unreachable("Encoding of this operand type is not supported yet."); } -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/6] AMDGPU: enable S_*N2_* instructions
They seem to work fine. Signed-off-by: Christian König --- lib/Target/AMDGPU/SIInstructions.td |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index ea8de91..008652f 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -971,10 +971,10 @@ def S_OR_B32 : SOP2_32 <0x0010, "S_OR_B32", []>; def S_OR_B64 : SOP2_64 <0x0011, "S_OR_B64", []>; def S_XOR_B32 : SOP2_32 <0x0012, "S_XOR_B32", []>; def S_XOR_B64 : SOP2_64 <0x0013, "S_XOR_B64", []>; -def S_ANDN2_B32 : SOP2_ANDN2 <0x0014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_ANDN2 <0x0015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_ORN2 <0x0016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_ORN2 <0x0017, "S_ORN2_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x0014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x0015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x0016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x0017, "S_ORN2_B64", []>; def S_NAND_B32 : SOP2_32 <0x0018, "S_NAND_B32", []>; def S_NAND_B64 : SOP2_64 <0x0019, "S_NAND_B64", []>; def S_NOR_B32 : SOP2_32 <0x001a, "S_NOR_B32", []>; -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/6] AMDGPU: Add control flow optimization
Branch if we have enough instructions so that it makes sense. Also remove branches if they don't make sense. Signed-off-by: Christian König --- lib/Target/AMDGPU/SILowerControlFlow.cpp | 49 ++ 1 file changed, 49 insertions(+) diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 1abcb88..507cb54 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -63,9 +63,13 @@ namespace { class SILowerControlFlowPass : public MachineFunctionPass { private: + static const unsigned SkipThreshold = 12; + static char ID; const TargetInstrInfo *TII; + void Skip(MachineInstr &MI, MachineOperand &To); + void If(MachineInstr &MI); void Else(MachineInstr &MI); void Break(MachineInstr &MI); @@ -74,6 +78,8 @@ private: void Loop(MachineInstr &MI); void EndCf(MachineInstr &MI); + void Branch(MachineInstr &MI); + public: SILowerControlFlowPass(TargetMachine &tm) : MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } @@ -94,6 +100,31 @@ FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { return new SILowerControlFlowPass(tm); } +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + unsigned NumInstr = 0; + + for (MachineBasicBlock *MBB = *From.getParent()->succ_begin(); + NumInstr < SkipThreshold && MBB != To.getMBB() && !MBB->succ_empty(); + MBB = *MBB->succ_begin()) { + +for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + NumInstr < SkipThreshold && I != E; ++I) { + + if (I->isBundle() || !I->isBundled()) +++NumInstr; +} + } + + if (NumInstr < SkipThreshold) +return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To) + .addReg(AMDGPU::EXEC); +} + void SILowerControlFlowPass::If(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); @@ -108,6 +139,8 @@ void SILowerControlFlowPass::If(MachineInstr &MI) { .addReg(AMDGPU::EXEC) .addReg(Reg); + Skip(MI, MI.getOperand(2)); + MI.eraseFromParent(); } @@ -125,6 +158,8 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) { .addReg(AMDGPU::EXEC) .addReg(Dst); + Skip(MI, MI.getOperand(2)); + MI.eraseFromParent(); } @@ -206,6 +241,16 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) { MI.eraseFromParent(); } +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + + MachineBasicBlock *Next = MI.getParent()->getNextNode(); + MachineBasicBlock *Target = MI.getOperand(0).getMBB(); + if (Target == Next) +MI.eraseFromParent(); + else +assert(0); +} + bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { bool HaveCf = false; @@ -249,6 +294,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { HaveCf = true; EndCf(MI); break; + +case AMDGPU::S_BRANCH: + Branch(MI); + break; } } } -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/6] AMDGPU: Remove unecessary VREG alignment.
Unlike SGPRs VGPRs doesn't need to be aligned. Signed-off-by: Christian König --- lib/Target/AMDGPU/SIRegisterInfo.td | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index e52311a..c3f1361 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -105,15 +105,15 @@ def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, // VGPR 64-bit registers def VGPR_64 : RegisterTuples<[low, high], - [(add (decimate VGPR_32, 2)), - (add (decimate (rotl VGPR_32, 1), 2))]>; + [(add VGPR_32), + (add (rotl VGPR_32, 1))]>; // VGPR 128-bit registers def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], - [(add (decimate VGPR_32, 4)), - (add (decimate (rotl VGPR_32, 1), 4)), - (add (decimate (rotl VGPR_32, 2), 4)), - (add (decimate (rotl VGPR_32, 3), 4))]>; + [(add VGPR_32), + (add (rotl VGPR_32, 1)), + (add (rotl VGPR_32, 2)), + (add (rotl VGPR_32, 3))]>; // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/egl/drm: only unref the udev device if needed
Could you commit and push it to master? Am Mi, 5. Dezember 2012, 09:31:48 schrieben Sie: > On Tue, Dec 4, 2012 at 12:50 PM, Tobias Droste wrote: > > Anyone interested? ;-) > > > > I would just push it, but I don't have the rights to do so. > > Looks reasonable to me. > > Reviewed-by: Alex Deucher > > > Am Do, 29. November 2012, 17:02:28 schrieben Sie: > >> Fixes compiler warning: > >> > >> drm/native_drm.c: In function ‘native_create_display’: > >> drm/native_drm.c:180:21: warning: ‘device’ may be used uninitialized in > >> this function [-Wmaybe-uninitialized] drm/native_drm.c:157:24: note: > >> ‘device’ was declared here > >> > >> Signed-off-by: Tobias Droste > >> --- > >> > >> src/gallium/state_trackers/egl/drm/native_drm.c |9 + > >> 1 Datei geändert, 5 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-) > >> > >> diff --git a/src/gallium/state_trackers/egl/drm/native_drm.c > >> b/src/gallium/state_trackers/egl/drm/native_drm.c index 23fc137..f0c0f54 > >> 100644 > >> --- a/src/gallium/state_trackers/egl/drm/native_drm.c > >> +++ b/src/gallium/state_trackers/egl/drm/native_drm.c > >> @@ -161,23 +161,24 @@ drm_get_device_name(int fd) > >> > >> udev = udev_new(); > >> if (fstat(fd, &buf) < 0) { > >> > >>_eglLog(_EGL_WARNING, "failed to stat fd %d", fd); > >> > >> - goto out; > >> + goto outudev; > >> > >> } > >> > >> device = udev_device_new_from_devnum(udev, 'c', buf.st_rdev); > >> if (device == NULL) { > >> > >>_eglLog(_EGL_WARNING, > >> > >>"could not create udev device for fd %d", fd); > >> > >> - goto out; > >> + goto outdevice; > >> > >> } > >> > >> tmp = udev_device_get_devnode(device); > >> if (!tmp) > >> > >> - goto out; > >> + goto outdevice; > >> > >> device_name = strdup(tmp); > >> > >> -out: > >> > >> +outdevice: > >> udev_device_unref(device); > >> > >> +outudev: > >> udev_unref(udev); > >> > >> #endif > > > > ___ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/egl/drm: only unref the udev device if needed
On Tue, Dec 11, 2012 at 12:49 PM, Tobias Droste wrote: > Could you commit and push it to master? Done. Thanks! Alex > > Am Mi, 5. Dezember 2012, 09:31:48 schrieben Sie: >> On Tue, Dec 4, 2012 at 12:50 PM, Tobias Droste wrote: >> > Anyone interested? ;-) >> > >> > I would just push it, but I don't have the rights to do so. >> >> Looks reasonable to me. >> >> Reviewed-by: Alex Deucher >> >> > Am Do, 29. November 2012, 17:02:28 schrieben Sie: >> >> Fixes compiler warning: >> >> >> >> drm/native_drm.c: In function ‘native_create_display’: >> >> drm/native_drm.c:180:21: warning: ‘device’ may be used uninitialized in >> >> this function [-Wmaybe-uninitialized] drm/native_drm.c:157:24: note: >> >> ‘device’ was declared here >> >> >> >> Signed-off-by: Tobias Droste >> >> --- >> >> >> >> src/gallium/state_trackers/egl/drm/native_drm.c |9 + >> >> 1 Datei geändert, 5 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-) >> >> >> >> diff --git a/src/gallium/state_trackers/egl/drm/native_drm.c >> >> b/src/gallium/state_trackers/egl/drm/native_drm.c index 23fc137..f0c0f54 >> >> 100644 >> >> --- a/src/gallium/state_trackers/egl/drm/native_drm.c >> >> +++ b/src/gallium/state_trackers/egl/drm/native_drm.c >> >> @@ -161,23 +161,24 @@ drm_get_device_name(int fd) >> >> >> >> udev = udev_new(); >> >> if (fstat(fd, &buf) < 0) { >> >> >> >>_eglLog(_EGL_WARNING, "failed to stat fd %d", fd); >> >> >> >> - goto out; >> >> + goto outudev; >> >> >> >> } >> >> >> >> device = udev_device_new_from_devnum(udev, 'c', buf.st_rdev); >> >> if (device == NULL) { >> >> >> >>_eglLog(_EGL_WARNING, >> >> >> >>"could not create udev device for fd %d", fd); >> >> >> >> - goto out; >> >> + goto outdevice; >> >> >> >> } >> >> >> >> tmp = udev_device_get_devnode(device); >> >> if (!tmp) >> >> >> >> - goto out; >> >> + goto outdevice; >> >> >> >> device_name = strdup(tmp); >> >> >> >> -out: >> >> >> >> +outdevice: >> >> udev_device_unref(device); >> >> >> >> +outudev: >> >> udev_unref(udev); >> >> >> >> #endif >> > >> > ___ >> > mesa-dev mailing list >> > mesa-dev@lists.freedesktop.org >> > http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/2] clover: Don't erase build info of devices not being built
Tom Stellard writes: > From: Tom Stellard > > Every call to _cl_program::build() was erasing the binaries and logs for > every device associated with the program. This is incorrect because > it is possible to build a program for only a subset of devices and so > any device not being build should not have this information erased. > --- > src/gallium/state_trackers/clover/core/program.cpp | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > Thanks. For both patches: Reviewed-by: Francisco Jerez > diff --git a/src/gallium/state_trackers/clover/core/program.cpp > b/src/gallium/state_trackers/clover/core/program.cpp > index 6ca8080..5fcda23 100644 > --- a/src/gallium/state_trackers/clover/core/program.cpp > +++ b/src/gallium/state_trackers/clover/core/program.cpp > @@ -42,10 +42,10 @@ _cl_program::_cl_program(clover::context &ctx, > > void > _cl_program::build(const std::vector &devs) { > - __binaries.clear(); > - __logs.clear(); > > for (auto dev : devs) { > + __binaries.erase(dev); > + __logs.erase(dev); >try { > auto module = (dev->ir_format() == PIPE_SHADER_IR_TGSI ? > compile_program_tgsi(__source) : pgpC3rvXAF4YR.pgp Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] i965: Scale shader_time to compensate for resets.
Some shaders experience resets more than others, which skews the numbers reported. Attempt to correct for this by linearly scaling according to the number of resets that happen. Note that will not be accurate if invocations of shaders have varying times and longer invocations are more likely to reset. However, this should at least be better than the previous situation. --- src/mesa/drivers/dri/i965/brw_context.h |6 +++ src/mesa/drivers/dri/i965/brw_fs.cpp| 10 - src/mesa/drivers/dri/i965/brw_program.c | 72 --- src/mesa/drivers/dri/i965/brw_vec4.cpp |4 +- 4 files changed, 83 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index dc25cab..eba9eb5 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -655,8 +655,14 @@ struct brw_tracked_state { enum shader_time_shader_type { ST_NONE, ST_VS, + ST_VS_WRITTEN, + ST_VS_RESET, ST_FS8, + ST_FS8_WRITTEN, + ST_FS8_RESET, ST_FS16, + ST_FS16_WRITTEN, + ST_FS16_RESET, }; /* Flags for brw->state.cache. diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 2f4c669..f428a83 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -505,12 +505,16 @@ fs_visitor::emit_shader_time_end() { current_annotation = "shader time end"; - enum shader_time_shader_type type; + enum shader_time_shader_type type, written_type, reset_type; if (dispatch_width == 8) { type = ST_FS8; + written_type = ST_FS8_WRITTEN; + reset_type = ST_FS8_RESET; } else { assert(dispatch_width == 16); type = ST_FS16; + written_type = ST_FS16_WRITTEN; + reset_type = ST_FS16_RESET; } fs_reg shader_end_time = get_timestamp(); @@ -537,7 +541,9 @@ fs_visitor::emit_shader_time_end() emit(ADD(diff, diff, fs_reg(-2u))); emit_shader_time_write(type, diff); - + emit_shader_time_write(written_type, fs_reg(1u)); + emit(BRW_OPCODE_ELSE); + emit_shader_time_write(reset_type, fs_reg(1u)); emit(BRW_OPCODE_ENDIF); pop_force_uncompressed(); diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 1859041..0358241 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -222,16 +222,73 @@ compare_time(const void *a, const void *b) } static void +get_written_and_reset(struct brw_context *brw, int i, + uint64_t *written, uint64_t *reset) +{ + enum shader_time_shader_type type = brw->shader_time.types[i]; + assert(type == ST_VS || type == ST_FS8 || type == ST_FS16); + + /* Find where we recorded written and reset. */ + int wi, ri; + + for (wi = i; brw->shader_time.types[wi] != type + 1; wi++) + ; + + for (ri = i; brw->shader_time.types[ri] != type + 2; ri++) + ; + + *written = brw->shader_time.cumulative[wi]; + *reset = brw->shader_time.cumulative[ri]; +} + +static void brw_report_shader_time(struct brw_context *brw) { if (!brw->shader_time.bo || !brw->shader_time.num_entries) return; + uint64_t scaled[brw->shader_time.num_entries]; uint64_t *sorted[brw->shader_time.num_entries]; double total = 0; for (int i = 0; i < brw->shader_time.num_entries; i++) { - sorted[i] = &brw->shader_time.cumulative[i]; - total += brw->shader_time.cumulative[i]; + uint64_t written = 0, reset = 0; + + sorted[i] = &scaled[i]; + + switch (brw->shader_time.types[i]) { + case ST_VS_WRITTEN: + case ST_VS_RESET: + case ST_FS8_WRITTEN: + case ST_FS8_RESET: + case ST_FS16_WRITTEN: + case ST_FS16_RESET: + /* We'll handle these when along with the time. */ + scaled[i] = 0; + continue; + + case ST_VS: + case ST_FS8: + case ST_FS16: + get_written_and_reset(brw, i, &written, &reset); + break; + + default: + /* I sometimes want to print things that aren't the 3 shader times. + * Just print the sum in that case. + */ + written = 1; + reset = 0; + break; + } + + uint64_t time = brw->shader_time.cumulative[i]; + if (written) { + scaled[i] = time / written * (written + reset); + } else { + scaled[i] = time; + } + + total += scaled[i]; } if (total == 0) { @@ -245,7 +302,10 @@ brw_report_shader_time(struct brw_context *brw) printf("type ID cycles spent %% of total\n"); for (int s = 0; s < brw->shader_time.num_entries; s++) { /* Work back from the sorted pointers times to a time to print. */ - int i = sorted[s] - brw->shader_time.cumulative; + int i = sorted[s] - scaled; + + if (scaled[i] == 0) + continue; int shader_num = -1; if
[Mesa-dev] [PATCH 3/3] i965: Print a total time for the different shader stages.
Sometimes I've got a patch for a performance optimization that's not showing a statistically significant performance difference on reported FPS, but still seems like a good idea because it ought to reduce time spent in the shader. If I can see the total number of cycles spent in the shader stage being optimized, it may show that the patch is still worthwhile (or point out that it's actually broken in some way). --- src/mesa/drivers/dri/i965/brw_program.c | 48 --- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 0358241..d7b240a 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -242,6 +242,21 @@ get_written_and_reset(struct brw_context *brw, int i, } static void +print_shader_time_line(const char *name, int shader_num, + uint64_t time, uint64_t total) +{ + printf("%s", name); + for (int i = strlen(name); i < 10; i++) + printf(" "); + printf("%4d: ", shader_num); + + printf("%16lld (%7.2f Gcycles) %4.1f%%\n", + (long long)time, + (double)time / 10.0, + (double)time / total * 100.0); +} + +static void brw_report_shader_time(struct brw_context *brw) { if (!brw->shader_time.bo || !brw->shader_time.num_entries) @@ -249,13 +264,16 @@ brw_report_shader_time(struct brw_context *brw) uint64_t scaled[brw->shader_time.num_entries]; uint64_t *sorted[brw->shader_time.num_entries]; + uint64_t total_by_type[ST_FS16 + 1]; + memset(total_by_type, 0, sizeof(total_by_type)); double total = 0; for (int i = 0; i < brw->shader_time.num_entries; i++) { uint64_t written = 0, reset = 0; + enum shader_time_shader_type type = brw->shader_time.types[i]; sorted[i] = &scaled[i]; - switch (brw->shader_time.types[i]) { + switch (type) { case ST_VS_WRITTEN: case ST_VS_RESET: case ST_FS8_WRITTEN: @@ -288,6 +306,16 @@ brw_report_shader_time(struct brw_context *brw) scaled[i] = time; } + switch (type) { + case ST_VS: + case ST_FS8: + case ST_FS16: + total_by_type[type] += scaled[i]; + break; + default: + break; + } + total += scaled[i]; } @@ -314,24 +342,24 @@ brw_report_shader_time(struct brw_context *brw) switch (brw->shader_time.types[i]) { case ST_VS: - printf("vs %4d: ", shader_num); + print_shader_time_line("vs", shader_num, scaled[i], total); break; case ST_FS8: - printf("fs8 %4d: ", shader_num); + print_shader_time_line("fs8", shader_num, scaled[i], total); break; case ST_FS16: - printf("fs16 %4d: ", shader_num); + print_shader_time_line("fs16", shader_num, scaled[i], total); break; default: - printf("other: "); + print_shader_time_line("other", shader_num, scaled[i], total); break; } - - printf("%16lld (%7.2f Gcycles) %4.1f%%\n", - (long long)scaled[i], - (double)scaled[i] / 10.0, - (double)scaled[i] / total * 100.0); } + + printf("\n"); + print_shader_time_line("total vs", -1, total_by_type[ST_VS], total); + print_shader_time_line("total fs8", -1, total_by_type[ST_FS8], total); + print_shader_time_line("total fs16", -1, total_by_type[ST_FS16], total); } static void -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/3] i965: Adjust the split between shader_time_end() and shader_time_write().
I'm about to emit other kinds of writes besides time deltas, and it turns out with the frequency of resets, we couldn't really use the old time delta write() function more than once in a shader. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 53 +--- src/mesa/drivers/dri/i965/brw_fs.h |2 +- src/mesa/drivers/dri/i965/brw_vec4.cpp | 49 ++--- src/mesa/drivers/dri/i965/brw_vec4.h |2 +- 4 files changed, 55 insertions(+), 51 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index ac0bb56..2f4c669 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -513,38 +513,22 @@ fs_visitor::emit_shader_time_end() type = ST_FS16; } - emit_shader_time_write(type, shader_start_time, get_timestamp()); -} - -void -fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, - fs_reg start, fs_reg end) -{ - /* Choose an index in the buffer and set up tracking information for our -* printouts. -*/ - int shader_time_index = brw->shader_time.num_entries++; - assert(shader_time_index <= brw->shader_time.max_entries); - brw->shader_time.types[shader_time_index] = type; - if (prog) { - _mesa_reference_shader_program(ctx, - &brw->shader_time.programs[shader_time_index], - prog); - } + fs_reg shader_end_time = get_timestamp(); /* Check that there weren't any timestamp reset events (assuming these * were the only two timestamp reads that happened). */ - fs_reg reset = end; + fs_reg reset = shader_end_time; reset.smear = 2; fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u))); test->conditional_mod = BRW_CONDITIONAL_Z; emit(IF(BRW_PREDICATE_NORMAL)); push_force_uncompressed(); + fs_reg start = shader_start_time; start.negate = true; fs_reg diff = fs_reg(this, glsl_type::uint_type); - emit(ADD(diff, start, end)); + emit(ADD(diff, start, shader_end_time)); /* If there were no instructions between the two timestamp gets, the diff * is 2 cycles. Remove that overhead, so I can forget about that when @@ -552,6 +536,29 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, */ emit(ADD(diff, diff, fs_reg(-2u))); + emit_shader_time_write(type, diff); + + emit(BRW_OPCODE_ENDIF); + + pop_force_uncompressed(); +} + +void +fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, + fs_reg value) +{ + /* Choose an index in the buffer and set up tracking information for our +* printouts. +*/ + int shader_time_index = brw->shader_time.num_entries++; + assert(shader_time_index <= brw->shader_time.max_entries); + brw->shader_time.types[shader_time_index] = type; + if (prog) { + _mesa_reference_shader_program(ctx, + &brw->shader_time.programs[shader_time_index], + prog); + } + int base_mrf = 6; fs_reg offset_mrf = fs_reg(MRF, base_mrf); @@ -560,15 +567,11 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, fs_reg time_mrf = fs_reg(MRF, base_mrf + 1); time_mrf.type = BRW_REGISTER_TYPE_UD; - emit(MOV(time_mrf, diff)); + emit(MOV(time_mrf, value)); fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD)); inst->base_mrf = base_mrf; inst->mlen = 2; - - pop_force_uncompressed(); - - emit(BRW_OPCODE_ENDIF); } void diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index d4ddb47..6caf7c3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -392,7 +392,7 @@ public: void emit_shader_time_begin(); void emit_shader_time_end(); void emit_shader_time_write(enum shader_time_shader_type type, - fs_reg start, fs_reg end); + fs_reg value); bool try_rewrite_rhs_to_dst(ir_assignment *ir, fs_reg dst, diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index dc9d9d5..28199dd 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1081,29 +1081,11 @@ vec4_visitor::emit_shader_time_end() current_annotation = "shader time end"; src_reg shader_end_time = get_timestamp(); - emit_shader_time_write(ST_VS, shader_start_time, shader_end_time); -} - -void -vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type, - src_reg start, src_reg end) -{ - /* Choose an index in the buffer and set up tracking information for our -* printouts. -*/ - int shader_time_index = brw->shader_time.num_entries++; - assert(shader_t
Re: [Mesa-dev] [PATCH 3/7] i965/fs: Before reg alloc, schedule instructions to reduce live ranges.
Kenneth Graunke writes: > On 12/07/2012 02:58 PM, Eric Anholt wrote: >> This came from an idea by Ben Segovia. 16-wide pixel shaders are very >> important for latency hiding on i965, so we want to try really hard to >> get them. If scheduling an instruction makes some set of instructions >> available, those are probably the ones that make the instruction's >> result dead. By choosing those first, we'll have a tendency to reduce >> the amount of live data as opposed to creating more. >> >> Previously, we were sometimes getting this behavior out of the >> scheduler, which was what produced the scheduler's original performance >> wins on lightsmark. Unfortunately, that was mostly an accident of the >> lame instruction latency information that I had, which made it >> impossible to fix the actual scheduling for performance. Now that we've >> fixed the scheduling for setup for register allocation, we can safely >> update the latency parameters for the final schedule. >> >> In shader-db, we lose 37 16-wide shaders, but gain 90 new ones. 4 >> shaders that were spilling change how many registers spill, for a >> reduction of 70/3899 instructions. >> --- >> .../dri/i965/brw_fs_schedule_instructions.cpp | 49 >> +--- >> 1 file changed, 43 insertions(+), 6 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp >> b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp >> index d48ad1e..3941eac 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp >> + for (schedule_node *node = (schedule_node >> *)instructions.get_tail(); >> + node != instructions.get_head()->prev; >> + node = (schedule_node *)node->prev) { >> +schedule_node *n = (schedule_node *)node; >> + >> +if (!chosen || chosen->inst->regs_written() > 1) { >> + chosen = n; >> + if (chosen->inst->regs_written() <= 1) >> + break; >> +} > > I don't think the if condition is necessary here. Just doing > > for (...) { > chosen = (schedule_node *) node; > if (chosen->inst->regs_written() <= 1) > break; > } > > should accomplish the same thing. Yeah, it was a leftover of trying a bunch of more complicated heuristics. >> - if (!chosen || n->unblocked_time < chosen_time) { >> -chosen = n; >> -chosen_time = n->unblocked_time; >> - } >> + chosen_time = chosen->unblocked_time; > > It seems plausible that there could be no nodes to schedule...which > means chosen would be NULL here. Perhaps just move chosen_time = > chosen->unblocked_time into the if...break above. This is all in a loop while (!instructions.is_empty()), and these loops have to pick one of those. pgpqLqB62uIjJ.pgp Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/dri: add a way to force MSAA on with an environment variable
On 12/10/2012 03:51 PM, Eric Anholt wrote: Marek Olšák writes: There are 2 ways. I prefer the former: GALLIUM_MSAA=n __GL_FSAA_MODE=n Tested with ETQW, which doesn't support MSAA on Linux. This is the only way to get MSAA there. This sounds like something that would be nice to add as a driconf knob. I like the idea of also accepting __GL_FSAA_MODE... since that's what the NVIDIA closed-source driver uses. The other name probably should use a driconf option. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3
On 12/10/2012 02:28 PM, Matt Turner wrote: The ES 3 spec says that the minumum allowable value is 2^24-1, but the GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1. Fixes es3conform's element_index_uint_constants test. --- src/mesa/main/context.c |3 +++ src/mesa/main/get.c |1 + src/mesa/main/get_hash_params.py |3 +++ src/mesa/main/mtypes.h |3 +++ 4 files changed, 10 insertions(+), 0 deletions(-) diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index fc2db12..241a1f9 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx) /* PrimitiveRestart */ ctx->Const.PrimitiveRestartInSoftware = GL_FALSE; + + /* ES 3.0 or ARB_ES3_compatibility */ + ctx->Const.MaxElementIndex = UINT_MAX; This should explicitly be 0x0. It is possible for UINT_MAX to have some other value. It seems like madness these days, but stranger things have happened. } diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index 115d3c5..c7f8ada 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -304,6 +304,7 @@ static const int extra_ARB_uniform_buffer_object_and_geometry_shader[] = { EXTRA_EXT(ARB_ES2_compatibility); +EXTRA_EXT(ARB_ES3_compatibility); EXTRA_EXT(ARB_texture_cube_map); EXTRA_EXT(MESA_texture_array); EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program); diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index d0e8a76..cb58394 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -321,6 +321,9 @@ descriptor=[ # Enums in OpenGL and ES 3.0 { "apis": ["GL", "GL_CORE", "GLES3"], "params": [ +# GL_ARB_ES3_compatibility + [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex), extra_ARB_ES3_compatibility"], + # GL_ARB_fragment_shader [ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB", "CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents), extra_ARB_fragment_shader" ], diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index bd180a5..c9bef15 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2940,6 +2940,9 @@ struct gl_constants /** GL_ARB_map_buffer_alignment */ GLuint MinMapBufferAlignment; + + /** ES 3.0 or GL_ARB_ES3_compatibility */ + GLint64 MaxElementIndex; In addition to Brian's comment, I'm not a huge fan of the way we comment structure fields like this. I'd much rather see some description of what the field means than what extension it is for. Something like: /** * Maximum value supported for an index in DrawElements and friends. * * This must be at least (1ull<<24)-1. The default value is * (1ull<<32)-1. * * \since ES 3.0 or GL_ARB_ES3_compatibility * \sa _mesa_init_constants */ }; ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 12/13] mesa: Support querying GL_MAX_ELEMENT_INDEX in ES 3
On Tue, Dec 11, 2012 at 11:00 AM, Ian Romanick wrote: > On 12/10/2012 02:28 PM, Matt Turner wrote: >> >> The ES 3 spec says that the minumum allowable value is 2^24-1, but the >> GL 4.3 and ARB_ES3_compatibility specs require 2^32-1, so return 2^32-1. >> >> Fixes es3conform's element_index_uint_constants test. >> --- >> src/mesa/main/context.c |3 +++ >> src/mesa/main/get.c |1 + >> src/mesa/main/get_hash_params.py |3 +++ >> src/mesa/main/mtypes.h |3 +++ >> 4 files changed, 10 insertions(+), 0 deletions(-) >> >> diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c >> index fc2db12..241a1f9 100644 >> --- a/src/mesa/main/context.c >> +++ b/src/mesa/main/context.c >> @@ -656,6 +656,9 @@ _mesa_init_constants(struct gl_context *ctx) >> >> /* PrimitiveRestart */ >> ctx->Const.PrimitiveRestartInSoftware = GL_FALSE; >> + >> + /* ES 3.0 or ARB_ES3_compatibility */ >> + ctx->Const.MaxElementIndex = UINT_MAX; > > > This should explicitly be 0x0. It is possible for UINT_MAX to have > some other value. It seems like madness these days, but stranger things > have happened. > > >> } >> >> >> diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c >> index 115d3c5..c7f8ada 100644 >> --- a/src/mesa/main/get.c >> +++ b/src/mesa/main/get.c >> @@ -304,6 +304,7 @@ static const int >> extra_ARB_uniform_buffer_object_and_geometry_shader[] = { >> >> >> EXTRA_EXT(ARB_ES2_compatibility); >> +EXTRA_EXT(ARB_ES3_compatibility); >> EXTRA_EXT(ARB_texture_cube_map); >> EXTRA_EXT(MESA_texture_array); >> EXTRA_EXT2(EXT_secondary_color, ARB_vertex_program); >> diff --git a/src/mesa/main/get_hash_params.py >> b/src/mesa/main/get_hash_params.py >> index d0e8a76..cb58394 100644 >> --- a/src/mesa/main/get_hash_params.py >> +++ b/src/mesa/main/get_hash_params.py >> @@ -321,6 +321,9 @@ descriptor=[ >> >> # Enums in OpenGL and ES 3.0 >> { "apis": ["GL", "GL_CORE", "GLES3"], "params": [ >> +# GL_ARB_ES3_compatibility >> + [ "MAX_ELEMENT_INDEX", "CONTEXT_INT64(Const.MaxElementIndex), >> extra_ARB_ES3_compatibility"], >> + >> # GL_ARB_fragment_shader >> [ "MAX_FRAGMENT_UNIFORM_COMPONENTS_ARB", >> "CONTEXT_INT(Const.FragmentProgram.MaxUniformComponents), >> extra_ARB_fragment_shader" ], >> >> diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h >> index bd180a5..c9bef15 100644 >> --- a/src/mesa/main/mtypes.h >> +++ b/src/mesa/main/mtypes.h >> @@ -2940,6 +2940,9 @@ struct gl_constants >> >> /** GL_ARB_map_buffer_alignment */ >> GLuint MinMapBufferAlignment; >> + >> + /** ES 3.0 or GL_ARB_ES3_compatibility */ >> + GLint64 MaxElementIndex; > > > In addition to Brian's comment, I'm not a huge fan of the way we comment > structure fields like this. I'd much rather see some description of what > the field means than what extension it is for. Something like: > >/** > * Maximum value supported for an index in DrawElements and friends. > * > * This must be at least (1ull<<24)-1. The default value is > * (1ull<<32)-1. > * > * \since ES 3.0 or GL_ARB_ES3_compatibility > * \sa _mesa_init_constants > */ > >> }; Both comments sound good to me. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py
On 12/10/2012 04:06 PM, Jordan Justen wrote: On Mon, Dec 10, 2012 at 2:28 PM, Matt Turner wrote: @@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void **p, union value *v) int api; api = ctx->API; + /* We index into the table_set[] list of per-API hash tables using the API's +* value in the gl_api enum. Since GLES 3 doesn't have an API_OPENGL* enum +* value since it's compatible with GLES2 its entry in table_set[] is at the +* end. +*/ + STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2); + if (_mesa_is_gles3(ctx)) { + api = API_OPENGL_LAST + 1; + } This seems somewhat unexpected. How do we keep track of the fact that API_OPENGL_LAST + 1 is used for GLES3 in this case? Having API_OPENGL_LAST and the STATIC_ASSERT should be sufficient. Are we sure GLES3 isn't a separate API from GLES2? :) I guess I don't know the motivation for keeping GLES2/3 under a combined API_GLES2 brings. Wasn't there the idea that since GLES3 was backward compatible, we'd just upgrade GLES2 to add the GLES3 features? But, now it seems we keep separating them in various ways. The problem is that we have a zillion places that check of GLES2. I don't want to have to find and update all of those to check for separate GLES2 and GLES3. Every single place that checks for GLES2 would have to be updated. diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 57dddf8..bd180a5 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3335,6 +3335,7 @@ typedef enum API_OPENGLES, API_OPENGLES2, API_OPENGL_CORE, + API_OPENGL_LAST = API_OPENGL_CORE, We could instead add: API_COUNT Then we wouldn't need to update the API_OPENGL_LAST enum if newer API's are added. The problem is we'll get compiler warnings in switch-statements that don't include either a default case or a case for API_COUNT. Since the warning are potentially useful, I'd rather not be forced to hide them. -Jordan ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3
On 12/10/2012 02:28 PM, Matt Turner wrote: Fixes the transform_feedback2_init_defaults test from es3conform. The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and TRANSFORM_FEEDBACK_ACTIVE. --- src/mesa/main/get.c |8 +++- src/mesa/main/get_hash_params.py | 10 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index 146612c..115d3c5 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = { EXTRA_END }; +/* FIXME: Remove this when i965 exposes transform_feedback2 */ This comment should be removed. These get enums are valid if either ARB_transformfeedback2 or ES3, so the check is correct. XFB2 is a superset of ES3, so I don't think we should assume that every driver that implements ES3 also supports XFB2... even if they all happen to do so. +static const int extra_ARB_transform_feedback2_api_es3[] = { + EXT(ARB_transform_feedback2), + EXTRA_API_ES3, + EXTRA_END +}; + static const int extra_ARB_uniform_buffer_object_and_geometry_shader[] = { EXT(ARB_uniform_buffer_object), EXT(ARB_geometry_shader4), @@ -321,7 +328,6 @@ EXTRA_EXT(ARB_seamless_cube_map); EXTRA_EXT(ARB_sync); EXTRA_EXT(ARB_vertex_shader); EXTRA_EXT(EXT_transform_feedback); -EXTRA_EXT(ARB_transform_feedback2); EXTRA_EXT(ARB_transform_feedback3); EXTRA_EXT(EXT_pixel_buffer_object); EXTRA_EXT(ARB_vertex_program); diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index d9fa693..ffdf96a 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -324,6 +324,11 @@ descriptor=[ # GL_ARB_sync [ "MAX_SERVER_WAIT_TIMEOUT", "CONTEXT_INT64(Const.MaxServerWaitTimeout), extra_ARB_sync" ], +# GL_ARB_transform_feedback2 + [ "TRANSFORM_FEEDBACK_BUFFER_PAUSED", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_ARB_transform_feedback2_api_es3" ], + [ "TRANSFORM_FEEDBACK_BUFFER_ACTIVE", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_ARB_transform_feedback2_api_es3" ], + [ "TRANSFORM_FEEDBACK_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_transform_feedback2_api_es3" ], + # GL_ARB_uniform_buffer_object [ "MAX_VERTEX_UNIFORM_BLOCKS", "CONTEXT_INT(Const.VertexProgram.MaxUniformBlocks), extra_ARB_uniform_buffer_object" ], [ "MAX_FRAGMENT_UNIFORM_BLOCKS", "CONTEXT_INT(Const.FragmentProgram.MaxUniformBlocks), extra_ARB_uniform_buffer_object" ], @@ -621,11 +626,6 @@ descriptor=[ # GL_EXT_texture_integer [ "RGBA_INTEGER_MODE_EXT", "BUFFER_BOOL(_IntegerColor), extra_EXT_texture_integer" ], -# GL_ARB_transform_feedback2 - [ "TRANSFORM_FEEDBACK_BUFFER_PAUSED", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_ARB_transform_feedback2" ], - [ "TRANSFORM_FEEDBACK_BUFFER_ACTIVE", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_ARB_transform_feedback2" ], - [ "TRANSFORM_FEEDBACK_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_transform_feedback2" ], - # GL_ARB_transform_feedback3 [ "MAX_TRANSFORM_FEEDBACK_BUFFERS", "CONTEXT_INT(Const.MaxTransformFeedbackBuffers), extra_ARB_transform_feedback3" ], [ "MAX_VERTEX_STREAMS", "CONTEXT_INT(Const.MaxVertexStreams), extra_ARB_transform_feedback3" ], ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 11/13] mesa: De-duplicate ES2 queries
On 12/10/2012 02:28 PM, Matt Turner wrote: From GL/GLES/GL_CORE and GLES2 -> GL/GL_CORE/GLES2. Yes, we really were exposing ES2_compatibility queries on ES 1. --- src/mesa/main/get_hash_params.py | 16 ++-- 1 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index 650fb38..d0e8a76 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -204,12 +204,6 @@ descriptor=[ [ "TEXTURE_COORD_ARRAY_TYPE", "LOC_CUSTOM, TYPE_ENUM, offsetof(struct gl_client_array, Type), NO_EXTRA" ], [ "TEXTURE_COORD_ARRAY_STRIDE", "LOC_CUSTOM, TYPE_INT, offsetof(struct gl_client_array, Stride), NO_EXTRA" ], -# GL_ARB_ES2_compatibility - [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ], - [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), extra_ARB_ES2_compatibility" ], - [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_ES2_compatibility" ], - [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_ES2_compatibility" ], - # GL_ARB_multitexture [ "MAX_TEXTURE_UNITS", "CONTEXT_INT(Const.MaxTextureUnits), NO_EXTRA" ], [ "CLIENT_ACTIVE_TEXTURE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], @@ -310,6 +304,12 @@ descriptor=[ # GL_NV_read_buffer [ "READ_BUFFER", "LOC_CUSTOM, TYPE_ENUM, NO_OFFSET, extra_NV_read_buffer_api_gl" ], + +# GL_ARB_ES2_compatibility + [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ], + [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), extra_ARB_ES2_compatibility" ], + [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_ES2_compatibility" ], + [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_ES2_compatibility" ], Are there any drivers in Mesa that support ES2 but do not advertise ARB_ES2_compatibility? I think this will break those drivers. ]}, # GLES3 is not a typo. @@ -373,10 +373,6 @@ descriptor=[ # Enums unique to OpenGL ES 2.0 { "apis": ["GLES2"], "params": [ - [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], - [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), NO_EXTRA" ], - [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], - [ "SHADER_COMPILER", "CONST(1), NO_EXTRA" ], # OES_get_program_binary [ "NUM_SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ], [ "SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ], ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3
On Tue, Dec 11, 2012 at 11:08 AM, Ian Romanick wrote: > On 12/10/2012 02:28 PM, Matt Turner wrote: >> >> Fixes the transform_feedback2_init_defaults test from es3conform. >> >> The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and >> TRANSFORM_FEEDBACK_ACTIVE. >> --- >> src/mesa/main/get.c |8 +++- >> src/mesa/main/get_hash_params.py | 10 +- >> 2 files changed, 12 insertions(+), 6 deletions(-) >> >> diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c >> index 146612c..115d3c5 100644 >> --- a/src/mesa/main/get.c >> +++ b/src/mesa/main/get.c >> @@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = { >> EXTRA_END >> }; >> >> +/* FIXME: Remove this when i965 exposes transform_feedback2 */ > > > This comment should be removed. These get enums are valid if either > ARB_transformfeedback2 or ES3, so the check is correct. XFB2 is a superset > of ES3, so I don't think we should assume that every driver that implements > ES3 also supports XFB2... even if they all happen to do so. This is the only case of an extension that i965 doesn't expose whose enums are in ES 3. I suppose some of the other extensions could not be exposed but a driver support ES 3, although it seems unlikely. Should all of the other patches (for extensions that i965 exposes) be modified to add EXTRA_API_ES3 to their extra_* blocks? I guess that's technically more correct. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py
On 12/10/2012 02:28 PM, Matt Turner wrote: Other than the comments on 6, 11, and 12, the series is Reviewed-by: Ian Romanick --- src/mesa/main/get.c | 16 src/mesa/main/get_hash_generator.py |8 +++- src/mesa/main/mtypes.h |1 + 3 files changed, 24 insertions(+), 1 deletions(-) diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index cd239a6..146612c 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -130,6 +130,7 @@ enum value_extra { EXTRA_VERSION_32, EXTRA_API_GL, EXTRA_API_ES2, + EXTRA_API_ES3, EXTRA_NEW_BUFFERS, EXTRA_NEW_FRAG_CLAMP, EXTRA_VALID_DRAW_BUFFER, @@ -873,6 +874,12 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d enabled++; } break; + case EXTRA_API_ES3: +if (_mesa_is_gles3(ctx)) { + total++; + enabled++; +} +break; case EXTRA_API_GL: if (_mesa_is_desktop_gl(ctx)) { total++; @@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void **p, union value *v) int api; api = ctx->API; + /* We index into the table_set[] list of per-API hash tables using the API's +* value in the gl_api enum. Since GLES 3 doesn't have an API_OPENGL* enum +* value since it's compatible with GLES2 its entry in table_set[] is at the +* end. +*/ + STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2); + if (_mesa_is_gles3(ctx)) { + api = API_OPENGL_LAST + 1; + } mask = Elements(table(api)) - 1; hash = (pname * prime_factor); while (1) { diff --git a/src/mesa/main/get_hash_generator.py b/src/mesa/main/get_hash_generator.py index 4b3f5f4..04bf9ff 100644 --- a/src/mesa/main/get_hash_generator.py +++ b/src/mesa/main/get_hash_generator.py @@ -44,7 +44,7 @@ prime_factor = 89 prime_step = 281 hash_table_size = 1024 -gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2"]) +gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3"]) def print_header(): print "typedef const unsigned short table_t[%d];\n" % (hash_table_size) @@ -67,6 +67,7 @@ api_enum = [ 'GLES', 'GLES2', 'GL_CORE', + 'GLES3', # Not in gl_api enum in mtypes.h ] def api_index(api): @@ -166,6 +167,9 @@ def generate_hash_tables(enum_list, enabled_apis, param_descriptors): for api in valid_apis: add_to_hash_table(tables[api], hash_val, len(params)) +# Also add GLES2 items to the GLES3 hash table +if api == "GLES2": + add_to_hash_table(tables["GLES3"], hash_val, len(params)) params.append(["GL_" + enum_name, param[1]]) @@ -183,6 +187,8 @@ def opt_to_apis(feature): apis = set([_map[feature]]) if "GL" in apis: apis.add("GL_CORE") + if "GLES2" in apis: + apis.add("GLES3") return apis diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 57dddf8..bd180a5 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3335,6 +3335,7 @@ typedef enum API_OPENGLES, API_OPENGLES2, API_OPENGL_CORE, + API_OPENGL_LAST = API_OPENGL_CORE, } gl_api; /** ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 11/13] mesa: De-duplicate ES2 queries
On Tue, Dec 11, 2012 at 11:12 AM, Ian Romanick wrote: > On 12/10/2012 02:28 PM, Matt Turner wrote: >> >> From GL/GLES/GL_CORE and GLES2 -> GL/GL_CORE/GLES2. >> >> Yes, we really were exposing ES2_compatibility queries on ES 1. >> --- >> src/mesa/main/get_hash_params.py | 16 ++-- >> 1 files changed, 6 insertions(+), 10 deletions(-) >> >> diff --git a/src/mesa/main/get_hash_params.py >> b/src/mesa/main/get_hash_params.py >> index 650fb38..d0e8a76 100644 >> --- a/src/mesa/main/get_hash_params.py >> +++ b/src/mesa/main/get_hash_params.py >> @@ -204,12 +204,6 @@ descriptor=[ >> [ "TEXTURE_COORD_ARRAY_TYPE", "LOC_CUSTOM, TYPE_ENUM, offsetof(struct >> gl_client_array, Type), NO_EXTRA" ], >> [ "TEXTURE_COORD_ARRAY_STRIDE", "LOC_CUSTOM, TYPE_INT, offsetof(struct >> gl_client_array, Stride), NO_EXTRA" ], >> >> -# GL_ARB_ES2_compatibility >> - [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ], >> - [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), >> extra_ARB_ES2_compatibility" ], >> - [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, >> extra_ARB_ES2_compatibility" ], >> - [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, >> extra_ARB_ES2_compatibility" ], >> - >> # GL_ARB_multitexture >> [ "MAX_TEXTURE_UNITS", "CONTEXT_INT(Const.MaxTextureUnits), NO_EXTRA" >> ], >> [ "CLIENT_ACTIVE_TEXTURE", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], >> @@ -310,6 +304,12 @@ descriptor=[ >> >> # GL_NV_read_buffer >> [ "READ_BUFFER", "LOC_CUSTOM, TYPE_ENUM, NO_OFFSET, >> extra_NV_read_buffer_api_gl" ], >> + >> +# GL_ARB_ES2_compatibility >> + [ "SHADER_COMPILER", "CONST(1), extra_ARB_ES2_compatibility" ], >> + [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), >> extra_ARB_ES2_compatibility" ], >> + [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, >> extra_ARB_ES2_compatibility" ], >> + [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, >> extra_ARB_ES2_compatibility" ], > > > Are there any drivers in Mesa that support ES2 but do not advertise > ARB_ES2_compatibility? I think this will break those drivers. > > >> ]}, >> >> # GLES3 is not a typo. >> @@ -373,10 +373,6 @@ descriptor=[ >> >> # Enums unique to OpenGL ES 2.0 >> { "apis": ["GLES2"], "params": [ >> - [ "MAX_FRAGMENT_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" >> ], >> - [ "MAX_VARYING_VECTORS", "CONTEXT_INT(Const.MaxVarying), NO_EXTRA" ], >> - [ "MAX_VERTEX_UNIFORM_VECTORS", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], >> - [ "SHADER_COMPILER", "CONST(1), NO_EXTRA" ], >> # OES_get_program_binary >> [ "NUM_SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ], >> [ "SHADER_BINARY_FORMATS", "CONST(0), NO_EXTRA" ], >> > Oh, the original version of this patch added EXTRA_API_ES2 to the extra_* block. Consider that fixed. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 06/13] mesa: Allow glGet* queries on ARB_transform_feedback2 data in ES 3
On 12/11/2012 11:14 AM, Matt Turner wrote: On Tue, Dec 11, 2012 at 11:08 AM, Ian Romanick wrote: On 12/10/2012 02:28 PM, Matt Turner wrote: Fixes the transform_feedback2_init_defaults test from es3conform. The ES 3 spec lists these as TRANSFORM_FEEDBACK_PAUSED and TRANSFORM_FEEDBACK_ACTIVE. --- src/mesa/main/get.c |8 +++- src/mesa/main/get_hash_params.py | 10 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index 146612c..115d3c5 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -289,6 +289,13 @@ static const int extra_texture_buffer_object[] = { EXTRA_END }; +/* FIXME: Remove this when i965 exposes transform_feedback2 */ This comment should be removed. These get enums are valid if either ARB_transformfeedback2 or ES3, so the check is correct. XFB2 is a superset of ES3, so I don't think we should assume that every driver that implements ES3 also supports XFB2... even if they all happen to do so. This is the only case of an extension that i965 doesn't expose whose enums are in ES 3. I suppose some of the other extensions could not be exposed but a driver support ES 3, although it seems unlikely. Should all of the other patches (for extensions that i965 exposes) be modified to add EXTRA_API_ES3 to their extra_* blocks? I guess that's technically more correct. In all of the other cases, the driver must expose the extensions in order to have ES3. Note the checks in compute_version_es2. This is because ES3 took the whole extension. The parts of XFB2 that a need to be supported are either already required by meta or are implemented completely in core Mesa. XFB2 is kind of weird in this respect. So, I don't think any of the other checks need to be modified. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers
I'm not familiar enough with the existing code to feel comfortable reviewing it, but I've run it through a full piglit test run (using tests/all.tests w/ GL/GLX enabled) without noticing any issues. Also, Reaction Quake 3 performance went up by ~25% as a result of this series on my Radeon 6850. http://openbenchmarking.org/result/1212102-SU-SUBALLOCT33 For my part: Tested-by: Aaron Watry --Aaron Watry u_upload_mgr suballocates memory from a large buffer and maps the allocated range (unsychronized), which is perfect for short-lived staging buffers. This reduces the number of relocations sent to the kernel. --- src/gallium/drivers/r600/r600_buffer.c | 30 +++--- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c index 9e2cf66..e674e13 100644 --- a/src/gallium/drivers/r600/r600_buffer.c +++ b/src/gallium/drivers/r600/r600_buffer.c @@ -66,7 +66,8 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx, unsigned usage, const struct pipe_box *box, struct pipe_transfer **ptransfer, - void *data, struct r600_resource *staging) + void *data, struct r600_resource *staging, + unsigned offset) { struct r600_context *rctx = (struct r600_context*)ctx; struct r600_transfer *transfer = util_slab_alloc(&rctx->pool_transfers); @@ -77,8 +78,7 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx, transfer->transfer.box = *box; transfer->transfer.stride = 0; transfer->transfer.layer_stride = 0; - transfer->staging = NULL; - transfer->offset = 0; + transfer->offset = offset; transfer->staging = staging; *ptransfer =&transfer->transfer; return data; @@ -147,18 +147,17 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, if (rctx->ws->cs_is_buffer_referenced(rctx->cs, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. */ - struct r600_resource *staging = (struct r600_resource*) - pipe_buffer_create(ctx->screen, PIPE_BIND_VERTEX_BUFFER, - PIPE_USAGE_STAGING, - box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT)); - data = rctx->ws->buffer_map(staging->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE); + unsigned offset; + struct r600_resource *staging = NULL; - if (!data) - return NULL; + u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), + &offset, (struct pipe_resource**)&staging, (void**)&data); - data += box->x % R600_MAP_BUFFER_ALIGNMENT; - return r600_buffer_get_transfer(ctx, resource, level, usage, box, - ptransfer, data, staging); + if (staging) { + data += box->x % R600_MAP_BUFFER_ALIGNMENT; + return r600_buffer_get_transfer(ctx, resource, level, usage, box, + ptransfer, data, staging, offset); + } } } @@ -169,7 +168,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, data += box->x; return r600_buffer_get_transfer(ctx, resource, level, usage, box, - ptransfer, data, NULL); + ptransfer, data, NULL, 0); } static void r600_buffer_transfer_unmap(struct pipe_context *pipe, @@ -180,7 +179,8 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe, if (rtransfer->staging) { struct pipe_box box; - u_box_1d(transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, transfer->box.width,&box); + u_box_1d(rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, +transfer->box.width,&box); /* Copy the staging buffer into the original one. */ r600_copy_buffer(pipe, transfer->resource, transfer->box.x, -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers
On Tue, Dec 11, 2012 at 8:24 PM, Aaron Watry wrote: > I'm not familiar enough with the existing code to feel comfortable reviewing > it, but I've run it through a full piglit test run (using tests/all.tests w/ > GL/GLX enabled) without noticing any issues. > > Also, Reaction Quake 3 performance went up by ~25% as a result of this > series on my Radeon 6850. > http://openbenchmarking.org/result/1212102-SU-SUBALLOCT33 > > For my part: > Tested-by: Aaron Watry Awesome, thanks for testing! Marek ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/4] r600g: use u_upload_mgr for allocating staging transfer buffers
On Mon, Dec 10, 2012 at 3:47 PM, Marek Olšák wrote: > u_upload_mgr suballocates memory from a large buffer and maps the allocated > range (unsychronized), which is perfect for short-lived staging buffers. > > This reduces the number of relocations sent to the kernel. Series looks good to me. Reviewed-by: Alex Deucher > --- > src/gallium/drivers/r600/r600_buffer.c | 30 +++--- > 1 file changed, 15 insertions(+), 15 deletions(-) > > diff --git a/src/gallium/drivers/r600/r600_buffer.c > b/src/gallium/drivers/r600/r600_buffer.c > index 9e2cf66..e674e13 100644 > --- a/src/gallium/drivers/r600/r600_buffer.c > +++ b/src/gallium/drivers/r600/r600_buffer.c > @@ -66,7 +66,8 @@ static void *r600_buffer_get_transfer(struct pipe_context > *ctx, >unsigned usage, >const struct pipe_box *box, > struct pipe_transfer **ptransfer, > - void *data, struct r600_resource > *staging) > + void *data, struct r600_resource > *staging, > + unsigned offset) > { > struct r600_context *rctx = (struct r600_context*)ctx; > struct r600_transfer *transfer = > util_slab_alloc(&rctx->pool_transfers); > @@ -77,8 +78,7 @@ static void *r600_buffer_get_transfer(struct pipe_context > *ctx, > transfer->transfer.box = *box; > transfer->transfer.stride = 0; > transfer->transfer.layer_stride = 0; > - transfer->staging = NULL; > - transfer->offset = 0; > + transfer->offset = offset; > transfer->staging = staging; > *ptransfer = &transfer->transfer; > return data; > @@ -147,18 +147,17 @@ static void *r600_buffer_transfer_map(struct > pipe_context *ctx, > if (rctx->ws->cs_is_buffer_referenced(rctx->cs, > rbuffer->cs_buf, RADEON_USAGE_READWRITE) || > rctx->ws->buffer_is_busy(rbuffer->buf, > RADEON_USAGE_READWRITE)) { > /* Do a wait-free write-only transfer using a > temporary buffer. */ > - struct r600_resource *staging = (struct > r600_resource*) > - pipe_buffer_create(ctx->screen, > PIPE_BIND_VERTEX_BUFFER, > - PIPE_USAGE_STAGING, > - box->width + (box->x % > R600_MAP_BUFFER_ALIGNMENT)); > - data = rctx->ws->buffer_map(staging->cs_buf, > rctx->cs, PIPE_TRANSFER_WRITE); > + unsigned offset; > + struct r600_resource *staging = NULL; > > - if (!data) > - return NULL; > + u_upload_alloc(rctx->uploader, 0, box->width + > (box->x % R600_MAP_BUFFER_ALIGNMENT), > + &offset, (struct > pipe_resource**)&staging, (void**)&data); > > - data += box->x % R600_MAP_BUFFER_ALIGNMENT; > - return r600_buffer_get_transfer(ctx, resource, level, > usage, box, > - ptransfer, data, > staging); > + if (staging) { > + data += box->x % R600_MAP_BUFFER_ALIGNMENT; > + return r600_buffer_get_transfer(ctx, > resource, level, usage, box, > + ptransfer, > data, staging, offset); > + } > } > } > > @@ -169,7 +168,7 @@ static void *r600_buffer_transfer_map(struct pipe_context > *ctx, > data += box->x; > > return r600_buffer_get_transfer(ctx, resource, level, usage, box, > - ptransfer, data, NULL); > + ptransfer, data, NULL, 0); > } > > static void r600_buffer_transfer_unmap(struct pipe_context *pipe, > @@ -180,7 +179,8 @@ static void r600_buffer_transfer_unmap(struct > pipe_context *pipe, > > if (rtransfer->staging) { > struct pipe_box box; > - u_box_1d(transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, > transfer->box.width, &box); > + u_box_1d(rtransfer->offset + transfer->box.x % > R600_MAP_BUFFER_ALIGNMENT, > +transfer->box.width, &box); > > /* Copy the staging buffer into the original one. */ > r600_copy_buffer(pipe, transfer->resource, transfer->box.x, > -- > 1.7.10.4 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/list
[Mesa-dev] [PATCH] llvmpipe: remove unneeded draw_flush() call
This is redundant since we're calling draw_bind_fragment_shader() which already does a flush. v2: the redundant flush in llvmpipe_set_constant_buffer() has already been removed by commit 3427466e6dbbb8db7c1ecda6b3859ca1cc5827a3 --- src/gallium/drivers/llvmpipe/lp_state_fs.c |2 -- 1 files changed, 0 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index bf59a43..5a8351b 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -2312,8 +2312,6 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs) if (llvmpipe->fs == fs) return; - draw_flush(llvmpipe->draw); - llvmpipe->fs = (struct lp_fragment_shader *) fs; draw_bind_fragment_shader(llvmpipe->draw, -- 1.7.3.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 01/13] Add ES 3 handling to get.c and get_hash_generator.py
On 12/11/2012 11:04 AM, Ian Romanick wrote: On 12/10/2012 04:06 PM, Jordan Justen wrote: On Mon, Dec 10, 2012 at 2:28 PM, Matt Turner wrote: @@ -966,6 +973,15 @@ find_value(const char *func, GLenum pname, void **p, union value *v) int api; api = ctx->API; + /* We index into the table_set[] list of per-API hash tables using the API's +* value in the gl_api enum. Since GLES 3 doesn't have an API_OPENGL* enum +* value since it's compatible with GLES2 its entry in table_set[] is at the +* end. +*/ + STATIC_ASSERT(Elements(table_set) == API_OPENGL_LAST + 2); + if (_mesa_is_gles3(ctx)) { + api = API_OPENGL_LAST + 1; + } This seems somewhat unexpected. How do we keep track of the fact that API_OPENGL_LAST + 1 is used for GLES3 in this case? Having API_OPENGL_LAST and the STATIC_ASSERT should be sufficient. Are we sure GLES3 isn't a separate API from GLES2? :) I guess I don't know the motivation for keeping GLES2/3 under a combined API_GLES2 brings. Wasn't there the idea that since GLES3 was backward compatible, we'd just upgrade GLES2 to add the GLES3 features? But, now it seems we keep separating them in various ways. The problem is that we have a zillion places that check of GLES2. I don't want to have to find and update all of those to check for separate GLES2 and GLES3. Every single place that checks for GLES2 would have to be updated. Where "zillion" means 50 (measured by git grep API_OPENGLES2 | wc -l). In contrast, there are 72 occurrances of API_OPENGLES3. I'm not taking a position one way or another, just saying that 50 isn't that much work. --Ken ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/5] i965/fs: Add a note explaining a detail of register_coalesce_2().
--- src/mesa/drivers/dri/i965/brw_fs.cpp | 21 + 1 file changed, 21 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f428a83..c520364 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1846,7 +1846,28 @@ fs_visitor::register_coalesce_2() } inst->remove(); + + /* We don't need to recalculate live intervals inside the loop despite + * flagging live_intervals_valid because we only use live intervals for + * the interferes test, and we must have had a situation where the + * intervals were: + * + * from to + * ^ + * | + * v + *^ + *| + *v + * + * Some register R that might get coalesced with one of these two could + * only be referencing "to", otherwise "from"'s range would have been + * longer. R's range could also only start at the end of "to" or later, + * otherwise it will conflict with "to" when we try to coalesce "to" + * into Rw anyway. + */ live_intervals_valid = false; + progress = true; continue; } -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/5] i965/fs: Drop an unnecessary _safe on a list walk.
--- src/mesa/drivers/dri/i965/brw_fs.cpp |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c520364..62800b1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1828,7 +1828,7 @@ fs_visitor::register_coalesce_2() int reg_to = inst->dst.reg; int reg_to_offset = inst->dst.reg_offset; - foreach_list_safe(node, &this->instructions) { + foreach_list(node, &this->instructions) { fs_inst *scan_inst = (fs_inst *)node; if (scan_inst->dst.file == GRF && -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/5] i965/vs: Add a unit test for opt_compute_to_mrf().
The compute-to-mrf code is really twitchy, and it's hard to construct GLSL testcases for it. This unit test is also really hard to work with (for example, if your instruction is removed by dead code elimination, you end up inspecting something irrelevant), but I did use it for debugging some of the commits to follow. I called it test_vec4_register_coalesce because the compute-to-mrf code is about to morph into that. --- src/mesa/drivers/dri/i965/.gitignore |1 + src/mesa/drivers/dri/i965/Makefile.am | 10 +- .../dri/i965/test_vec4_register_coalesce.cpp | 124 3 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp diff --git a/src/mesa/drivers/dri/i965/.gitignore b/src/mesa/drivers/dri/i965/.gitignore index c6ea403..8208295 100644 --- a/src/mesa/drivers/dri/i965/.gitignore +++ b/src/mesa/drivers/dri/i965/.gitignore @@ -2,3 +2,4 @@ Makefile i965_symbols_test libi965_dri.la test_eu_compact +test_vec4_register_coalesce diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index f5ca12b..8e96467 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -66,8 +66,14 @@ i965_dri_la_SOURCES = i965_dri_la_LIBADD = $(COMMON_LIBS) i965_dri_la_LDFLAGS = -module -avoid-version -shared -TESTS = test_eu_compact -check_PROGRAMS = test_eu_compact +TESTS = \ +test_eu_compact \ +test_vec4_register_coalesce +check_PROGRAMS = $(TESTS) + +test_vec4_register_coalesce_LDADD = \ +$(TEST_LIBS) \ +$(top_builddir)/src/gtest/libgtest.la test_eu_compact_SOURCES = \ test_eu_compact.c diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp new file mode 100644 index 000..c79b0fd --- /dev/null +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -0,0 +1,124 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "brw_vec4.h" + +using namespace brw; + +int ret = 0; + +#define register_coalesce(v) _register_coalesce(v, __FUNCTION__) + +class register_coalesce_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_context *brw; + struct intel_context *intel; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vs_compile *c; + vec4_visitor *v; +}; + +void register_coalesce_test::SetUp() +{ + brw = (struct brw_context *)calloc(1, sizeof(*brw)); + intel = &brw->intel; + ctx = &intel->ctx; + + c = ralloc(NULL, struct brw_vs_compile); + c->vp = ralloc(NULL, struct brw_vertex_program); + + shader_prog = ralloc(NULL, struct gl_shader_program); + + v = new vec4_visitor(brw, c, shader_prog, NULL, NULL); + + _mesa_init_vertex_program(ctx, &c->vp->program, GL_VERTEX_SHADER, 0); + + intel->gen = 4; +} + +static void +_register_coalesce(vec4_visitor *v, const char *func) +{ + bool print = false; + + if (print) { + printf("%s: instructions before:\n", func); + v->dump_instructions(); + } + + v->opt_compute_to_mrf(); + + if (print) { + printf("%s: instructions after:\n", func); + v->dump_instructions(); + } +} + +TEST_F(register_coalesce_test, test_easy_success) +{ + src_reg something = src_reg(v, glsl_type::float_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + dst_reg init; + + dst_reg m0 = dst_reg(MRF, 0); + m0.writemask = WRITEMASK_X; + m0.type = BRW_REGISTER_TYPE_F; + + vec4_instruction *mul = v->emit(v->MUL(temp, something, src_reg(1.0f))); + v->emit(v->MOV(m0, src_reg(temp))); + + register_coalesce(v); + + EXPECT_EQ(mul->dst.file, MRF); +} + + +TEST_F(register_coalesce_test, test_multiple_use) +{ + src_reg something = src
[Mesa-dev] [PATCH 4/5] i965/vs: Extend opt_compute_to_mrf to handle limited "reswizzling"
The way our visitor works, scalar expression/swizzle results that get stored in channels other than .x will have an intermediate MOV from their result in the .x channel to the real .y (or whatever) channel, and similarly for vec2/vec3 results. By knowing how to adjust DP4-type instructions for optimizing out a swizzled MOV, we can reduce instructions in common matrix multiplication cases. --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 99 ++-- src/mesa/drivers/dri/i965/brw_vec4.h |2 + .../dri/i965/test_vec4_register_coalesce.cpp | 21 + 3 files changed, 113 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 436ba97..7ab37e7 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -600,6 +600,85 @@ vec4_visitor::move_push_constants_to_pull_constants() pack_uniform_registers(); } +bool +vec4_instruction::can_reswizzle_dst(int dst_writemask, +int swizzle, +int swizzle_mask) +{ + /* If this instruction sets anything not referenced by swizzle, then we'd +* totally break it when we reswizzle. +*/ + if (dst.writemask & ~swizzle_mask) + return false; + + switch (opcode) { + case BRW_OPCODE_DP4: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP2: + return true; + default: + /* Check if there happens to be no reswizzling required. */ + for (int c = 0; c < 4; c++) { + int bit = 1 << BRW_GET_SWZ(swizzle, c); + /* Skip components of the swizzle not used by the dst. */ + if (!(dst_writemask & (1 << c))) +continue; + + /* We don't do the reswizzling yet, so just sanity check that we + * don't have to. + */ + if (bit != (1 << c)) +return false; + } + return true; + } +} + +/** + * For any channels in the swizzle's source that were populated by this + * instruction, rewrite the instruction to put the appropriate result directly + * in those channels. + * + * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x + */ +void +vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle) +{ + int new_writemask = 0; + + switch (opcode) { + case BRW_OPCODE_DP4: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP2: + for (int c = 0; c < 4; c++) { + int bit = 1 << BRW_GET_SWZ(swizzle, c); + /* Skip components of the swizzle not used by the dst. */ + if (!(dst_writemask & (1 << c))) +continue; + /* If we were populating this component, then populate the + * corresponding channel of the new dst. + */ + if (dst.writemask & bit) +new_writemask |= (1 << c); + } + dst.writemask = new_writemask; + break; + default: + for (int c = 0; c < 4; c++) { + int bit = 1 << BRW_GET_SWZ(swizzle, c); + /* Skip components of the swizzle not used by the dst. */ + if (!(dst_writemask & (1 << c))) +continue; + + /* We don't do the reswizzling yet, so just sanity check that we + * don't have to. + */ + assert(bit == (1 << c)); + } + break; + } +} + /* * Tries to reduce extra MOV instructions by taking GRFs that get just * written and then MOVed into an MRF and making the original write of @@ -641,26 +720,20 @@ vec4_visitor::opt_compute_to_mrf() */ bool chans_needed[4] = {false, false, false, false}; int chans_remaining = 0; + int swizzle_mask = 0; for (int i = 0; i < 4; i++) { int chan = BRW_GET_SWZ(inst->src[0].swizzle, i); if (!(inst->dst.writemask & (1 << i))) continue; -/* We don't handle compute-to-MRF across a swizzle. We would - * need to be able to rewrite instructions above to output - * results to different channels. - */ -if (chan != i) - chans_remaining = 5; + swizzle_mask |= (1 << chan); if (!chans_needed[chan]) { chans_needed[chan] = true; chans_remaining++; } } - if (chans_remaining > 4) -continue; /* Now walk up the instruction stream trying to see if we can * rewrite everything writing to the GRF into the MRF instead. @@ -689,6 +762,13 @@ vec4_visitor::opt_compute_to_mrf() } } +/* If we can't handle the swizzle, bail. */ +if (!scan_inst->can_reswizzle_dst(inst->dst.writemask, + inst->src[0].swizzle, + swizzle_mask)) { + break; +} + /* Mark which channels we found unconditional writes for. */ if (!scan_inst->predicate) { for (int i = 0; i < 4
[Mesa-dev] [PATCH 5/5] i965: Generalize VS compute-to-MRF for compute-to-another-GRF, too.
No statistically significant performance difference on glbenchmark 2.7 (n=60). It reduces cycles spent in the vertex shader by 3.3% +/- 0.8% (n=5), but that's only about .3% of all cycles spent according to the fixed shader_time. --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 129 +++- src/mesa/drivers/dri/i965/brw_vec4.h |2 +- .../dri/i965/test_vec4_register_coalesce.cpp | 58 - 3 files changed, 128 insertions(+), 61 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 7ab37e7..079bbab 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -680,12 +680,12 @@ vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle) } /* - * Tries to reduce extra MOV instructions by taking GRFs that get just - * written and then MOVed into an MRF and making the original write of - * the GRF write directly to the MRF instead. + * Tries to reduce extra MOV instructions by taking temporary GRFs that get + * just written and then MOVed into another reg and making the original write + * of the GRF write directly to the final destination instead. */ bool -vec4_visitor::opt_compute_to_mrf() +vec4_visitor::opt_register_coalesce() { bool progress = false; int next_ip = 0; @@ -699,24 +699,25 @@ vec4_visitor::opt_compute_to_mrf() next_ip++; if (inst->opcode != BRW_OPCODE_MOV || + (inst->dst.file != GRF && inst->dst.file != MRF) || inst->predicate || - inst->dst.file != MRF || inst->src[0].file != GRF || + inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) continue; - int mrf = inst->dst.reg; + bool to_mrf = (inst->dst.file == MRF); - /* Can't compute-to-MRF this GRF if someone else was going to + /* Can't coalesce this GRF if someone else was going to * read it later. */ if (this->virtual_grf_use[inst->src[0].reg] > ip) continue; - /* We need to check interference with the MRF between this - * instruction and the earliest instruction involved in writing - * the GRF we're eliminating. To do that, keep track of which - * of our source channels we've seen initialized. + /* We need to check interference with the final destination between this + * instruction and the earliest instruction involved in writing the GRF + * we're eliminating. To do that, keep track of which of our source + * channels we've seen initialized. */ bool chans_needed[4] = {false, false, false, false}; int chans_remaining = 0; @@ -735,8 +736,9 @@ vec4_visitor::opt_compute_to_mrf() } } - /* Now walk up the instruction stream trying to see if we can - * rewrite everything writing to the GRF into the MRF instead. + /* Now walk up the instruction stream trying to see if we can rewrite + * everything writing to the temporary to write into the destination + * instead. */ vec4_instruction *scan_inst; for (scan_inst = (vec4_instruction *)inst->prev; @@ -745,22 +747,21 @@ vec4_visitor::opt_compute_to_mrf() if (scan_inst->dst.file == GRF && scan_inst->dst.reg == inst->src[0].reg && scan_inst->dst.reg_offset == inst->src[0].reg_offset) { - /* Found something writing to the reg we want to turn into -* a compute-to-MRF. -*/ - - /* SEND instructions can't have MRF as a destination. */ - if (scan_inst->mlen) - break; - - if (intel->gen >= 6) { - /* gen6 math instructions must have the destination be - * GRF, so no compute-to-MRF for them. - */ - if (scan_inst->is_math()) { - break; - } - } +/* Found something writing to the reg we want to coalesce away. */ +if (to_mrf) { + /* SEND instructions can't have MRF as a destination. */ + if (scan_inst->mlen) + break; + + if (intel->gen >= 6) { + /* gen6 math instructions must have the destination be + * GRF, so no compute-to-MRF for them. + */ + if (scan_inst->is_math()) { + break; + } + } +} /* If we can't handle the swizzle, bail. */ if (!scan_inst->can_reswizzle_dst(inst->dst.writemask, @@ -784,9 +785,8 @@ vec4_visitor::opt_compute_to_mrf() break; } -/* We don't handle flow control here. Most computation of - * values that end up in MRFs are shortly before the MRF - * write anyway. +/* We don't han
[Mesa-dev] [PATCH 00/10] glsl: Implement varying packing.
This patch series adds varying packing to Mesa, so that we can handle varyings composed of things other than vec4's without using up extra varying components. For the initial implementation I've chosen a strategy that operates exclusively at the GLSL IR level, so that it doesn't require the cooperation of the driver back-ends. This means that varying packing should be immediately useful for all drivers. However, there are some types of varying packing that can't be done using GLSL IR alone (for example, packing a "noperspective" varying and a "smooth" varying together), but should be possible on some drivers with a small amount of back-end work. I'm deferring that work for a later patch series. Also, packing of floats and ints together into the same "flat varying" should be possible for drivers that implement ARB_shader_bit_encoding--I'm also deferring that for a later patch series. The strategy is as follows: - Before assigning locations to varyings, we sort them into "packing classes" based on base type and interpolation mode (this is to ensure that we don't try to pack floats with ints, or smooth with flat, for example). - Within each packing class, we sort the varyings based on the number of vector elements. Vec4's (as well as matrices and arrays composed of vec4's) are packed first, then vec2's, then scalars, since this allows us to align them all to their natural alignment boundary, so we avoid the performance penalty of "double parking" a varying across two varying slots. Vec3's are packed last, double parking them if necessary. - For any varying slot that doesn't contain exactly one vec4, we generate GLSL IR to manually pack/unpack the varying in the shader. For instance, the following fragment shader: varying vec2 a; varying vec2 b; varying vec3 c; varying vec3 d; main() { ... } would get rewritten as follows: varying vec4 packed0; varying vec4 packed1; varying vec4 packed2; vec2 a; vec2 b; vec3 c; vec3 d; main() { a = packed0.xy; b = packed0.zw; c = packed1.xyz; d.x = packed1.w; // d is "double parked" across slots 1 and 2 d.yz = packed2.xy; ... } This GLSL IR is generated by a lowering pass, so that in the future we will have the option of disabling it for driver back-ends that are capable of natively understanding the packed varying format. - Finally, the linker code to handle transform feedback is modified to account for varying packing (e.g. by feeding back just a subset of the components of a varying slot rather than the entire varying slot). Fortunately transform feedback already has the infrastructure necessary to do this, since it was needed in order to implement glClipDistance. I believe this is enough to be useful for the vast majority of programs, and to get us passing the GLES3 conformance tests. Additional improvements, which I'm planning to defer to later patch series, include: - Allow uints and ints to be packed together in the same varying slot. This should be possible on all back-ends, since ints and uints may be interconverted without losing information. - On back-ends that support ARB_shader_bit_encoding, allow floats and ints to be packed together in the same varying slot, since ARB_shader_bit_encoding allows floating-point values to be encoded into ints without losing information. - On back-ends that can mix interpolation modes within a single varying slot, allow additional packing, with help from the driver back-end. For instance, i965 gen6 and above can in principle mix together all interpolation modes except for "flat" within a single varying slot, if we do a hopefully small amount of back-end work. - Allow a driver back-end to advertise a larger number of varying components to the linker than it advertises to the client program--this will allow us to ensure that varying packing *never* fails. For example, on i965 gen6 and above, after the above improvements are made, we should be able to pack any possible combination of varyings with a maximum waste of 3 varying components. That means, for example, that if the i965 driver advertises 17 varying slots to the linker (== 68 varying components), but advertises only 64 varying components to the the client program, then varying packing will always succeed. Note: I also have a new piglit test that exercises this code; I'll be publishing that to the Piglit list ASAP. [PATCH 01/10] glsl/lower_clip_distance: Update symbol table. [PATCH 02/10] glsl/linker: Always invalidate shader ins/outs, even in corner cases. [PATCH 03/10] glsl/linker: Make separate ir_variable field to mean "unmatched". [PATCH 04/10] glsl: Create a field to store fractional varying locations. [PATCH 05/10] glsl/linker: Defer recording transform feedback locations. [PATCH 06/10] glsl/linker: Subdivide the first phase of varying assignment. [PATCH 07/10] glsl/linker: Sort varyings by packing class, t
[Mesa-dev] [PATCH 01/10] glsl/lower_clip_distance: Update symbol table.
This patch modifies the clip distance lowering pass so that the new symbol it generates (glClipDistanceMESA) is added to the shader's symbol table. This will allow a later patch to modify the linker so that it finds transform feedback varyings using the symbol table rather than having to iterate through all the declarations in the shader. --- src/glsl/ir_optimization.h | 2 +- src/glsl/linker.cpp | 5 +++-- src/glsl/lower_clip_distance.cpp | 8 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index 2220d51..628096e 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -72,7 +72,7 @@ bool lower_noise(exec_list *instructions); bool lower_variable_index_to_cond_assign(exec_list *instructions, bool lower_input, bool lower_output, bool lower_temp, bool lower_uniform); bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz); -bool lower_clip_distance(exec_list *instructions); +bool lower_clip_distance(gl_shader *shader); void lower_output_reads(exec_list *instructions); void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions); bool optimize_redundant_jumps(exec_list *instructions); diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 29fc5d8..802323e 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -2568,8 +2568,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) if (!prog->LinkStatus) goto done; - if (ctx->ShaderCompilerOptions[i].LowerClipDistance) - lower_clip_distance(prog->_LinkedShaders[i]->ir); + if (ctx->ShaderCompilerOptions[i].LowerClipDistance) { + lower_clip_distance(prog->_LinkedShaders[i]); + } unsigned max_unroll = ctx->ShaderCompilerOptions[i].MaxUnrollIterations; diff --git a/src/glsl/lower_clip_distance.cpp b/src/glsl/lower_clip_distance.cpp index 0316914..09bdc36 100644 --- a/src/glsl/lower_clip_distance.cpp +++ b/src/glsl/lower_clip_distance.cpp @@ -45,6 +45,7 @@ * LowerClipDistance flag in gl_shader_compiler_options to true. */ +#include "glsl_symbol_table.h" #include "ir_hierarchical_visitor.h" #include "ir.h" @@ -334,11 +335,14 @@ lower_clip_distance_visitor::visit_leave(ir_call *ir) bool -lower_clip_distance(exec_list *instructions) +lower_clip_distance(gl_shader *shader) { lower_clip_distance_visitor v; - visit_list_elements(&v, instructions); + visit_list_elements(&v, shader->ir); + + if (v.new_clip_distance_var) + shader->symbols->add_variable(v.new_clip_distance_var); return v.progress; } -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 02/10] glsl/linker: Always invalidate shader ins/outs, even in corner cases.
Previously, link_invalidate_variable_locations() was only called during assign_attribute_or_color_locations() and assign_varying_locations(). This meant that in the corner case when there was only a vertex shader, and varyings were being captured by transform feedback, link_invalidate_variable_locations() wasn't being called for the varyings. This patch migrates the calls to link_invalidate_variable_locations() to link_shaders(), so that they will be called in all circumstances. In addition, it modifies the call semantics so that link_invalidate_variable_locations() need only be called once per shader stage (rather than once for inputs and once for outputs). --- src/glsl/linker.cpp | 43 +++ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 802323e..2523dc9 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -200,19 +200,31 @@ linker_warning(gl_shader_program *prog, const char *fmt, ...) void -link_invalidate_variable_locations(gl_shader *sh, enum ir_variable_mode mode, - int generic_base) +link_invalidate_variable_locations(gl_shader *sh, int input_base, + int output_base) { foreach_list(node, sh->ir) { ir_variable *const var = ((ir_instruction *) node)->as_variable(); - if ((var == NULL) || (var->mode != (unsigned) mode)) -continue; + if (var == NULL) + continue; + + int base; + switch (var->mode) { + case ir_var_in: + base = input_base; + break; + case ir_var_out: + base = output_base; + break; + default: + continue; + } /* Only assign locations for generic attributes / varyings / etc. */ - if ((var->location >= generic_base) && !var->explicit_location) - var->location = -1; + if ((var->location >= base) && !var->explicit_location) + var->location = -1; } } @@ -1309,8 +1321,6 @@ assign_attribute_or_color_locations(gl_shader_program *prog, (target_index == MESA_SHADER_VERTEX) ? ir_var_in : ir_var_out; - link_invalidate_variable_locations(sh, direction, generic_base); - /* Temporary storage for the set of attributes that need locations assigned. */ struct temp_attr { @@ -2072,10 +2082,6 @@ assign_varying_locations(struct gl_context *ctx, *not being inputs. This lets the optimizer eliminate them. */ - link_invalidate_variable_locations(producer, ir_var_out, VERT_RESULT_VAR0); - if (consumer) - link_invalidate_variable_locations(consumer, ir_var_in, FRAG_ATTRIB_VAR0); - foreach_list(node, producer->ir) { ir_variable *const output_var = ((ir_instruction *) node)->as_variable(); @@ -2578,6 +2584,19 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) ; } + /* Mark all generic shader inputs and outputs as unpaired. */ + if (prog->_LinkedShaders[MESA_SHADER_VERTEX] != NULL) { + link_invalidate_variable_locations( +prog->_LinkedShaders[MESA_SHADER_VERTEX], +VERT_ATTRIB_GENERIC0, VERT_RESULT_VAR0); + } + /* FINISHME: Geometry shaders not implemented yet */ + if (prog->_LinkedShaders[MESA_SHADER_FRAGMENT] != NULL) { + link_invalidate_variable_locations( +prog->_LinkedShaders[MESA_SHADER_FRAGMENT], +FRAG_ATTRIB_VAR0, FRAG_RESULT_DATA0); + } + /* FINISHME: The value of the max_attribute_index parameter is * FINISHME: implementation dependent based on the value of * FINISHME: GL_MAX_VERTEX_ATTRIBS. GL_MAX_VERTEX_ATTRIBS must be -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 03/10] glsl/linker: Make separate ir_variable field to mean "unmatched".
Previously, the linker used a value of -1 in ir_variable::location to denote a generic input or output of the shader that had not yet been matched up to a variable in another pipeline stage. This patch introduces a new ir_variable field, is_unmatched_generic_inout, for that purpose. In future patches, this will allow us to separate the process of matching varyings between shader stages from the processes of assigning locations to those varying. That will in turn pave the way for packing varyings. --- src/glsl/ir.h | 9 + src/glsl/linker.cpp | 18 ++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 89c516c..e2ecb3d 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -437,6 +437,15 @@ public: unsigned has_initializer:1; /** +* Is this variable a generic output or input that has not yet been matched +* up to a variable in another stage of the pipeline? +* +* This is used by the linker as scratch storage while assigning locations +* to generic inputs and outputs. +*/ + unsigned is_unmatched_generic_inout:1; + + /** * \brief Layout qualifier for gl_FragDepth. * * This is not equal to \c ir_depth_layout_none if and only if this diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 2523dc9..ee6dc25 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -225,6 +225,11 @@ link_invalidate_variable_locations(gl_shader *sh, int input_base, */ if ((var->location >= base) && !var->explicit_location) var->location = -1; + + if ((var->location == -1) && !var->explicit_location) + var->is_unmatched_generic_inout = 1; + else + var->is_unmatched_generic_inout = 0; } } @@ -1362,6 +1367,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog, if (prog->AttributeBindings->get(binding, var->name)) { assert(binding >= VERT_ATTRIB_GENERIC0); var->location = binding; +var->is_unmatched_generic_inout = 0; } } else if (target_index == MESA_SHADER_FRAGMENT) { unsigned binding; @@ -1370,6 +1376,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog, if (prog->FragDataBindings->get(binding, var->name)) { assert(binding >= FRAG_RESULT_DATA0); var->location = binding; +var->is_unmatched_generic_inout = 0; if (prog->FragDataIndexBindings->get(index, var->name)) { var->index = index; @@ -1485,6 +1492,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog, } to_assign[i].var->location = generic_base + location; + to_assign[i].var->is_unmatched_generic_inout = 0; used_locations |= (use_mask << location); } @@ -1508,7 +1516,7 @@ demote_shader_inputs_and_outputs(gl_shader *sh, enum ir_variable_mode mode) * its value is used by other shader stages. This will cause the variable * to have a location assigned. */ - if (var->location == -1) { + if (var->is_unmatched_generic_inout) { var->mode = ir_var_auto; } } @@ -1985,7 +1993,7 @@ void assign_varying_location(ir_variable *input_var, ir_variable *output_var, unsigned *input_index, unsigned *output_index) { - if (output_var->location != -1) { + if (!output_var->is_unmatched_generic_inout) { /* Location already assigned. */ return; } @@ -1993,9 +2001,11 @@ assign_varying_location(ir_variable *input_var, ir_variable *output_var, if (input_var) { assert(input_var->location == -1); input_var->location = *input_index; + input_var->is_unmatched_generic_inout = 0; } output_var->location = *output_index; + output_var->is_unmatched_generic_inout = 0; /* FINISHME: Support for "varying" records in GLSL 1.50. */ assert(!output_var->type->is_record()); @@ -2105,7 +2115,7 @@ assign_varying_locations(struct gl_context *ctx, if (!tfeedback_decls[i].is_assigned() && tfeedback_decls[i].matches_var(output_var)) { -if (output_var->location == -1) { +if (output_var->is_unmatched_generic_inout) { assign_varying_location(input_var, output_var, &input_index, &output_index); } @@ -2124,7 +2134,7 @@ assign_varying_locations(struct gl_context *ctx, if ((var == NULL) || (var->mode != ir_var_in)) continue; - if (var->location == -1) { + if (var->is_unmatched_generic_inout) { if (prog->Version <= 120) { /* On page 25 (page 31 of the PDF) of the GLSL 1.20 spec: * -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 04/10] glsl: Create a field to store fractional varying locations.
Currently, the location of each varying is recorded in ir_variable as a multiple of the size of a vec4. In order to pack varyings, we need to be able to record, e.g. that a vec2 is stored in the second half of a varying slot rather than the first half. This patch introduces a field ir_variable::location_frac, which represents the offset within a vec4 where a varying's value is stored. Varyings that are not subject to packing will always have a location_frac value of zero. --- src/glsl/ir.cpp | 1 + src/glsl/ir.h | 9 + src/glsl/linker.cpp | 6 -- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index 7b0a487..703f5ec 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -1492,6 +1492,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name, this->explicit_location = false; this->has_initializer = false; this->location = -1; + this->location_frac = 0; this->uniform_block = -1; this->warn_extension = NULL; this->constant_value = NULL; diff --git a/src/glsl/ir.h b/src/glsl/ir.h index e2ecb3d..85fc5ce 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -446,6 +446,15 @@ public: unsigned is_unmatched_generic_inout:1; /** +* If non-zero, then this variable may be packed along with other variables +* into a single varying slot, so this offset should be applied when +* accessing components. For example, an offset of 1 means that the x +* component of this variable is actually stored in component y of the +* location specified by \c location. +*/ + unsigned location_frac:2; + + /** * \brief Layout qualifier for gl_FragDepth. * * This is not equal to \c ir_depth_layout_none if and only if this diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index ee6dc25..b13a6aa 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -226,10 +226,12 @@ link_invalidate_variable_locations(gl_shader *sh, int input_base, if ((var->location >= base) && !var->explicit_location) var->location = -1; - if ((var->location == -1) && !var->explicit_location) + if ((var->location == -1) && !var->explicit_location) { var->is_unmatched_generic_inout = 1; - else + var->location_frac = 0; + } else { var->is_unmatched_generic_inout = 0; + } } } -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 05/10] glsl/linker: Defer recording transform feedback locations.
This patch subdivides the loop that assigns varying locations into two phases: one phase to match up varyings between shader stages (and assign them varying locations), and a second phase to record the varying assignments for use by transform feedback. This paves the way for varying packing, which will require us to further subdivide the first phase. In addition, it lets us avoid a clumsy O(n^2) algorithm, since we can now record the locations of all transform feedback varyings in a single pass through the tfeedback_decls array, rather than have to iterate through the array after assigning each varying. --- src/glsl/linker.cpp | 103 1 file changed, 48 insertions(+), 55 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index b13a6aa..dd77010 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1537,18 +1537,12 @@ public: static bool is_same(const tfeedback_decl &x, const tfeedback_decl &y); bool assign_location(struct gl_context *ctx, struct gl_shader_program *prog, ir_variable *output_var); - bool accumulate_num_outputs(struct gl_shader_program *prog, unsigned *count); + unsigned accumulate_num_outputs(struct gl_shader_program *prog); bool store(struct gl_context *ctx, struct gl_shader_program *prog, struct gl_transform_feedback_info *info, unsigned buffer, const unsigned max_outputs) const; - - /** -* True if assign_location() has been called for this object. -*/ - bool is_assigned() const - { - return this->location != -1; - } + ir_variable *find_output_var(gl_shader_program *prog, +gl_shader *producer) const; bool is_next_buffer_separator() const { @@ -1561,19 +1555,8 @@ public: } /** -* Determine whether this object refers to the variable var. -*/ - bool matches_var(ir_variable *var) const - { - if (this->is_clip_distance_mesa) - return strcmp(var->name, "gl_ClipDistanceMESA") == 0; - else - return strcmp(var->name, this->var_name) == 0; - } - - /** * The total number of varying components taken up by this variable. Only -* valid if is_assigned() is true. +* valid if assign_location() has been called. */ unsigned num_components() const { @@ -1822,34 +1805,18 @@ tfeedback_decl::assign_location(struct gl_context *ctx, } -bool -tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog, - unsigned *count) +unsigned +tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog) { if (!this->is_varying()) { - return true; - } - - if (!this->is_assigned()) { - /* From GL_EXT_transform_feedback: - * A program will fail to link if: - * - * * any variable name specified in the array is not - * declared as an output in the geometry shader (if present) or - * the vertex shader (if no geometry shader is present); - */ - linker_error(prog, "Transform feedback varying %s undeclared.", - this->orig_name); - return false; + return 0; } unsigned translated_size = this->size; if (this->is_clip_distance_mesa) translated_size = (translated_size + 3) / 4; - *count += translated_size * this->matrix_columns; - - return true; + return translated_size * this->matrix_columns; } @@ -1926,6 +1893,29 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog, } +ir_variable * +tfeedback_decl::find_output_var(gl_shader_program *prog, +gl_shader *producer) const +{ + const char *name = this->is_clip_distance_mesa + ? "gl_ClipDistanceMESA" : this->var_name; + ir_variable *var = producer->symbols->get_variable(name); + if (var && var->mode == ir_var_out) + return var; + + /* From GL_EXT_transform_feedback: +* A program will fail to link if: +* +* * any variable name specified in the array is not +* declared as an output in the geometry shader (if present) or +* the vertex shader (if no geometry shader is present); +*/ + linker_error(prog, "Transform feedback varying %s undeclared.", +this->orig_name); + return NULL; +} + + /** * Parse all the transform feedback declarations that were passed to * glTransformFeedbackVaryings() and store them in tfeedback_decl objects. @@ -2110,21 +2100,25 @@ assign_varying_locations(struct gl_context *ctx, assign_varying_location(input_var, output_var, &input_index, &output_index); } + } - for (unsigned i = 0; i < num_tfeedback_decls; ++i) { - if (!tfeedback_decls[i].is_varying()) -continue; + for (unsigned i = 0; i < num_tfeedback_decls; ++i) { + if (!tfeedback_decls[i].is_varying()) +
[Mesa-dev] [PATCH 06/10] glsl/linker: Subdivide the first phase of varying assignment.
This patch further subdivides the loop that assigns varying locations into two phases: one phase to match up the varyings between shader stages, and one phase to assign them varying locations. In between the two phases the matched varyings are stored in a new data structure called varying_matches. This will free us to be able to assign varying locations in any order, which will pave the way for packing varyings. Note that the new varying_matches::assign_locations() function returns the number of varying slots that were used; this return value will be used in a future patch. --- src/glsl/linker.cpp | 207 +--- 1 file changed, 163 insertions(+), 44 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index dd77010..7fa980f 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1961,58 +1961,167 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog, /** - * Assign a location for a variable that is produced in one pipeline stage - * (the "producer") and consumed in the next stage (the "consumer"). - * - * \param input_var is the input variable declaration in the consumer. - * - * \param output_var is the output variable declaration in the producer. - * - * \param input_index is the counter that keeps track of assigned input - *locations in the consumer. - * - * \param output_index is the counter that keeps track of assigned output - *locations in the producer. + * Data structure recording the relationship between outputs of one shader + * stage (the "producer") and inputs of another (the "consumer"). + */ +class varying_matches +{ +public: + varying_matches(); + ~varying_matches(); + void record(ir_variable *producer_var, ir_variable *consumer_var); + unsigned assign_locations(); + void store_locations(unsigned producer_base, unsigned consumer_base) const; + +private: + /** +* Structure recording the relationship between a single producer output +* and a single consumer input. +*/ + struct match { + /** + * The output variable in the producer stage. + */ + ir_variable *producer_var; + + /** + * The input variable in the consumer stage. + */ + ir_variable *consumer_var; + + /** + * The location which has been assigned for this varying. This is + * expressed in multiples of a float, with the first generic varying + * (i.e. the one referred to by VERT_RESULT_VAR0 or FRAG_ATTRIB_VAR0) + * represented by the value 0. + */ + unsigned generic_location; + } *matches; + + /** +* The number of elements in the \c matches array that are currently in +* use. +*/ + unsigned num_matches; + + /** +* The number of elements that were set aside for the \c matches array when +* it was allocated. +*/ + unsigned matches_capacity; +}; + + +varying_matches::varying_matches() +{ + /* Note: this initial capacity is rather arbitrarily chosen to be large +* enough for many cases without wasting an unreasonable amount of space. +* varying_matches::record() will resize the array if there are more than +* this number of varyings. +*/ + this->matches_capacity = 8; + this->matches = (match *) + malloc(sizeof(*this->matches) * this->matches_capacity); + this->num_matches = 0; +} + + +varying_matches::~varying_matches() +{ + free(this->matches); +} + + +/** + * Record the given producer/consumer variable pair in the list of variables + * that should later be assigned locations. * - * It is permissible for \c input_var to be NULL (this happens if a variable - * is output by the producer and consumed by transform feedback, but not - * consumed by the consumer). + * It is permissible for \c consumer_var to be NULL (this happens if a + * variable is output by the producer and consumed by transform feedback, but + * not consumed by the consumer). * - * If the variable has already been assigned a location, this function has no - * effect. + * If \c producer_var has already been paired up with a consumer_var, or + * producer_var is part of fixed pipeline functionality (and hence already has + * a location assigned), this function has no effect. */ void -assign_varying_location(ir_variable *input_var, ir_variable *output_var, -unsigned *input_index, unsigned *output_index) +varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) { - if (!output_var->is_unmatched_generic_inout) { - /* Location already assigned. */ + if (!producer_var->is_unmatched_generic_inout) { + /* Either a location already exists for this variable (since it is part + * of fixed functionality), or it has already been recorded as part of a + * previous match. + */ return; } - if (input_var) { - assert(input_var->location == -1); - input_var->location = *input_index; - input_v
[Mesa-dev] [PATCH 07/10] glsl/linker: Sort varyings by packing class, then vector size.
This patch paves the way for varying packing by adding a sorting step before varying assignment, which sorts the varyings into an order that increases the likelihood of being able to find an efficient packing. First, varyings are sorted into "packing classes" by considering attributes that can't be mixed during varying packing--at the moment this includes base type (float/int/uint/bool) and interpolation mode (smooth/noperspective/flat/centroid), though later we will hopefully be able to relax some of these restrictions. The number of packing classes places an upper limit on the amount of space that must be wasted by varying packing, since in theory a shader might nave 4n+1 components worth of varyings in each of m packing classes, resulting in 3m components worth of wasted space. Then, within each packing class, varyings are sorted by vector size, with vec4's coming first, then vec2's, then scalars, and then finally vec3's. The motivation for this order is that it ensures that the only vectors that might be "double parked" (with part of the vector in one varying slot and the remainder in another) are vec3's. Note that the varyings aren't actually packed yet, merely placed in an order that will facilitate packing. --- src/glsl/linker.cpp | 104 1 file changed, 104 insertions(+) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 7fa980f..cee7386 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1975,11 +1975,41 @@ public: private: /** +* Enum representing the order in which varyings are packed within a +* packing class. +* +* Currently we pack vec4's first, then vec2's, then scalar values, then +* vec3's. This order ensures that the only vectors that are at risk of +* having to be "double parked" (split between two adjacent varying slots) +* are the vec3's. +*/ + enum packing_order_enum { + PACKING_ORDER_VEC4, + PACKING_ORDER_VEC2, + PACKING_ORDER_SCALAR, + PACKING_ORDER_VEC3, + }; + + static unsigned compute_packing_class(ir_variable *var); + static packing_order_enum compute_packing_order(ir_variable *var); + static int match_comparator(const void *x_generic, const void *y_generic); + + /** * Structure recording the relationship between a single producer output * and a single consumer input. */ struct match { /** + * Packing class for this varying, computed by compute_packing_class(). + */ + unsigned packing_class; + + /** + * Packing order for this varying, computed by compute_packing_order(). + */ + packing_order_enum packing_order; + + /** * The output variable in the producer stage. */ ir_variable *producer_var; @@ -2061,6 +2091,10 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) realloc(this->matches, sizeof(*this->matches) * this->matches_capacity); } + this->matches[this->num_matches].packing_class + = this->compute_packing_class(producer_var); + this->matches[this->num_matches].packing_order + = this->compute_packing_order(producer_var); this->matches[this->num_matches].producer_var = producer_var; this->matches[this->num_matches].consumer_var = consumer_var; this->num_matches++; @@ -2077,6 +2111,10 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) unsigned varying_matches::assign_locations() { + /* Sort varying matches into an order that makes them easy to pack. */ + qsort(this->matches, this->num_matches, sizeof(*this->matches), + &varying_matches::match_comparator); + unsigned generic_location = 0; for (unsigned i = 0; i < this->num_matches; i++) { @@ -2127,6 +2165,72 @@ varying_matches::store_locations(unsigned producer_base, /** + * Compute the "packing class" of the given varying. This is an unsigned + * integer with the property that two variables in the same packing class can + * be safely backed into the same vec4. + */ +unsigned +varying_matches::compute_packing_class(ir_variable *var) +{ + /* In this initial implementation we conservatively assume that variables +* can only be packed if their base type (float/int/uint/bool) matches and +* their interpolation and centroid qualifiers match. +* +* TODO: relax these restrictions when the driver back-end permits. +*/ + unsigned packing_class = var->centroid ? 1 : 0; + packing_class *= 4; + packing_class += var->interpolation; + packing_class *= GLSL_TYPE_ERROR; + packing_class += var->type->get_scalar_type()->base_type; + return packing_class; +} + + +/** + * Compute the "packing order" of the given varying. This is a sort key we + * use to determine when to attempt to pack the given varying relative to + * other varyings in the same packing class. + */ +varying_matches::packing_order_enum +varying_matches::compu
[Mesa-dev] [PATCH 08/10] glsl: Add a lowering pass for packing varyings.
This lowering pass generates GLSL code that manually packs varyings into vec4 slots, for the benefit of back-ends that don't support packed varyings natively. No functional change--the lowering pass is not yet used. --- src/glsl/Makefile.sources | 1 + src/glsl/ir_optimization.h | 3 + src/glsl/lower_packed_varyings.cpp | 371 + 3 files changed, 375 insertions(+) create mode 100644 src/glsl/lower_packed_varyings.cpp diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 5e098fc..d984c5c 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -57,6 +57,7 @@ LIBGLSL_FILES = \ $(GLSL_SRCDIR)/lower_jumps.cpp \ $(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \ $(GLSL_SRCDIR)/lower_noise.cpp \ + $(GLSL_SRCDIR)/lower_packed_varyings.cpp \ $(GLSL_SRCDIR)/lower_texture_projection.cpp \ $(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \ $(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \ diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index 628096e..6b95191 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -75,6 +75,9 @@ bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz); bool lower_clip_distance(gl_shader *shader); void lower_output_reads(exec_list *instructions); void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions); +void lower_packed_varyings(void *mem_ctx, unsigned location_base, + unsigned locations_used, ir_variable_mode mode, + gl_shader *shader); bool optimize_redundant_jumps(exec_list *instructions); bool optimize_split_arrays(exec_list *instructions, bool linked); diff --git a/src/glsl/lower_packed_varyings.cpp b/src/glsl/lower_packed_varyings.cpp new file mode 100644 index 000..4cb9066 --- /dev/null +++ b/src/glsl/lower_packed_varyings.cpp @@ -0,0 +1,371 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file lower_varyings_to_packed.cpp + * + * This lowering pass generates GLSL code that manually packs varyings into + * vec4 slots, for the benefit of back-ends that don't support packed varyings + * natively. + * + * For example, the following shader: + * + * out mat3x2 foo; // location=4, location_frac=0 + * out vec3 bar[2]; // location=5, location_frac=2 + * + * main() + * { + * ... + * } + * + * Is rewritten to: + * + * mat3x2 foo; + * vec3 bar[2]; + * out vec4 packed4; // location=4, location_frac=0 + * out vec4 packed5; // location=5, location_frac=0 + * out vec4 packed6; // location=6, location_frac=0 + * + * main() + * { + * ... + * packed4.xy = foo[0]; + * packed4.zw = foo[1]; + * packed5.xy = foo[2]; + * packed5.zw = bar[0].xy; + * packed6.x = bar[0].z; + * packed6.yzw = bar[1]; + * } + * + * This lowering pass properly handles "double parking" of a varying vector + * across two varying slots. For example, in the code above, two of the + * components of bar[0] are stored in packed5, and the remaining component is + * stored in packed6. + * + * Note that in theory, the extra instructions may cause some loss of + * performance. However, hopefully in most cases the performance loss will + * either be absorbed by a later optimization pass, or it will be offset by + * memory bandwidth savings (because fewer varyings are used). + */ + +#include "glsl_symbol_table.h" +#include "ir.h" +#include "ir_optimization.h" + +/** + * Visitor that performs lowering packing. For each varying declared in the + * shader, this visitor determines whether it needs to be packed. If so, it + * demotes it to an ordinary global, creates new packed varyings, and + * generates assignments to convert between the original varying and the + * packed v
[Mesa-dev] [PATCH 09/10] glsl/linker: Pack within compound varyings.
This patch implements varying packing within varyings that are composed of multiple vectors of size less than 4 (e.g. arrays of vec2's, or matrices with height less than 4). Previously, such varyings used up a full 4-wide varying slot for each constituent vector, meaning that some of the components of each varying slot went unused. For example, a mat4x3 would be stored as follows:slots * * * * * * * * * * * * * * * * <-column1-> x <-column2-> x <-column3-> x <-column4-> x matrix (Each * represents a varying component, and the "x"s represent wasted space). In addition to wasting precious varying components, this layout complicated transform feedback, since the constituents of the varying are expected to be output to the transform feedback buffer contiguously (e.g. without gaps between the columns, in the case of a matrix). This change packs the constituents of each varying together so that all wasted space is at the end. For the mat4x3 example, this looks like so: slots * * * * * * * * * * * * * * * * <-column1-> <-column2-> <-column3-> <-column4-> x x x x matrix Note that matrix columns 2 and 3 now cross a boundary between varying slots (a characteristic I call "double parking" of a varying). We don't bother trying to eliminate the wasted space at the end of the varying, since the patch that follows will take care of that. Since compiler back-ends don't (yet) support this packed layout, the lower_packed_varyings function is used to rewrite the shader into a form where each varying occupies a full varying slot. Later, if we add native back-end support for varying packing, we can make this lowering pass optional. --- src/glsl/linker.cpp | 85 ++--- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index cee7386..d6f11a5 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1601,6 +1601,17 @@ private: int location; /** +* If non-zero, then this variable may be packed along with other variables +* into a single varying slot, so this offset should be applied when +* accessing components. For example, an offset of 1 means that the x +* component of this variable is actually stored in component y of the +* location specified by \c location. +* +* Only valid if location != -1. +*/ + unsigned location_frac; + + /** * If location != -1, the number of vector elements in this variable, or 1 * if this variable is a scalar. */ @@ -1739,6 +1750,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx, /* Array variable */ const unsigned matrix_cols = output_var->type->fields.array->matrix_columns; + const unsigned vector_elements = + output_var->type->fields.array->vector_elements; unsigned actual_array_size = this->is_clip_distance_mesa ? prog->Vert.ClipDistanceArraySize : output_var->type->array_size(); @@ -1754,16 +1767,22 @@ tfeedback_decl::assign_location(struct gl_context *ctx, if (this->is_clip_distance_mesa) { this->location = output_var->location + this->array_subscript / 4; +this->location_frac = this->array_subscript % 4; } else { -this->location = - output_var->location + this->array_subscript * matrix_cols; +unsigned fine_location + = output_var->location * 4 + output_var->location_frac; +unsigned array_elem_size = vector_elements * matrix_cols; +fine_location += array_elem_size * this->array_subscript; +this->location = fine_location / 4; +this->location_frac = fine_location % 4; } this->size = 1; } else { this->location = output_var->location; + this->location_frac = output_var->location_frac; this->size = actual_array_size; } - this->vector_elements = output_var->type->fields.array->vector_elements; + this->vector_elements = vector_elements; this->matrix_columns = matrix_cols; if (this->is_clip_distance_mesa) this->type = GL_FLOAT; @@ -1778,6 +1797,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx, return false; } this->location = output_var->location; + this->location_frac = output_var->location_frac; this->size = 1; this->vector_elements = output_var->type->vector_elements; this->matrix_columns = output_var->type->matrix_columns; @@ -1812,11 +1832,7 @@ tfeedback_decl::accumulate_num_outputs(struct gl_shader_program *prog) return 0; } - unsigned translated_size = this->size; - if (this->is_clip_distance_mesa) - tran
[Mesa-dev] [PATCH 10/10] glsl/linker: Pack between varyings.
This patch implements varying packing between varyings. Previously, each varying occupied components 0 through N-1 of its assigned varying slot, so there was no way to pack two varyings into the same slot. For example, if the varyings were a float, a vec2, a vec3, and another vec2, they would be stored as follows:slots * * * * * * * * * * * * * * * * flt x x xx x <--vec3---> xx x varyings (Each * represents a varying component, and the "x"s represent wasted space). This change packs the varyings together to eliminate wasted space between varyings, like so: slots * * * * * * * * * * * * * * * * flt <--vec3---> x x x x x x x x varyings Note that we take advantage of the sort order introduced in previous patches (vec4's first, then vec2's, then scalars, then vec3's) to minimize how often a varying is "double parked" (split across varying slots). --- src/glsl/linker.cpp | 49 + 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index d6f11a5..80aa260 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1996,6 +1996,7 @@ private: static unsigned compute_packing_class(ir_variable *var); static packing_order_enum compute_packing_order(ir_variable *var); + static unsigned compute_num_components(ir_variable *var); static int match_comparator(const void *x_generic, const void *y_generic); /** @@ -2012,6 +2013,7 @@ private: * Packing order for this varying, computed by compute_packing_order(). */ packing_order_enum packing_order; + unsigned num_components; /** * The output variable in the producer stage. @@ -2099,6 +2101,8 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) = this->compute_packing_class(producer_var); this->matches[this->num_matches].packing_order = this->compute_packing_order(producer_var); + this->matches[this->num_matches].num_components + = this->compute_num_components(producer_var); this->matches[this->num_matches].producer_var = producer_var; this->matches[this->num_matches].consumer_var = consumer_var; this->num_matches++; @@ -2122,20 +2126,19 @@ varying_matches::assign_locations() unsigned generic_location = 0; for (unsigned i = 0; i < this->num_matches; i++) { - this->matches[i].generic_location = generic_location; - - ir_variable *producer_var = this->matches[i].producer_var; - - if (producer_var->type->is_array()) { - const unsigned slots = producer_var->type->length -* producer_var->type->fields.array->matrix_columns; + /* Advance to the next slot if this varying has a different packing + * class than the previous one, and we're not already on a slot + * boundary. + */ + if (i > 0 && generic_location % 4 != 0 && + this->matches[i - 1].packing_class + != this->matches[i].packing_class) { + generic_location += 4 - generic_location % 4; + } - generic_location += 4 * slots; - } else { - const unsigned slots = producer_var->type->matrix_columns; + this->matches[i].generic_location = generic_location; - generic_location += 4 * slots; - } + generic_location += this->matches[i].num_components; } return (generic_location + 3) / 4; @@ -2219,6 +,28 @@ varying_matches::compute_packing_order(ir_variable *var) /** + * Compute the number of components that this variable will occupy when + * properly packed. + */ +unsigned +varying_matches::compute_num_components(ir_variable *var) +{ + const glsl_type *type = var->type; + unsigned multipiler = 1; + + if (type->is_array()) { + multipiler *= type->length; + type = type->fields.array; + } + + /* FINISHME: Support for "varying" records in GLSL 1.50. */ + assert(!type->is_record()); + + return multipiler * type->components(); +} + + +/** * Comparison function passed to qsort() to sort varyings by packing_class and * then by packing_order. */ -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 42516] Assertion `src.File != TGSI_FILE_NULL' failed with llvmpipe renderer
https://bugs.freedesktop.org/show_bug.cgi?id=42516 Brian Paul changed: What|Removed |Added Status|NEW |RESOLVED Resolution|--- |WORKSFORME --- Comment #4 from Brian Paul --- There hasn't been any follow-up about testing something newer than Mesa 7.11 and I don't see the problem with later versions. Closing. -- You are receiving this mail because: You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] mesa: Fix computation of default vertex attrib stride for 2_10_10_10 formats.
Previously, if the client program didn't specify a stride when setting up a vertex attribute, we used _mesa_sizeof_type() to compute the size of the type, and multiplied it by the number of components. This didn't work for the 2_10_10_10 formats, since _mesa_sizeof_type() returns -1 for those types, resulting in all kinds of havoc, since it was causing the hardware to be programmed with a negative stride value. This patch adds a new function _mesa_bytes_per_vertex_attrib(), which is similar to the existing function _mesa_bytes_per_pixel(), but which computes the size of a vertex attribute based on the type and the number of formats. For packed formats (currently only the 2_10_10_10 formats), it verifies that the number of components is correct and returns the size of the packed format. For unpacked formats, it returns the size of the type times the number of components. In addition, this patch adds an assertion so that if we ever forget to update _mesa_bytes_per_vertex_attrib() when adding a new vertex format, we'll see the problem quickly rather than having to debug a subtle conformance test failure. Fixes GLES3 conformance tests vertex_type_2_10_10_10_rev_{conversion,divisor,stride_pointer}.test. --- src/mesa/main/glformats.c | 42 ++ src/mesa/main/glformats.h | 3 +++ src/mesa/main/varray.c| 3 ++- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index fefa9c4..b6b16ca 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -308,6 +308,48 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type) /** + * Get the number of bytes for a vertex attrib with the given number of + * components ant type. + * + * \param comps number of components. + * \param type data type. + * + * \return bytes per attribute, or -1 if a bad comps/type combination was given. + */ +GLint +_mesa_bytes_per_vertex_attrib(GLint comps, GLenum type) +{ + switch (type) { + case GL_BYTE: + case GL_UNSIGNED_BYTE: + return comps * sizeof(GLubyte); + case GL_SHORT: + case GL_UNSIGNED_SHORT: + return comps * sizeof(GLshort); + case GL_INT: + case GL_UNSIGNED_INT: + return comps * sizeof(GLint); + case GL_FLOAT: + return comps * sizeof(GLfloat); + case GL_HALF_FLOAT_ARB: + return comps * sizeof(GLhalfARB); + case GL_DOUBLE: + return comps * sizeof(GLdouble); + case GL_FIXED: + return comps * sizeof(GLfixed); + case GL_INT_2_10_10_10_REV: + case GL_UNSIGNED_INT_2_10_10_10_REV: + if (comps == 4) + return sizeof(GLuint); + else + return -1; + default: + return -1; + } +} + + +/** * Test if the given format is an integer (non-normalized) format. */ GLboolean diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h index 5d09951..ccfb5e1 100644 --- a/src/mesa/main/glformats.h +++ b/src/mesa/main/glformats.h @@ -49,6 +49,9 @@ _mesa_components_in_format( GLenum format ); extern GLint _mesa_bytes_per_pixel( GLenum format, GLenum type ); +extern GLint +_mesa_bytes_per_vertex_attrib(GLint comps, GLenum type); + extern GLboolean _mesa_is_type_integer(GLenum type); diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c index f770143..5e4d6c3 100644 --- a/src/mesa/main/varray.c +++ b/src/mesa/main/varray.c @@ -251,7 +251,8 @@ update_array(struct gl_context *ctx, return; } - elementSize = _mesa_sizeof_type(type) * size; + elementSize = _mesa_bytes_per_vertex_attrib(size, type); + assert(elementSize != -1); array = &ctx->Array.ArrayObj->VertexAttrib[attrib]; array->Size = size; -- 1.8.0.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 00/10] glsl: Implement varying packing.
> For the initial implementation I've chosen a strategy that operates > exclusively at the GLSL IR level, so that it doesn't require the > cooperation of the driver back-ends. Wouldn't this negatively affect performance of some GPUs? Not sure if relevant for Mesa, but e.g. on PowerVR SGX it's really bad to pack two vec2 texture coordinates into a single vec4. That's because var.xy texture read can be "prefetched", whereas var.zw texture read is not prefetched (essentially treated as a dependent texture read), and often causes stalls in the shader execution. -- Aras Pranckevičius work: http://unity3d.com home: http://aras-p.info ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev