[Mesa-dev] [PATCH] winsys/radeon: cleanup CS offloading
From: Christian König Using atomic function for ncs is superfluous since it is protected by a mutex anyway. Also lock the mutex only once while retrieving the next CS for submission. Signed-off-by: Christian König --- src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 31 --- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index 4f43093..f8aeb96 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -542,13 +542,12 @@ void radeon_drm_ws_queue_cs(struct radeon_drm_winsys *ws, struct radeon_drm_cs * { retry: pipe_mutex_lock(ws->cs_stack_lock); -if (p_atomic_read(&ws->ncs) >= RING_LAST) { +if (ws->ncs >= RING_LAST) { /* no room left for a flush */ pipe_mutex_unlock(ws->cs_stack_lock); goto retry; } -ws->cs_stack[p_atomic_read(&ws->ncs)] = cs; -p_atomic_inc(&ws->ncs); +ws->cs_stack[ws->ncs++] = cs; pipe_mutex_unlock(ws->cs_stack_lock); pipe_semaphore_signal(&ws->cs_queued); } @@ -557,41 +556,31 @@ static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param) { struct radeon_drm_winsys *ws = (struct radeon_drm_winsys *)param; struct radeon_drm_cs *cs; -unsigned i, empty_stack; +unsigned i; while (1) { pipe_semaphore_wait(&ws->cs_queued); if (ws->kill_thread) break; -next: + pipe_mutex_lock(ws->cs_stack_lock); cs = ws->cs_stack[0]; +for (i = 1; i < ws->ncs; i++) +ws->cs_stack[i - 1] = ws->cs_stack[i]; +ws->cs_stack[--ws->ncs] = NULL; pipe_mutex_unlock(ws->cs_stack_lock); if (cs) { radeon_drm_cs_emit_ioctl_oneshot(cs, cs->cst); - -pipe_mutex_lock(ws->cs_stack_lock); -for (i = 1; i < p_atomic_read(&ws->ncs); i++) { -ws->cs_stack[i - 1] = ws->cs_stack[i]; -} -ws->cs_stack[p_atomic_read(&ws->ncs) - 1] = NULL; -empty_stack = p_atomic_dec_zero(&ws->ncs); -pipe_mutex_unlock(ws->cs_stack_lock); - pipe_semaphore_signal(&cs->flush_completed); - -if (!empty_stack) { -goto next; -} } } pipe_mutex_lock(ws->cs_stack_lock); -for (i = 0; i < p_atomic_read(&ws->ncs); i++) { +for (i = 0; i < ws->ncs; i++) { pipe_semaphore_signal(&ws->cs_stack[i]->flush_completed); ws->cs_stack[i] = NULL; } -p_atomic_set(&ws->ncs, 0); +ws->ncs = 0; pipe_mutex_unlock(ws->cs_stack_lock); return NULL; } @@ -655,7 +644,7 @@ struct radeon_winsys *radeon_drm_winsys_create(int fd) pipe_mutex_init(ws->cmask_owner_mutex); pipe_mutex_init(ws->cs_stack_lock); -p_atomic_set(&ws->ncs, 0); +ws->ncs = 0; pipe_semaphore_init(&ws->cs_queued, 0); if (ws->num_cpus > 1 && debug_get_option_thread()) ws->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, ws); -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] mesa: remove remnants of GL_MESA_shader_debug
This extension never saw any real use so remove it. --- include/GL/gl.h | 20 src/mapi/glapi/gen/gl_API.xml | 32 2 files changed, 52 deletions(-) diff --git a/include/GL/gl.h b/include/GL/gl.h index babb746..968032c 100644 --- a/include/GL/gl.h +++ b/include/GL/gl.h @@ -2086,26 +2086,6 @@ typedef void (APIENTRYP PFNGLMULTITEXCOORD4SVARBPROC) (GLenum target, const GLsh -#if GL_ARB_shader_objects - -#ifndef GL_MESA_shader_debug -#define GL_MESA_shader_debug 1 - -#define GL_DEBUG_OBJECT_MESA 0x8759 -#define GL_DEBUG_PRINT_MESA 0x875A -#define GL_DEBUG_ASSERT_MESA 0x875B - -GLAPI GLhandleARB GLAPIENTRY glCreateDebugObjectMESA (void); -GLAPI void GLAPIENTRY glClearDebugLogMESA (GLhandleARB obj, GLenum logType, GLenum shaderType); -GLAPI void GLAPIENTRY glGetDebugLogMESA (GLhandleARB obj, GLenum logType, GLenum shaderType, GLsizei maxLength, - GLsizei *length, GLcharARB *debugLog); -GLAPI GLsizei GLAPIENTRY glGetDebugLogLengthMESA (GLhandleARB obj, GLenum logType, GLenum shaderType); - -#endif /* GL_MESA_shader_debug */ - -#endif /* GL_ARB_shader_objects */ - - /* * ???. GL_MESA_packed_depth_stencil * XXX obsolete diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index 48fce36..30ab9c9 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -13027,38 +13027,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] mesa: remove GL_MESA_program_debug bits from gl.h
The code for this was removed from Mesa some time ago. --- include/GL/gl.h | 21 - 1 file changed, 21 deletions(-) diff --git a/include/GL/gl.h b/include/GL/gl.h index 968032c..6b94e3f 100644 --- a/include/GL/gl.h +++ b/include/GL/gl.h @@ -2102,27 +2102,6 @@ typedef void (APIENTRYP PFNGLMULTITEXCOORD4SVARBPROC) (GLenum target, const GLsh #endif /* GL_MESA_packed_depth_stencil */ -#ifndef GL_MESA_program_debug -#define GL_MESA_program_debug 1 - -#define GL_FRAGMENT_PROGRAM_POSITION_MESA 0x8bb0 -#define GL_FRAGMENT_PROGRAM_CALLBACK_MESA 0x8bb1 -#define GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x8bb2 -#define GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x8bb3 -#define GL_VERTEX_PROGRAM_POSITION_MESA 0x8bb4 -#define GL_VERTEX_PROGRAM_CALLBACK_MESA 0x8bb5 -#define GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA0x8bb6 -#define GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA0x8bb7 - -typedef void (*GLprogramcallbackMESA)(GLenum target, GLvoid *data); - -GLAPI void GLAPIENTRY glProgramCallbackMESA(GLenum target, GLprogramcallbackMESA callback, GLvoid *data); - -GLAPI void GLAPIENTRY glGetProgramRegisterfvMESA(GLenum target, GLsizei len, const GLubyte *name, GLfloat *v); - -#endif /* GL_MESA_program_debug */ - - #ifndef GL_MESA_texture_array #define GL_MESA_texture_array 1 -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] mesa: fix a couple issues with U_FIXED, I_FIXED macros
Silence a bunch of MSVC type conversion warnings. Changed return type of S_FIXED to int32_t (signed). The result is the same. It just seems more intuitive that a signed conversion function should return a signed value. --- src/mesa/main/macros.h |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h index 880c656..379f756 100644 --- a/src/mesa/main/macros.h +++ b/src/mesa/main/macros.h @@ -193,7 +193,7 @@ static INLINE uint32_t U_FIXED(float value, uint32_t frac_bits) { value *= (1 << frac_bits); - return value < 0 ? 0 : value; + return value < 0.0f ? 0 : (uint32_t) value; } /** @@ -201,10 +201,10 @@ U_FIXED(float value, uint32_t frac_bits) * * \param frac_bits The number of bits used to store the fractional part. */ -static INLINE uint32_t +static INLINE int32_t S_FIXED(float value, uint32_t frac_bits) { - return value * (1 << frac_bits); + return (int32_t) (value * (1 << frac_bits)); } /*@}*/ -- 1.7.10.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/2] gallivm: implement seamless cube filtering
On 10/18/2013 01:47 PM, srol...@vmware.com wrote: From: Roland Scheidegger For seamless cube filtering it is necessary to determine new faces and new coords per sample. The logic for this is _seriously_ complex (what needs to happen is very "asymmetric" wrt face, x/y under/overflow), further complicated by the fact that if the 4 samples are in a corner (meaning we only have actually 3 samples, and all 3 are on different faces) then falling off the edge is happening _both_ on x and y axis simultaneously. There was a noticeable performance hit in mesa's cubemap demo when seamless filtering was forced on (just below 10 percent or so in a debug build, when disabling all filtering hacks, otherwise it would probably be a bit more) and when always doing the logic, hence use a branch which it only does it if any of the pixels in a quad (or in two quads) actually hit this. With that there was no measurable performance hit in the cubemap demo (neither in a debug nor release buidl), but this will vary (cubemap demo very rarely hits edges). Might also be different on other cpus, as this forces SoA sampling path which potentially can be quite a bit slower. Note that as for corners, this code gets all the 3 samples which actually exist right, and the 4th texel will simply be the same as one of the others, meaning that filter weights will be a bit wrong. This however should be enough for full OpenGL (but not d3d10) compliance. --- That's some pretty dense code. I doubt I could spot an error in it but it looks OK otherwise. For both: Reviewed-by: Brian Paul ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 5/8] i965/gs: Fix up gl_PointSize input swizzling for DUAL_INSTANCED gs.
On 18 October 2013 17:04, Eric Anholt wrote: > Paul Berry writes: > > > Geometry shaders that run in "DUAL_INSTANCED" mode store their inputs > > in vec4's. This means that when compiling gl_PointSize input > > swizzling (a MOV instruction which uses a geometry shader input as > > both source and destination), we need to do two things: > > > > - Set force_writemask_all to ensure that the MOV happens regardless of > > which channels are enabled. > > > > - Set the source register region to <4;4,1> (instead of <0;4,1> to > > satisfy register region restrictions. > > This sure sounds like something you empirically found, but I'm confused. > I would have assumed that DUAL_INSTANCED with an instance count of 1 > meant that the channel enables the hardware gave you had the first 4 > enabled and the second 4 disabled. And since the dst.width (and thus > execsize) is 4, whether or not the second 4 are disabled wouldn't > matter. In that case, why do you need the writemask forced, since just > the 4 channels you care about will be affected, anyway? > Setting force_writemask_all was not empirical--I did it out of general caution. I couldn't find any documentation in the bspec to guarantee that an instance count of 1 would get dispatched to the first 4 channels rather than the second 4, and I didn't want to rely on undocumented behaviour. > > And, if dst.width == 4, then execsize == 4, and I'm confused what > register region restriction is being honored by promoting the hstride to > 4. > This part was empirical. I discovered that if I don't set the hstride to 4, then I get an assertion failure here (in validate_reg() in brw_eu_emit.c): /* 4. */ if (execsize == width && hstride != 0) { assert(vstride == -1 || vstride == width * hstride); } I don't know why the hardware cares about this, but I double-checked the bspec and this restriction is really there. Side note: I forgot to mention it in the comments, but in addition to fixing up <0;4,1> -> <4;4,1>, this code fixes up <8;8,1> -> <4;4,1>. That's necessary for a stupid reason: in the vec4 back-end we represent GRFs as <8;8,1> so that guess_execution_size() will correctly guess an execution size of 8. However, in align16 mode, the hardware assumes a width of 4, so it really needs to be <4;4,1>. Normally, we fix that up in brw_set_src0() and brw_set_src1(), but we do it *after* the call to validate_reg(). So to avoid validate_reg() asserting on these width-4 instructions, we need to change the source register from <8;8,1> to <4;4,1> before emitting the instruction. One of these days, I swear I'm going to get rid of guess_execution_size() so we can end this sort of madness. > > Putting these fixups for a couple of weird cases in just MOV and ADD > feels wrong to me, but maybe when I understand better what's going on > it'll seem more natural. > Another possibility I'd be equally happy with would be to put the fixup at the top of vec4_generator::generate_vec4_instruction(), before the switch statement. It would look something like this: if (dst.width == BRW_WIDTH_4) { /* This happens in attribute fixups for "dual instanced" geometry * shaders, since they use attributes that are vec4's. Since the exec * width is only 4, it's essential that the caller set * force_writemask_all in order to make sure the instruction is executed * regardless of which channels are enabled. */ assert(inst->force_writemask_all); /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy * the following register region restrictions (from Graphics BSpec: * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions * > Register Region Restrictions) * * 1. ExecSize must be greater than or equal to Width. * * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set *to Width * HorzStride." */ for (int i = 0; i < 3; i++) { if (src[i].file == BRW_GENERAL_REGISTER_FILE) src[i] = stride(src[i], 4, 4, 1); } } Does that seem better to you? I actually think I like it slightly better because by making the assertion more general, I caught another case where I think I should be setting force_writemask_all to be on the safe side (the "clear r0.2" instruction in the gs prolog). ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 7/8] i965/gs: If a DUAL_OBJECT gs would spill, fall back to DUAL_INSTANCED.
On 18 October 2013 16:46, Eric Anholt wrote: > Paul Berry writes: > > Since most geometry shaders used in piglit testing are small, > > DUAL_INSTANCED mode won't get exercised very much in a normal piglit > > run. To force DUAL_INSTANCED mode to be used for all geometry > > shaders, set INTEL_DEBUG=nogualobj. > >^ nodualobj > Whoops. Fixed, thanks. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] glsl/linker: Allow mixing of desktop GLSL versions.
On 18 October 2013 11:07, Ian Romanick wrote: > On 10/17/2013 08:07 PM, Paul Berry wrote: > > Previously, Mesa followed the linkage rules outlined in the GLSL > > 1.20-1.40 specs, which (collectively) said that GLSL versions 1.10 and > > 1.20 could be linked together, but no other versions could be linked. > > > > In GLSL 4.30, the linkage rules were relaxed so that any two desktop > > GLSL versions can be linked together. This change was made because it > > reflected the behaviour of nearly all existing implementations (see > > Khronos bug 8463). Mesa was one of the few (perhaps the only) > > exceptions to prohibit cross-linking of some GLSL versions. > > > > Since the GLSL linkage rules were deliberately relaxed in order to > > match the behaviour of existing implementations, it seems appropriate > > to relax the rules in Mesa too (even though Mesa doesn't support GLSL > > 4.30 yet). > > > > Note that linking ES and desktop shaders is still prohibited, as is > > linking ES shaders having different GLSL versions. > > > > Fixes piglit tests "shaders/version-mixing {interstage,intrastage}". > > Are there any piglit tests that now fail? It seems like we may have had > one or two tests that verify the (old) spec behavior... > I would have thought so, but I just did a full piglit run and didn't get any regressions, so I guess not. > > You should ping the reporter of this bug: > > https://bugs.freedesktop.org/show_bug.cgi?id=70261 > > This should fix his problem as well. > Good call. I'll do that once I've landed the patch. > > > --- > > src/glsl/linker.cpp | 10 +++--- > > 1 file changed, 3 insertions(+), 7 deletions(-) > > > > diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp > > index 9095a40..0a949b4 100644 > > --- a/src/glsl/linker.cpp > > +++ b/src/glsl/linker.cpp > > @@ -2057,14 +2057,10 @@ link_shaders(struct gl_context *ctx, struct > gl_shader_program *prog) > >} > > } > > > > - /* Previous to GLSL version 1.30, different compilation units could > mix and > > -* match shading language versions. With GLSL 1.30 and later, the > versions > > -* of all shaders must match. > > -* > > -* GLSL ES has never allowed mixing of shading language versions. > > + /* In desktop GLSL, different shader versions may be linked > together. In > > +* GLSL ES, all shader versions must be the same. > > */ > > - if ((is_es_prog || max_version >= 130) > > - && min_version != max_version) { > > + if (is_es_prog && min_version != max_version) { > >linker_error(prog, "all shaders must use same shading " > > "language version\n"); > >goto done; > > > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] Remove error when calling glGenQueries/glDeleteQueries while a query is active
On 10/17/2013 02:35 PM, Carl Worth wrote: There is nothing in the OpenGL specification which prevents the user from calling glGenQueries to generate a new query object while another object is active. Neither is there anything in the Mesa implementation which prevents this. So remove the INVALID_OPERATION errors in this case. Similarly, it is explicitly allowed by the OpenGL specification to delete an active query, so remove the assertion for that case and be sure to call the driver's EndQuery hook. CC: --- Brian Paul writes: On 10/17/2013 12:14 PM, Carl Worth wrote: But from http://www.opengl.org/registry/specs/ARB/occlusion_query.txt: """ Calling either GenQueriesARB or DeleteQueriesARB while any query of any target is active causes an INVALID_OPERATION error to be generated. """ (it's about half-way down in the file) It's also mentioned in the "Errors" section. Thanks, Brian. That certainly does justify where the original code came from. Maybe that was rescinded since that spec was done. If so, I'm fine with removing the code. I can't find any similar error requirement in the OpenGL 4.4 (Core) specification. (And with the increased number of different query types, it doesn't seem that the error requirement makes sense.) I did find the following sentence in the specification (section 4.2): If an active query object is deleted its name immediately becomes unused, but the underlying object is not deleted until it is no longer active. This sentence presumes the possibility of deleting an active query, so I think it is reasonable to remove the error. However, I wouldn't be surprised if our drivers crashed and burned if an active query is deleted. gl_query_object isn't referenced counted. Thanks for the catch. My revised patch below calls the driver's EndQuery hook, which will hopefully avoid this problem. This does inactivate the query immediately at delete time, which might seem inconsistent with the sentence I quoted from the specification above. But I think this is fine since once the object is deleted the user has no visibility into whether the object is active or not. -Carl src/mesa/main/queryobj.c | 19 --- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c index a180133..ebdb71c 100644 --- a/src/mesa/main/queryobj.c +++ b/src/mesa/main/queryobj.c @@ -202,13 +202,6 @@ _mesa_GenQueries(GLsizei n, GLuint *ids) return; } - /* No query objects can be active at this time! */ - if (ctx->Query.CurrentOcclusionObject || - ctx->Query.CurrentTimerObject) { - _mesa_error(ctx, GL_INVALID_OPERATION, "glGenQueriesARB"); - return; - } - first = _mesa_HashFindFreeKeyBlock(ctx->Query.QueryObjects, n); if (first) { GLsizei i; @@ -241,18 +234,14 @@ _mesa_DeleteQueries(GLsizei n, const GLuint *ids) return; } - /* No query objects can be active at this time! */ - if (ctx->Query.CurrentOcclusionObject || - ctx->Query.CurrentTimerObject) { - _mesa_error(ctx, GL_INVALID_OPERATION, "glDeleteQueriesARB"); - return; - } - for (i = 0; i < n; i++) { if (ids[i] > 0) { struct gl_query_object *q = _mesa_lookup_query_object(ctx, ids[i]); if (q) { -ASSERT(!q->Active); /* should be caught earlier */ +if (q->Active) { + q->Active = GL_FALSE; + ctx->Driver.EndQuery(ctx, q); Valgrind found an invalid pointer (and crashed!) when I modified your piglit test (see other msg). We also need to make sure that the ctx->Query.CurrentFoo binding point is cleared. Something like this: if (q->Active) { struct gl_query_object **bindpt = get_query_binding_point(ctx, q->Target); assert(bindpt); /* _should_ be non-null if q is active */ if (bindpt) { *bindpt = NULL; } ... +} _mesa_HashRemove(ctx->Query.QueryObjects, ids[i]); ctx->Driver.DeleteQuery(ctx, q); } ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] r600g/sb: Initialize shader::dce_flags.
On 10/19/2013 06:18 AM, Vinson Lee wrote: Fixes "Uninitialized scalar field" defect reported by Coverity. Signed-off-by: Vinson Lee Reviewed-by: Vadim Girlin --- src/gallium/drivers/r600/sb/sb_shader.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp index 98e52b1..38617a8 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.cpp +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -39,7 +39,8 @@ shader::shader(sb_context &sctx, shader_target t, unsigned id) coal(*this), bbs(), target(t), vt(ex), ex(*this), root(), compute_interferences(), - has_alu_predication(), uses_gradients(), safe_math(), ngpr(), nstack() {} + has_alu_predication(), + uses_gradients(), safe_math(), ngpr(), nstack(), dce_flags() {} bool shader::assign_slot(alu_node* n, alu_node *slots[5]) { ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 5/8] i965: Implement FS backend for ARB_sample_shading
On 18 October 2013 10:30, Anuj Phogat wrote: > I know we can specify stride if we have a brw_reg :- > fs_reg (stride(brw_vec1_grf(0, 0), 2, 4, 0)); > > But I could not find a reliable way to use stride if we have a fs_reg. > That's why I used OR to get the desired result. > The right way to do this, IMHO, is to create a special back-end instruction opcode to do the work. That way you can put the call to stride() in fs_generator, and it will happen after registers have been assigned and everything has been turned into brw_reg's. See for example what happens in FS_OPCODE_SET_SIMD4X2_OFFSET, which does a similar thing except that instead of changing the stride of the register it changes its width. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 5/8] i965: Implement FS backend for ARB_sample_shading
On 14 October 2013 10:12, Anuj Phogat wrote: > Implement the FS backend for new builtins added by the extension: > in vec2 gl_SamplePosition > in int gl_SampleID > in int gl_NumSamples > out int gl_SampleMask[] > There is a lot going on in this one patch, and it's getting hard to follow all the patch review that's going on. If you wind up re-spinning this patch series, can you please break it into four separate patches, one to add support for each builtin? > > Signed-off-by: Anuj Phogat > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 109 > +++ > src/mesa/drivers/dri/i965/brw_fs.h | 4 + > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 23 ++ > src/mesa/drivers/dri/i965/brw_wm.h | 1 + > 4 files changed, 137 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index e5d6e4b..e4f7745 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -1115,6 +1115,109 @@ > fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) > return reg; > } > > +void > +fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) > +{ > + int num_samples = ctx->DrawBuffer->Visual.samples; > This isn't safe. When compilation depends on a piece of GL state, you need to include the state in brw_wm_prog_key. Otherwise the program won't get recompiled when the value changes. To avoid unnecessary recompiles, I'd recommend adding a boolean to brw_wm_prog_key which is: - true if ctx->Multisample.Enabled && num_samples != 0 && (shader accesses gl_SamplePosition) - false otherwise > + assert(num_samples >= 0 && num_samples <= 8); > + > + /* From arb_sample_shading specification: > +* "When rendering to a non-multisample buffer, or if multisample > +* rasterization is disabled, gl_SamplePosition will always be > +* (0.5, 0.5). > +*/ > + if (!ctx->Multisample.Enabled || num_samples == 0) { > + emit(BRW_OPCODE_MOV, dst, fs_reg(0.5f)); > It looks like you're using the old, more verbose style of emitting instructions. Can we convert this (and the other instructions in this patch) to the more compact style: emit(MOV(dst, fs_reg(0.5f))); > + } > + else { > + /* For num_samples = {4, 8} */ > + emit(BRW_OPCODE_MOV, dst, int_sample_pos); > + emit(BRW_OPCODE_MUL, dst, dst, fs_reg(1 / 16.0f)); > Like Ken, I was confused as to why we needed a separate MOV before the MUL. The assertion Ken recommended would have been a useful clue, but I'd prefer to just have explicit comments explaining why we need the MOV. How about this: /* Convert int_sample_pos to floating point */ emit(BRW_OPCODE_MOV, dst, int_sample_pos); /* Scale to the range [0, 1] */ emit(BRW_OPCODE_MUL, dst, dst, fs_reg(1 / 16.0f)); > + } > +} > + > +fs_reg * > +fs_visitor::emit_samplepos_interpolation(ir_variable *ir) > +{ > + assert(brw->gen >= 6); > + > + this->current_annotation = "compute sample position"; > + fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); > Since this code assigns to two consecutive registers, it relies on the fact that ir->type is vec2. Just to make that explicit, can we add: assert(ir->type == glsl_type::vec2_type); > + fs_reg pos = *reg; > + fs_reg int_sample_x = fs_reg(this, glsl_type::int_type); > + fs_reg int_sample_y = fs_reg(this, glsl_type::int_type); > + > + /* WM will be run in MSDISPMODE_PERSAMPLE. So, only SIMD8 mode will be > +* enabled. The X, Y sample positions come in as bytes in thread > payload. > +* Sample IDs and sample positions remain same for all four slots in a > +* subspan. So, read the positions using vstride=2, width=4, hstride=0. > I have similar concerns to Ken about why MSDISPMODE_PERSAMPLE implies that only SIMD8 mode will be enabled. I'm assuming the two of you have adequately resolved that. > +*/ > + emit(BRW_OPCODE_AND, int_sample_x, > +fs_reg(stride(retype(brw_vec1_grf(c->sample_pos_reg, 0), > + BRW_REGISTER_TYPE_D), 2, 4, 0)), > +fs_reg(brw_imm_d(0xff))); > If I understand correctly, this is creating the instruction AND(8) int_sample_x<1>:d sample_pos_reg<2;4,0>:d 0x00ff:d I think this works, but it would be more straightforward to do this, which just selects the X coordinate bytes from the sample_pos_reg register: MOV(8) int_sample_x<1>:d sample_pos_reg<16;8,2>:b That would have the advantage that it doesn't rely on the fact that sample IDs and sample positions are the same for all four slots in a subspan. (Note: there are some weird restrictions on byte operations, and I can't remember all the details. So you might need to try this in the simulator and tweak it to get it to work.) > + > + /* Compute gl_SamplePosition.x */ > + compute_sample_position(pos, int_sample_x); > + pos.reg_offset++; > I think if we change this to: pos.reg_offset += dispatch_width / 8; t
[Mesa-dev] [PATCH] radeonsi: Optimize out exports to unbound color buffers
This patch identifies shader exports to unbound CBs and removes them during TGSI to LLVM IR lowering. The method is identical to the one used in the gallium/r600 driver. The GLSL lower_output_reads pass generates temporary copies for writes to shader outputs. In the case of gl_FragData, this results in writes to every MRT when one or more elements are written in the shader. When these MRTs are unbound and masked out there is still a performance loss equivalent to exports to bound, unmasked MRTs on SI. Signed-off-by: Jay Cornwall --- src/gallium/drivers/radeonsi/radeonsi_pipe.h | 1 + src/gallium/drivers/radeonsi/radeonsi_shader.c | 6 ++ src/gallium/drivers/radeonsi/si_state.c| 15 +-- src/gallium/drivers/radeonsi/si_state.h| 1 + 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h index 26f7e09..bede043 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h @@ -148,6 +148,7 @@ struct r600_context { unsignedfb_compressed_cb_mask; unsignedpa_sc_line_stipple; unsignedpa_su_sc_mode_cntl; + boolean dual_src_blend; /* for saving when using blitter */ struct pipe_stencil_ref stencil_ref; struct si_pipe_shader_selector *ps_shader; diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index 80ee325..e4a9f56 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -995,6 +995,12 @@ handle_semantic: semantic_name); } + /* Shader is keyed on nr_cbufs, optimize out exports to unbound CBs */ + if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT && + semantic_name == TGSI_SEMANTIC_COLOR && + color_count > si_shader_ctx->shader->key.ps.nr_cbufs) + continue; + si_llvm_init_export_args(bld_base, d, index, target, args); if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index da7c3d0..8109bde 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -27,6 +27,7 @@ #include "util/u_memory.h" #include "util/u_framebuffer.h" #include "util/u_blitter.h" +#include "util/u_dual_blend.h" #include "util/u_helpers.h" #include "util/u_math.h" #include "util/u_pack_color.h" @@ -168,6 +169,8 @@ static void si_update_fb_blend_state(struct r600_context *rctx) si_pm4_set_reg(pm4, R_028238_CB_TARGET_MASK, mask); si_pm4_set_state(rctx, fb_blend, pm4); + + rctx->dual_src_blend = blend->dual_src_blend; } /* @@ -309,6 +312,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); } + /* only MRT0 has dual src blend */ + blend->dual_src_blend = util_blend_state_is_dual(state, 0); + return blend; } @@ -2097,8 +2103,13 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx, if (rctx->queued.named.rasterizer->clip_plane_enable & 0xf) key->vs.ucps_enabled |= 0x1; } else if (sel->type == PIPE_SHADER_FRAGMENT) { - if (sel->fs_write_all) - key->ps.nr_cbufs = rctx->framebuffer.nr_cbufs; + /* Key on nr_cbufs to optimize unused EXPORTs. */ + key->ps.nr_cbufs = rctx->framebuffer.nr_cbufs; + + /* Dual-source blending only makes sense with nr_cbufs == 1. */ + if (key->ps.nr_cbufs == 1 && rctx->dual_src_blend) + key->ps.nr_cbufs = 2; + key->ps.export_16bpc = rctx->export_16bpc; if (rctx->queued.named.rasterizer) { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6dbf880..ca51496 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -34,6 +34,7 @@ struct si_state_blend { struct si_pm4_state pm4; uint32_tcb_target_mask; boolalpha_to_one; + booldual_src_blend; }; struct si_state_viewport { -- 1.8.4.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] radeonsi: Optimize out exports to unbound color buffers
As per the discussion on IRC, this is trying to fix an issue with a GLSL compiler pass lower_output_reads that always generates 8 output writes for any shader which writes gl_FragData. I'll fix this in the GLSL compiler. NAK. Marek On Sun, Oct 20, 2013 at 2:33 AM, Jay Cornwall wrote: > This patch identifies shader exports to unbound CBs and removes them during > TGSI to LLVM IR lowering. The method is identical to the one used in the > gallium/r600 driver. > > The GLSL lower_output_reads pass generates temporary copies for writes to > shader outputs. In the case of gl_FragData, this results in writes to every > MRT when one or more elements are written in the shader. When these MRTs > are unbound and masked out there is still a performance loss equivalent to > exports to bound, unmasked MRTs on SI. > > Signed-off-by: Jay Cornwall > --- > src/gallium/drivers/radeonsi/radeonsi_pipe.h | 1 + > src/gallium/drivers/radeonsi/radeonsi_shader.c | 6 ++ > src/gallium/drivers/radeonsi/si_state.c| 15 +-- > src/gallium/drivers/radeonsi/si_state.h| 1 + > 4 files changed, 21 insertions(+), 2 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h > b/src/gallium/drivers/radeonsi/radeonsi_pipe.h > index 26f7e09..bede043 100644 > --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h > +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h > @@ -148,6 +148,7 @@ struct r600_context { > unsignedfb_compressed_cb_mask; > unsignedpa_sc_line_stipple; > unsignedpa_su_sc_mode_cntl; > + boolean dual_src_blend; > /* for saving when using blitter */ > struct pipe_stencil_ref stencil_ref; > struct si_pipe_shader_selector *ps_shader; > diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c > b/src/gallium/drivers/radeonsi/radeonsi_shader.c > index 80ee325..e4a9f56 100644 > --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c > +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c > @@ -995,6 +995,12 @@ handle_semantic: > semantic_name); > } > > + /* Shader is keyed on nr_cbufs, optimize out exports > to unbound CBs */ > + if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT && > + semantic_name == TGSI_SEMANTIC_COLOR && > + color_count > > si_shader_ctx->shader->key.ps.nr_cbufs) > + continue; > + > si_llvm_init_export_args(bld_base, d, index, target, > args); > > if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && > diff --git a/src/gallium/drivers/radeonsi/si_state.c > b/src/gallium/drivers/radeonsi/si_state.c > index da7c3d0..8109bde 100644 > --- a/src/gallium/drivers/radeonsi/si_state.c > +++ b/src/gallium/drivers/radeonsi/si_state.c > @@ -27,6 +27,7 @@ > #include "util/u_memory.h" > #include "util/u_framebuffer.h" > #include "util/u_blitter.h" > +#include "util/u_dual_blend.h" > #include "util/u_helpers.h" > #include "util/u_math.h" > #include "util/u_pack_color.h" > @@ -168,6 +169,8 @@ static void si_update_fb_blend_state(struct r600_context > *rctx) > si_pm4_set_reg(pm4, R_028238_CB_TARGET_MASK, mask); > > si_pm4_set_state(rctx, fb_blend, pm4); > + > + rctx->dual_src_blend = blend->dual_src_blend; > } > > /* > @@ -309,6 +312,9 @@ static void *si_create_blend_state_mode(struct > pipe_context *ctx, > si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, > blend_cntl); > } > > + /* only MRT0 has dual src blend */ > + blend->dual_src_blend = util_blend_state_is_dual(state, 0); > + > return blend; > } > > @@ -2097,8 +2103,13 @@ static INLINE void si_shader_selector_key(struct > pipe_context *ctx, > if (rctx->queued.named.rasterizer->clip_plane_enable & 0xf) > key->vs.ucps_enabled |= 0x1; > } else if (sel->type == PIPE_SHADER_FRAGMENT) { > - if (sel->fs_write_all) > - key->ps.nr_cbufs = rctx->framebuffer.nr_cbufs; > + /* Key on nr_cbufs to optimize unused EXPORTs. */ > + key->ps.nr_cbufs = rctx->framebuffer.nr_cbufs; > + > + /* Dual-source blending only makes sense with nr_cbufs == 1. > */ > + if (key->ps.nr_cbufs == 1 && rctx->dual_src_blend) > + key->ps.nr_cbufs = 2; > + > key->ps.export_16bpc = rctx->export_16bpc; > > if (rctx->queued.named.rasterizer) { > diff --git a/src/gallium/drivers/radeonsi/si_state.h > b/src/gallium/drivers/radeonsi/si_state.h > index 6dbf880..ca51496 100644 > --- a/src/gallium/drivers/radeonsi/si_state.h > +++ b/src/gallium/drivers/radeonsi/si_state.h > @@ -34,6 +34,7 @@ struct si_st
[Mesa-dev] [PATCH] i965: Only emit interpolation setup if there are actual FS inputs.
Dead code elimination would get rid of the extra instructions, but skipping this saves iterations through the optimization loop. >From shader-db: N Min MaxMedian AvgStddev x 14672 3 16 3 3.13345150.59904168 + 14672 1 16 3 2.89551530.77732963 Difference at 95.0% confidence -0.237936 +/- 0.0158798 -7.59342% +/- 0.506783% (Student's t, pooled s = 0.693935) Embarassingly, the classic shadow mapping shader: void main() { } used to require three iterations through the optimization loop. With this patch, it only requires one (which makes no progress). Signed-off-by: Kenneth Graunke Cc: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs.cpp | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 65a4b66..a3268fb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3025,10 +3025,12 @@ fs_visitor::run() emit_shader_time_begin(); calculate_urb_setup(); - if (brw->gen < 6) -emit_interpolation_setup_gen4(); - else -emit_interpolation_setup_gen6(); + if (fp->Base.InputsRead > 0) { + if (brw->gen < 6) +emit_interpolation_setup_gen4(); + else +emit_interpolation_setup_gen6(); + } /* We handle discards by keeping track of the still-live pixels in f0.1. * Initialize it with the dispatched pixels. -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev