I'm not sure it makes a lot of sense to be optimizing swrast at this stage. Take a look at llvmpipe and perhaps consider improving the multithreading that rasterizer already has in place; it is far better optimized than swrast.
Keith On Wed, 2011-08-10 at 08:07 +0000, Andreas Fänger wrote: > Optional parallel rendering of spans using OpenMP. > Initial implementation for aa triangles. A new option for scons is > also provided to activate the openmp support (off by default). > --- > common.py | 1 + > scons/gallium.py | 12 +++++++ > src/mesa/swrast/s_aatritemp.h | 68 ++++++++++++++++++++++----------------- > src/mesa/swrast/s_context.c | 26 ++++++++++++--- > src/mesa/swrast/s_texcombine.c | 4 ++ > src/mesa/tnl/t_pipeline.c | 12 +++++++ > 6 files changed, 87 insertions(+), 36 deletions(-) > > diff --git a/common.py b/common.py > index 8657030..cfee1b5 100644 > --- a/common.py > +++ b/common.py > @@ -88,6 +88,7 @@ def AddOptions(opts): > opts.Add('toolchain', 'compiler toolchain', default_toolchain) > opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', > 'no')) > opts.Add(BoolOption('llvm', 'use LLVM', default_llvm)) > + opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp > (swrast)', 'no')) > opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes')) > opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no')) > opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes')) > diff --git a/scons/gallium.py b/scons/gallium.py > index 8cd3bc7..7135251 100755 > --- a/scons/gallium.py > +++ b/scons/gallium.py > @@ -596,6 +596,18 @@ def generate(env): > libs += ['m', 'pthread', 'dl'] > env.Append(LIBS = libs) > > + # OpenMP > + if env['openmp']: > + if env['msvc']: > + env.Append(CCFLAGS = ['/openmp']) > + # When building openmp release VS2008 link.exe crashes with > LNK1103 error. 
> + # Workaround: overwrite PDB flags with empty value as it isn't > required anyways > + if env['build'] == 'release': > + env['PDB'] = '' > + if env['gcc']: > + env.Append(CCFLAGS = ['-fopenmp']) > + env.Append(LIBS = ['gomp']) > + > # Load tools > env.Tool('lex') > env.Tool('yacc') > diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h > index 91d4f7a..005d12c 100644 > --- a/src/mesa/swrast/s_aatritemp.h > +++ b/src/mesa/swrast/s_aatritemp.h > @@ -181,13 +181,18 @@ > const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS]; > const GLfloat dxdy = majDx / majDy; > const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F; > - GLfloat x = pMin[0] - (yMin - iyMin) * dxdy; > GLint iy; > - for (iy = iyMin; iy < iyMax; iy++, x += dxdy) { > + #pragma omp parallel for schedule(dynamic) private(iy) > firstprivate(span) > + for (iy = iyMin; iy < iyMax; iy++) { > + GLfloat x = pMin[0] - (yMin - iy) * dxdy; > GLint ix, startX = (GLint) (x - xAdj); > GLuint count; > GLfloat coverage = 0.0F; > > +#ifdef _OPENMP > + /* each thread needs to use a different (global) SpanArrays > variable */ > + span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num(); > +#endif > /* skip over fragments with zero coverage */ > while (startX < MAX_WIDTH) { > coverage = compute_coveragef(pMin, pMid, pMax, startX, iy); > @@ -228,13 +233,12 @@ > coverage = compute_coveragef(pMin, pMid, pMax, ix, iy); > } > > - if (ix <= startX) > - continue; > - > - span.x = startX; > - span.y = iy; > - span.end = (GLuint) ix - (GLuint) startX; > - _swrast_write_rgba_span(ctx, &span); > + if (ix > startX) { > + span.x = startX; > + span.y = iy; > + span.end = (GLuint) ix - (GLuint) startX; > + _swrast_write_rgba_span(ctx, &span); > + } > } > } > else { > @@ -244,13 +248,18 @@ > const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS]; > const GLfloat dxdy = majDx / majDy; > const GLfloat xAdj = dxdy > 0 ? 
dxdy : 0.0F; > - GLfloat x = pMin[0] - (yMin - iyMin) * dxdy; > GLint iy; > - for (iy = iyMin; iy < iyMax; iy++, x += dxdy) { > + #pragma omp parallel for schedule(dynamic) private(iy) > firstprivate(span) > + for (iy = iyMin; iy < iyMax; iy++) { > + GLfloat x = pMin[0] - (yMin - iy) * dxdy; > GLint ix, left, startX = (GLint) (x + xAdj); > GLuint count, n; > GLfloat coverage = 0.0F; > > +#ifdef _OPENMP > + /* each thread needs to use a different (global) SpanArrays > variable */ > + span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num(); > +#endif > /* make sure we're not past the window edge */ > if (startX >= ctx->DrawBuffer->_Xmax) { > startX = ctx->DrawBuffer->_Xmax - 1; > @@ -296,31 +305,30 @@ > ATTRIB_LOOP_END > #endif > > - if (startX <= ix) > - continue; > + if (startX > ix) { > + n = (GLuint) startX - (GLuint) ix; > > - n = (GLuint) startX - (GLuint) ix; > + left = ix + 1; > > - left = ix + 1; > - > - /* shift all values to the left */ > - /* XXX this is temporary */ > - { > - SWspanarrays *array = span.array; > - GLint j; > - for (j = 0; j < (GLint) n; j++) { > - array->coverage[j] = array->coverage[j + left]; > - COPY_CHAN4(array->rgba[j], array->rgba[j + left]); > + /* shift all values to the left */ > + /* XXX this is temporary */ > + { > + SWspanarrays *array = span.array; > + GLint j; > + for (j = 0; j < (GLint) n; j++) { > + array->coverage[j] = array->coverage[j + left]; > + COPY_CHAN4(array->rgba[j], array->rgba[j + left]); > #ifdef DO_Z > - array->z[j] = array->z[j + left]; > + array->z[j] = array->z[j + left]; > #endif > + } > } > - } > > - span.x = left; > - span.y = iy; > - span.end = n; > - _swrast_write_rgba_span(ctx, &span); > + span.x = left; > + span.y = iy; > + span.end = n; > + _swrast_write_rgba_span(ctx, &span); > + } > } > } > } > diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c > index def1531..4434f11 100644 > --- a/src/mesa/swrast/s_context.c > +++ b/src/mesa/swrast/s_context.c > @@ -772,6 
+772,11 @@ _swrast_CreateContext( struct gl_context *ctx ) > { > GLuint i; > SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext)); > +#ifdef _OPENMP > + const GLint maxThreads = omp_get_max_threads(); > +#else > + const GLint maxThreads = 1; > +#endif > > if (SWRAST_DEBUG) { > _mesa_debug(ctx, "_swrast_CreateContext\n"); > @@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx ) > for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++) > swrast->TextureSample[i] = NULL; > > - swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays); > + /* SpanArrays is global and shared by all SWspan instances. However, when > + * using multiple threads, it is necessary to have one SpanArrays instance > + * per thread. > + */ > + swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * > sizeof(SWspanarrays)); > if (!swrast->SpanArrays) { > FREE(swrast); > return GL_FALSE; > } > - swrast->SpanArrays->ChanType = CHAN_TYPE; > + for(i = 0; i < maxThreads; i++) { > + swrast->SpanArrays[i].ChanType = CHAN_TYPE; > #if CHAN_TYPE == GL_UNSIGNED_BYTE > - swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8; > + swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8; > #elif CHAN_TYPE == GL_UNSIGNED_SHORT > - swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16; > + swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16; > #else > - swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0]; > + swrast->SpanArrays[i].rgba = > swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0]; > #endif > + } > > /* init point span buffer */ > swrast->PointSpan.primitive = GL_POINT; > @@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx ) > swrast->PointSpan.facing = 0; > swrast->PointSpan.array = swrast->SpanArrays; > > - swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * > + /* TexelBuffer is also global and normally shared by all SWspan instances; > + * when running with multiple threads, create one per thread. 
> + */ > + swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits > * maxThreads * > MAX_WIDTH * 4 * sizeof(GLfloat)); > if (!swrast->TexelBuffer) { > FREE(swrast->SpanArrays); > diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c > index 086ed0b..80b9dff 100644 > --- a/src/mesa/swrast/s_texcombine.c > +++ b/src/mesa/swrast/s_texcombine.c > @@ -48,7 +48,11 @@ typedef float (*float4_array)[4]; > static INLINE float4_array > get_texel_array(SWcontext *swrast, GLuint unit) > { > +#ifdef _OPENMP > + return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * > omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num())); > +#else > return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4); > +#endif > } > > > diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c > index 18f095f..881d5d5 100644 > --- a/src/mesa/tnl/t_pipeline.c > +++ b/src/mesa/tnl/t_pipeline.c > @@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx ) > _tnl_notify_pipeline_output_change( ctx ); > } > > +#ifndef _OPENMP > + /* Don't adjust FPU precision mode in case multiple threads are to be > used. > + * This would require that the additional threads also changed the FPU > mode > + * which is quite a mess as this had to be done in all parallelized > sections; > + * otherwise the master thread and all other threads are running in > different > + * modes, producing inconsistent results. 
> + * Note that all x64 implementations don't define/use START_FAST_MATH, so > + * this "hack" is only used in i386 mode > + */ > START_FAST_MATH(__tmp); > +#endif > > for (i = 0; i < tnl->pipeline.nr_stages ; i++) { > struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; > @@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx ) > break; > } > > +#ifndef _OPENMP > END_FAST_MATH(__tmp); > +#endif > } > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev