I am not too familiar with translate sse code, so someone else should probably look at it. The rest of the series looks good to me, a comment inline.
Am 13.04.2014 22:29, schrieb Andreas Hartmetz: > From: Andreas Hartmetz <andreas.hartm...@kdab.com> > > --- > src/gallium/auxiliary/translate/translate_sse.c | 35 > ++++++++++++++++++++----- > 1 file changed, 28 insertions(+), 7 deletions(-) > > diff --git a/src/gallium/auxiliary/translate/translate_sse.c > b/src/gallium/auxiliary/translate/translate_sse.c > index dace722..03d8276 100644 > --- a/src/gallium/auxiliary/translate/translate_sse.c > +++ b/src/gallium/auxiliary/translate/translate_sse.c > @@ -71,7 +71,7 @@ struct translate_group > > #define ELEMENT_BUFFER_INSTANCE_ID 1001 > > -#define NUM_CONSTS 7 > +#define NUM_CONSTS 8 > > enum > { > @@ -81,6 +81,7 @@ enum > CONST_INV_32767, > CONST_INV_65535, > CONST_INV_2147483647, > + CONST_INV_4294967295, > CONST_255 > }; > > @@ -92,6 +93,7 @@ static float consts[NUM_CONSTS][4] = { > C(1.0 / 32767.0), > C(1.0 / 65535.0), > C(1.0 / 2147483647.0), > + C(1.0 / 4294967295.0), > C(255.0) > }; > > @@ -515,6 +517,7 @@ translate_attr_convert(struct translate_sse *p, > || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT > || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { > struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); > + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); > > for (i = 0; i < output_desc->nr_channels; ++i) { > if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0 > @@ -546,17 +549,38 @@ translate_attr_convert(struct translate_sse *p, > */ > sse2_punpcklbw(p->func, dataXMM, get_const(p, > CONST_IDENTITY)); > sse2_punpcklbw(p->func, dataXMM, get_const(p, > CONST_IDENTITY)); > + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); > break; > case 16: > sse2_punpcklwd(p->func, dataXMM, get_const(p, > CONST_IDENTITY)); > + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); > break; > - case 32: /* we lose precision here */ > + case 32: /* we lose precision if value > 2^23 - 1 */ > + /* ...but try to keep all bits for value <= 2^23 - 1 */ > + /* this could be done much smarter for 1 or 2 dwords of input > + * on X64, using sth like "cvtsi2ss xmm0, rax" (note *R*ax) */ > + > + /* save the low bit */ > + sse_movss(p->func, dataXMM2, dataXMM); I don't think that's right, this code can handle multiple channels, i.e. true vectors hence you need sse2_movdqa (or sse_movaps but the former should cause less domain transitions). > + /* right shift & convert, losing the low bit - must clear > + * high bit because there is no unsigned convert instruction > */ > sse2_psrld_imm(p->func, dataXMM, 1); > + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); > + > + /* convert low bit to float */ > + sse2_pslld_imm(p->func, dataXMM2, 31); > + sse2_psrld_imm(p->func, dataXMM2, 31); > + sse2_cvtdq2ps(p->func, dataXMM2, dataXMM2); Is this really ideal, wouldn't something like (in horrible pseudo-code notation) dataXMM2 = dataXMM2 & CONST(0x1 vec) dataXMM2 = cvtdq2ps(dataXMM2) be faster? I guess though your method avoids the constant, so probably not worth bothering (I am actually wondering what code llvm generates for its UIToFP instruction or what in general the fastest way to do this is). Roland > + > + /* mostly undo the right-shift by 1 */ > + sse_addps(p->func, dataXMM, dataXMM); > + /* add back the lost low bit */ > + sse_addps(p->func, dataXMM, dataXMM2); > break; > default: > return FALSE; > } > - sse2_cvtdq2ps(p->func, dataXMM, dataXMM); > + > if (input_desc->channel[0].normalized) { > struct x86_reg factor; > switch (input_desc->channel[0].size) { > @@ -567,7 +591,7 @@ translate_attr_convert(struct translate_sse *p, > factor = get_const(p, CONST_INV_65535); > break; > case 32: > - factor = get_const(p, CONST_INV_2147483647); > + factor = get_const(p, CONST_INV_4294967295); > break; > default: > assert(0); > @@ -579,9 +603,6 @@ translate_attr_convert(struct translate_sse *p, > } > sse_mulps(p->func, dataXMM, factor); > } > - else if (input_desc->channel[0].size == 32) > - /* compensate for the bit we threw away to fit u32 into s32 */ > - sse_addps(p->func, dataXMM, dataXMM); > break; > case UTIL_FORMAT_TYPE_SIGNED: > if (!(x86_target_caps(p->func) & X86_SSE2)) > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev