On 26.04.2014 01:45, Ilia Mirkin wrote:
> On Fri, Apr 25, 2014 at 5:44 PM, Roland Scheidegger <srol...@vmware.com> wrote:
>> On 25.04.2014 19:41, Ilia Mirkin wrote:
>>> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>
>>> ---
>>>  src/gallium/auxiliary/tgsi/tgsi_exec.c | 188 +++++++++++++++++++++++++++++++++
>>>  src/gallium/auxiliary/util/u_math.h    |  11 ++
>>>  2 files changed, 199 insertions(+)
>>>
>>> diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
>>> index 55da60a..2cc7884 100644
>>> --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
>>> +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
>>> @@ -2603,6 +2603,40 @@ exec_vector_trinary(struct tgsi_exec_machine *mach,
>>>     }
>>>  }
>>>
>>> +typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
>>> +                                     const union tgsi_exec_channel *src0,
>>> +                                     const union tgsi_exec_channel *src1,
>>> +                                     const union tgsi_exec_channel *src2,
>>> +                                     const union tgsi_exec_channel *src3);
>>> +
>>> +static void
>>> +exec_vector_quaternary(struct tgsi_exec_machine *mach,
>>> +                       const struct tgsi_full_instruction *inst,
>>> +                       micro_quaternary_op op,
>>> +                       enum tgsi_exec_datatype dst_datatype,
>>> +                       enum tgsi_exec_datatype src_datatype)
>>> +{
>>> +   unsigned int chan;
>>> +   struct tgsi_exec_vector dst;
>>> +
>>> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
>>> +      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
>>> +         union tgsi_exec_channel src[4];
>>> +
>>> +         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
>>> +         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
>>> +         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
>>> +         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
>>> +         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
>>> +      }
>>> +   }
>>> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
>>> +      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
>>> +         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
>>> +      }
>>> +   }
>>> +}
>>> +
>>>  static void
>>>  exec_dp3(struct tgsi_exec_machine *mach,
>>>           const struct tgsi_full_instruction *inst)
>>> @@ -3571,6 +3605,135 @@ micro_ucmp(union tgsi_exec_channel *dst,
>>>  }
>>>
>>>  static void
>>> +micro_ibfe(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src0,
>>> +           const union tgsi_exec_channel *src1,
>>> +           const union tgsi_exec_channel *src2)
>>> +{
>>> +   int i;
>>> +   for (i = 0; i < 4; i++) {
>>> +      int width = src2->i[i] & 0x1f;
>>> +      int offset = src1->i[i] & 0x1f;
>>> +      if (width == 0)
>>> +         dst->i[i] = 0;
>>> +      else if (width + offset < 32)
>>> +         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
>>> +      else
>>> +         dst->i[i] = src0->i[i] >> offset;
>>> +   }
>>> +}
>>> +
>>> +static void
>>> +micro_ubfe(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src0,
>>> +           const union tgsi_exec_channel *src1,
>>> +           const union tgsi_exec_channel *src2)
>>> +{
>>> +   int i;
>>> +   for (i = 0; i < 4; i++) {
>>> +      int width = src2->u[i] & 0x1f;
>>> +      int offset = src1->u[i] & 0x1f;
>>> +      if (width == 0)
>>> +         dst->u[i] = 0;
>>> +      else if (width + offset < 32)
>>> +         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
>>> +      else
>>> +         dst->u[i] = src0->u[i] >> offset;
>>> +   }
>>> +}
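A side note, since the corner cases in these helpers are subtle: here is a
quick standalone sanity check of the extract logic, with a hypothetical
ibfe() that just mirrors the per-channel math above. Like the patch itself,
it assumes arithmetic right shift of signed int:

#include <assert.h>
#include <stdint.h>

static int32_t ibfe(int32_t value, int offset, int width)
{
   width &= 0x1f;
   offset &= 0x1f;
   if (width == 0)
      return 0;                 /* empty field */
   if (width + offset < 32)     /* shift the field to the top, then back down */
      return (value << (32 - width - offset)) >> (32 - width);
   return value >> offset;      /* field reaches bit 31 */
}

int main(void)
{
   assert(ibfe(0x000000f0, 4, 4) == -1);   /* extracted 0xf sign-extends */
   assert(ibfe(0x000000f0, 4, 5) == 0xf);  /* a leading 0 keeps it positive */
   assert(ibfe(0x12345678, 5, 0) == 0);    /* width 0 always yields 0 */
   assert(ibfe(-1, 16, 16) == -1);         /* the width + offset == 32 path */
   return 0;
}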
>>> +
>>> +static void
>>> +micro_bfi(union tgsi_exec_channel *dst,
>>> +          const union tgsi_exec_channel *src0,
>>> +          const union tgsi_exec_channel *src1,
>>> +          const union tgsi_exec_channel *src2,
>>> +          const union tgsi_exec_channel *src3)
>>> +{
>>> +   int i;
>>> +   for (i = 0; i < 4; i++) {
>>> +      int width = src3->u[i] & 0x1f;
>>> +      int offset = src2->u[i] & 0x1f;
>>> +      int bitmask = ((1 << width) - 1) << offset;
>>> +      dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
>>> +   }
>>> +}
>>> +
>>> +static void
>>> +micro_brev(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src)
>>> +{
>>> +   int i;
>>> +   static const unsigned reverse[16] = {
>>> +      [0x0] = 0x0,
>>> +      [0x1] = 0x8,
>>> +      [0x2] = 0x4,
>>> +      [0x3] = 0xc,
>>> +      [0x4] = 0x2,
>>> +      [0x5] = 0xa,
>>> +      [0x6] = 0x6,
>>> +      [0x7] = 0xe,
>>> +      [0x8] = 0x1,
>>> +      [0x9] = 0x9,
>>> +      [0xa] = 0x5,
>>> +      [0xb] = 0xd,
>>> +      [0xc] = 0x3,
>>> +      [0xd] = 0xb,
>>> +      [0xe] = 0x7,
>>> +      [0xf] = 0xf,
>>> +   };
>>> +   for (i = 0; i < 4; i++) {
>>> +      dst->u[i] = (reverse[(src->u[i] >>  0) & 0xf] << 28 |
>>> +                   reverse[(src->u[i] >>  4) & 0xf] << 24 |
>>> +                   reverse[(src->u[i] >>  8) & 0xf] << 20 |
>>> +                   reverse[(src->u[i] >> 12) & 0xf] << 16 |
>>> +                   reverse[(src->u[i] >> 16) & 0xf] << 12 |
>>> +                   reverse[(src->u[i] >> 20) & 0xf] <<  8 |
>>> +                   reverse[(src->u[i] >> 24) & 0xf] <<  4 |
>>> +                   reverse[(src->u[i] >> 28) & 0xf] <<  0);
>>> +   }
>>> +}
>> Hmm, looks like that opcode is slow even by softpipe's standards
>
> I also can't imagine any uses for it (otherwise perhaps modern CPUs
> would have a way of doing this that's a little more direct). Oh well,
> it's part of ARB_gs5. I guess we could lower it into lots of other
> opcodes out of spite, but I think both radeon/nvc0 have explicit
> instructions for it, so we might as well pipe it through.
>
>> (luckily llvmpipe should be able to exploit pshufb for a rather nice
>> implementation, though it will require ssse3...).
>> I think though something like
>> uint32_t reverse(uint32_t x)
>> {
>>     x = ((x >>  1) & 0x55555555u) | ((x & 0x55555555u) <<  1);
>>     x = ((x >>  2) & 0x33333333u) | ((x & 0x33333333u) <<  2);
>>     x = ((x >>  4) & 0x0f0f0f0fu) | ((x & 0x0f0f0f0fu) <<  4);
>>     x = ((x >>  8) & 0x00ff00ffu) | ((x & 0x00ff00ffu) <<  8);
>>     x = ((x >> 16) & 0xffffu)     | ((x & 0xffffu)     << 16);
>>     return x;
>> }
>>
>> blatantly copied from
>> http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
>> should be faster. But whatever works...
>
> That's pretty clever. I actually wonder if that's faster than the
> lookup method, since this isn't really superscalar-friendly (each step
> depends on the previous step).

That's not entirely true; each step contains some independent instructions,
so it isn't that bad. OTOH you skip the oodles of (data-dependent) lookups,
which have a 3-cycle or so latency, and there are so many of them that it's
probably not pretty. For the first line, for instance, the CPU could do:
cycle 1: x >> 1, x & 0x55555555
cycle 2: ((x >> 1) & 0x55555555u), ((x & 0x55555555u) << 1)
cycle 3: |
But that's just reversing one number; there are always 4 in parallel which
are fully independent anyway (the compiler might adjust things for that, and
even if it does not, the cpu will at least to some degree, unless that's an
old-style Atom :-)).
(The solution above actually has some unnecessary ands - the last two masks
are redundant - though I bet the compiler knows that as well.)

> But it's neat, I copied it into util_bitreverse().

I don't know the absolute fastest way to do this generically. It seems
though that the problem is interesting enough that it comes up from time to
time:
http://stackoverflow.com/questions/746171/best-algorithm-for-bit-reversal-from-msb-lsb-to-lsb-msb-in-c
So it looks like a LUT could beat this, but I think you'd really need an
8-bit one, and no one likes LUTs. Of course my solution would be to use
pshufb - one day I might need to implement this in llvmpipe. For all 4
32-bit values you just need 3 pshufb, 2 shifts, 2 "and" and one "or" if I
count that right (you need 2 pshufb to swap the higher and lower 4 bits, and
one pshufb for swapping the bytes), not counting loading the masks and the
"mini-lut" used for pshufb etc. That would be beating the crap out of the
scalar C code by at least an order of magnitude :-). But anyway, I doubt
it's often used, and even if it were, softpipe was never fast.
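To sketch what I mean (rough and untested, requires SSSE3; bitreverse4() is
a made-up name, and the instruction count matches the tally above: 3 pshufb,
2 shifts, 2 ands, 1 or, not counting the constant loads):

#include <tmmintrin.h>   /* SSSE3, for _mm_shuffle_epi8 */

/* reverse the bits of four packed 32-bit values */
static __m128i
bitreverse4(__m128i x)
{
   /* the "mini-lut": bit-reversed value of each 4-bit nibble */
   const __m128i lut = _mm_setr_epi8(0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
                                     0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf);
   /* shuffle control to swap the bytes within each 32-bit lane */
   const __m128i swap = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4,
                                      11, 10, 9, 8, 15, 14, 13, 12);
   const __m128i nib = _mm_set1_epi8(0x0f);
   __m128i lo = _mm_and_si128(x, nib);
   __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), nib);
   /* the reversed low nibble becomes the high nibble and vice versa */
   __m128i rev = _mm_or_si128(_mm_slli_epi16(_mm_shuffle_epi8(lut, lo), 4),
                              _mm_shuffle_epi8(lut, hi));
   /* finally reverse the byte order within each 32-bit value */
   return _mm_shuffle_epi8(rev, swap);
}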
>>> +
>>> +static void
>>> +micro_popc(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src)
>>> +{
>>> +   dst->u[0] = util_bitcount(src->u[0]);
>>> +   dst->u[1] = util_bitcount(src->u[1]);
>>> +   dst->u[2] = util_bitcount(src->u[2]);
>>> +   dst->u[3] = util_bitcount(src->u[3]);
>>> +}
>>> +
>>> +static void
>>> +micro_lsb(union tgsi_exec_channel *dst,
>>> +          const union tgsi_exec_channel *src)
>>> +{
>>> +   dst->i[0] = ffs(src->u[0]) - 1;
>>> +   dst->i[1] = ffs(src->u[1]) - 1;
>>> +   dst->i[2] = ffs(src->u[2]) - 1;
>>> +   dst->i[3] = ffs(src->u[3]) - 1;
>>> +}
>>> +
>>> +static void
>>> +micro_imsb(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src)
>>> +{
>>> +   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
>>> +   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
>>> +   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
>>> +   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
>>> +}
>>> +
>>> +static void
>>> +micro_umsb(union tgsi_exec_channel *dst,
>>> +           const union tgsi_exec_channel *src)
>>> +{
>>> +   dst->i[0] = util_last_bit(src->u[0]) - 1;
>>> +   dst->i[1] = util_last_bit(src->u[1]) - 1;
>>> +   dst->i[2] = util_last_bit(src->u[2]) - 1;
>>> +   dst->i[3] = util_last_bit(src->u[3]) - 1;
>>> +}
>>> +
>>> +static void
>>>  exec_instruction(
>>>     struct tgsi_exec_machine *mach,
>>>     const struct tgsi_full_instruction *inst,
>>> @@ -4417,6 +4580,31 @@ exec_instruction(
>>>        /* src[2] = sampler unit */
>>>        exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
>>>        break;
>>> +
>>> +   case TGSI_OPCODE_IBFE:
>>> +      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
>>> +      break;
>>> +   case TGSI_OPCODE_UBFE:
>>> +      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>> +   case TGSI_OPCODE_BFI:
>>> +      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>> +   case TGSI_OPCODE_BREV:
>>> +      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>> +   case TGSI_OPCODE_POPC:
>>> +      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>> +   case TGSI_OPCODE_LSB:
>>> +      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>> +   case TGSI_OPCODE_IMSB:
>>> +      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
>>> +      break;
>>> +   case TGSI_OPCODE_UMSB:
>>> +      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
>>> +      break;
>>>     default:
>>>        assert( 0 );
>>>     }
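A small aside on the *SB opcodes above: the "- 1" is exactly what makes an
input with no matching bit come out as -1, which is what GLSL's
findLSB/findMSB require. A quick standalone check (hypothetical; the naive
last_bit() here just stands in for util_last_bit(), and ffs() comes from
<strings.h> as in the patch):

#include <assert.h>
#include <strings.h>   /* ffs() */

/* stand-in for util_last_bit(): 1-based position of the highest set bit,
 * 0 if no bit is set */
static int last_bit(unsigned u)
{
   int n = 0;
   while (u) {
      n++;
      u >>= 1;
   }
   return n;
}

int main(void)
{
   assert(ffs(0) - 1 == -1);         /* LSB of 0 -> -1 */
   assert(ffs(0x10) - 1 == 4);       /* lowest set bit of 0x10 is bit 4 */
   assert(last_bit(0) - 1 == -1);    /* UMSB of 0 -> -1 */
   assert(last_bit(0x10) - 1 == 4);  /* highest set bit of 0x10 is bit 4 */
   return 0;
}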
>>>
>>> diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
>>> index ec03e4e..5b811e3 100644
>>> --- a/src/gallium/auxiliary/util/u_math.h
>>> +++ b/src/gallium/auxiliary/util/u_math.h
>>> @@ -567,6 +567,17 @@ static INLINE unsigned util_last_bit(unsigned u)
>>>  #endif
>>>  }
>>>
>>> +static INLINE unsigned util_last_bit_signed(int i)
>> Similar to my previous comment, I find that name confusing. Maybe
>> last_significant_bit_signed or something like that? No biggie though.
>
> It parallels the naming of util_last_bit(). Perhaps that one should
> also be last_significant_bit, in which case I'd be happy to rename
> this function too. (I believe the current naming reflects 'fls' --
> 'find last set'.)

Yes, you are quite right. Maybe just add a comment then?

Roland

>
>> Otherwise looks good to me.
>>
>> Roland
>>
>>> +{
>>> +#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
>>> +   return 31 - __builtin_clrsb(i);
>>> +#else
>>> +   if (i >= 0)
>>> +      return util_last_bit(i);
>>> +   else
>>> +      return util_last_bit(~(unsigned)i);
>>> +#endif
>>> +}
>>>
>>>  /* Destructively loop over all of the bits in a mask as in:
>>>   *
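PS: since the "31 - clrsb" form is easy to get backwards, here is a quick
standalone check that the two branches of util_last_bit_signed() agree
(hypothetical code; the naive last_bit() again stands in for
util_last_bit()):

#include <assert.h>
#include <limits.h>

static int last_bit(unsigned u)
{
   int n = 0;
   while (u) {
      n++;
      u >>= 1;
   }
   return n;
}

/* the non-GCC fallback: highest set bit for non-negative values,
 * highest clear bit for negative ones */
static int last_bit_signed_fallback(int i)
{
   if (i >= 0)
      return last_bit(i);
   else
      return last_bit(~(unsigned)i);
}

int main(void)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
   static const int vals[] = { 0, 1, -1, 5, -6, INT_MAX, INT_MIN };
   unsigned j;
   for (j = 0; j < sizeof(vals) / sizeof(vals[0]); j++)
      assert(31 - __builtin_clrsb(vals[j]) == last_bit_signed_fallback(vals[j]));
#endif
   return 0;
}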