On Fri, Apr 25, 2014 at 5:44 PM, Roland Scheidegger <srol...@vmware.com> wrote: > Am 25.04.2014 19:41, schrieb Ilia Mirkin: >> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> >> --- >> src/gallium/auxiliary/tgsi/tgsi_exec.c | 188 >> +++++++++++++++++++++++++++++++++ >> src/gallium/auxiliary/util/u_math.h | 11 ++ >> 2 files changed, 199 insertions(+) >> >> diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c >> b/src/gallium/auxiliary/tgsi/tgsi_exec.c >> index 55da60a..2cc7884 100644 >> --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c >> +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c >> @@ -2603,6 +2603,40 @@ exec_vector_trinary(struct tgsi_exec_machine *mach, >> } >> } >> >> +typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src0, >> + const union tgsi_exec_channel *src1, >> + const union tgsi_exec_channel *src2, >> + const union tgsi_exec_channel *src3); >> + >> +static void >> +exec_vector_quaternary(struct tgsi_exec_machine *mach, >> + const struct tgsi_full_instruction *inst, >> + micro_quaternary_op op, >> + enum tgsi_exec_datatype dst_datatype, >> + enum tgsi_exec_datatype src_datatype) >> +{ >> + unsigned int chan; >> + struct tgsi_exec_vector dst; >> + >> + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { >> + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { >> + union tgsi_exec_channel src[4]; >> + >> + fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype); >> + fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype); >> + fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype); >> + fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype); >> + op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]); >> + } >> + } >> + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { >> + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { >> + store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, >> dst_datatype); >> + } >> + } >> +} >> + >> static void >> exec_dp3(struct 
tgsi_exec_machine *mach, >> const struct tgsi_full_instruction *inst) >> @@ -3571,6 +3605,135 @@ micro_ucmp(union tgsi_exec_channel *dst, >> } >> >> static void >> +micro_ibfe(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src0, >> + const union tgsi_exec_channel *src1, >> + const union tgsi_exec_channel *src2) >> +{ >> + int i; >> + for (i = 0; i < 4; i++) { >> + int width = src2->i[i] & 0x1f; >> + int offset = src1->i[i] & 0x1f; >> + if (width == 0) >> + dst->i[i] = 0; >> + else if (width + offset < 32) >> + dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width); >> + else >> + dst->i[i] = src0->i[i] >> offset; >> + } >> +} >> + >> +static void >> +micro_ubfe(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src0, >> + const union tgsi_exec_channel *src1, >> + const union tgsi_exec_channel *src2) >> +{ >> + int i; >> + for (i = 0; i < 4; i++) { >> + int width = src2->u[i] & 0x1f; >> + int offset = src1->u[i] & 0x1f; >> + if (width == 0) >> + dst->u[i] = 0; >> + else if (width + offset < 32) >> + dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width); >> + else >> + dst->u[i] = src0->u[i] >> offset; >> + } >> +} >> + >> +static void >> +micro_bfi(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src0, >> + const union tgsi_exec_channel *src1, >> + const union tgsi_exec_channel *src2, >> + const union tgsi_exec_channel *src3) >> +{ >> + int i; >> + for (i = 0; i < 4; i++) { >> + int width = src3->u[i] & 0x1f; >> + int offset = src2->u[i] & 0x1f; >> + int bitmask = ((1 << width) - 1) << offset; >> + dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & >> ~bitmask); >> + } >> +} >> + >> +static void >> +micro_brev(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src) >> +{ >> + int i; >> + static const unsigned reverse[16] = { >> + [0x0] = 0x0, >> + [0x1] = 0x8, >> + [0x2] = 0x4, >> + [0x3] = 0xc, >> + [0x4] = 0x2, >> + [0x5] = 0xa, >> + [0x6] = 0x6, >> + [0x7] = 0xe, 
>> + [0x8] = 0x1, >> + [0x9] = 0x9, >> + [0xa] = 0x5, >> + [0xb] = 0xd, >> + [0xc] = 0x3, >> + [0xd] = 0xb, >> + [0xe] = 0x7, >> + [0xf] = 0xf, >> + }; >> + for (i = 0; i < 4; i++) { >> + dst->u[i] = (reverse[(src->u[i] >> 0) & 0xf] << 28 | >> + reverse[(src->u[i] >> 4) & 0xf] << 24 | >> + reverse[(src->u[i] >> 8) & 0xf] << 20 | >> + reverse[(src->u[i] >> 12) & 0xf] << 16 | >> + reverse[(src->u[i] >> 16) & 0xf] << 12 | >> + reverse[(src->u[i] >> 20) & 0xf] << 8 | >> + reverse[(src->u[i] >> 24) & 0xf] << 4 | >> + reverse[(src->u[i] >> 28) & 0xf] << 0); >> + } >> +} > Hmm looks like that opcode is slow even for softpipe's standards
I also can't imagine any uses for it (otherwise perhaps modern CPUs
would have a way of doing this that's a little more direct). Oh well,
it's part of ARB_gs5. I guess we could lower it into lots of other
opcodes out of spite, but I think both radeon/nvc0 have explicit
instructions for it, so might as well pipe it through.

> (luckily llvmpipe should be able to exploit pshufb for a rather nice
> implementation, though it will require ssse3...).
> I think though something like
> uint32_t reverse(uint32_t x)
> {
>     x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1);
>     x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2);
>     x = ((x >> 4) & 0x0f0f0f0fu) | ((x & 0x0f0f0f0fu) << 4);
>     x = ((x >> 8) & 0x00ff00ffu) | ((x & 0x00ff00ffu) << 8);
>     x = ((x >> 16) & 0xffffu) | ((x & 0xffffu) << 16);
>     return x;
> }
>
> blatantly copied from
> http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
> should be faster. But whatever works...

That's pretty clever. I actually wonder if that's faster than the
lookup method, since this isn't really superscalar-friendly (each step
depends on the previous step). But it's neat, I copied it into
util_bitreverse().
> > >> + >> +static void >> +micro_popc(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src) >> +{ >> + dst->u[0] = util_bitcount(src->u[0]); >> + dst->u[1] = util_bitcount(src->u[1]); >> + dst->u[2] = util_bitcount(src->u[2]); >> + dst->u[3] = util_bitcount(src->u[3]); >> +} >> + >> +static void >> +micro_lsb(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src) >> +{ >> + dst->i[0] = ffs(src->u[0]) - 1; >> + dst->i[1] = ffs(src->u[1]) - 1; >> + dst->i[2] = ffs(src->u[2]) - 1; >> + dst->i[3] = ffs(src->u[3]) - 1; >> +} >> + >> +static void >> +micro_imsb(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src) >> +{ >> + dst->i[0] = util_last_bit_signed(src->i[0]) - 1; >> + dst->i[1] = util_last_bit_signed(src->i[1]) - 1; >> + dst->i[2] = util_last_bit_signed(src->i[2]) - 1; >> + dst->i[3] = util_last_bit_signed(src->i[3]) - 1; >> +} >> + >> +static void >> +micro_umsb(union tgsi_exec_channel *dst, >> + const union tgsi_exec_channel *src) >> +{ >> + dst->i[0] = util_last_bit(src->u[0]) - 1; >> + dst->i[1] = util_last_bit(src->u[1]) - 1; >> + dst->i[2] = util_last_bit(src->u[2]) - 1; >> + dst->i[3] = util_last_bit(src->u[3]) - 1; >> +} >> + >> +static void >> exec_instruction( >> struct tgsi_exec_machine *mach, >> const struct tgsi_full_instruction *inst, >> @@ -4417,6 +4580,31 @@ exec_instruction( >> /* src[2] = sampler unit */ >> exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2); >> break; >> + >> + case TGSI_OPCODE_IBFE: >> + exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, >> TGSI_EXEC_DATA_INT); >> + break; >> + case TGSI_OPCODE_UBFE: >> + exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, >> TGSI_EXEC_DATA_UINT); >> + break; >> + case TGSI_OPCODE_BFI: >> + exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, >> TGSI_EXEC_DATA_UINT); >> + break; >> + case TGSI_OPCODE_BREV: >> + exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, >> TGSI_EXEC_DATA_UINT); >> + 
break; >> + case TGSI_OPCODE_POPC: >> + exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, >> TGSI_EXEC_DATA_UINT); >> + break; >> + case TGSI_OPCODE_LSB: >> + exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, >> TGSI_EXEC_DATA_UINT); >> + break; >> + case TGSI_OPCODE_IMSB: >> + exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, >> TGSI_EXEC_DATA_INT); >> + break; >> + case TGSI_OPCODE_UMSB: >> + exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, >> TGSI_EXEC_DATA_UINT); >> + break; >> default: >> assert( 0 ); >> } >> diff --git a/src/gallium/auxiliary/util/u_math.h >> b/src/gallium/auxiliary/util/u_math.h >> index ec03e4e..5b811e3 100644 >> --- a/src/gallium/auxiliary/util/u_math.h >> +++ b/src/gallium/auxiliary/util/u_math.h >> @@ -567,6 +567,17 @@ static INLINE unsigned util_last_bit(unsigned u) >> #endif >> } >> >> +static INLINE unsigned util_last_bit_signed(int i) > Similar to my previous comment, I find that name confusing. Maybe > last_significant_bit_signed or something like that? No biggie though. It parallels the naming of util_last_bit(). Perhaps that one should also be last_significant_bit, in which case I'd be happy to rename this function too. (I believe the current naming reflects 'fls' -- 'find last set'.) > Otherwise looks good to me. > > Roland > >> +{ >> +#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407) >> + return 31 - __builtin_clrsb(i); >> +#else >> + if (i >= 0) >> + return util_last_bit(i); >> + else >> + return util_last_bit(~(unsigned)i); >> +#endif >> +} >> >> /* Destructively loop over all of the bits in a mask as in: >> * >> _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev