| Issue |
172903
|
| Summary |
[RegAlloc]
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
JoernEngel
|
I was told to submit testcases for bad register allocation. Here you go!
Performance is dominated by the 40 decode_one(). Ideally they should generate 4 instructions each, but without the "ifdef __BMI2__" hint I get 5 instructions and about 10% worse performance. Drawback of using shrx is the extra register spent on a constant value. In this case it is worth spending the extra register, determined by experimentation and benchmarks. I have no idea how a compiler would automatically make the right decision on unknown code, this might be a challenge.
Otherwise the generated code is quite good, almost pleasant to read. Gcc-12 is storing the constant (53) 6 times per loop iteration, clang-16 does it once outside the loop.
#include <endian.h>
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned u32;
typedef unsigned long long u64;
typedef u64 u64u __attribute__((may_alias, aligned(1)));
static inline u64 read_le64(const void *buf) { return le64toh(*(const u64u *)buf); }
static inline u64 shrx(u64 val, u64 s)
{
u64 ret;
asm("shrx %[s],%[v],%[r]\n\t"
:[r]"=r"(ret)
:[v]"r"(val), [s]"r"(s)
:);
return ret;
}
static inline u64 decode_one(u64 acc, u8 *op, const u16 table[2048])
{
u64 slot;
/* shrx is faster than mov+shr */
#ifdef __BMI2__
slot = shrx(acc, 53);
#else
slot = acc >> 53;
#endif
u16 entry = table[slot];
*op = entry>>8;
acc <<= entry&0xff;
return acc;
}
static inline u64 reload_one(u64 acc, const void **ipp)
{
const void *ip = *ipp;
int bits = __builtin_ctzll(acc);
int bytes = bits/8;
ip -= bytes;
acc = read_le64(ip)|1;
acc <<= bits&7;
*ipp = ip;
return acc;
}
void dec_bitstream_mod8(void *dst, u64 dlen,
const void *src, u64 slen[8],
const u16 table[2048])
{
const void *ip[8];
ip[0] = src+slen[0] - 8;
for (int i=1; i<8; i++)
ip[i] = ip[i-1] + slen[i];
u8 *op = dst;
u64 r[8];
for (int i=0; i<8; i++)
r[i] = read_le64(ip[i])|1;
u64 o=0;
for (; o+40<=dlen; o+=40) {
r[0] = decode_one(r[0], op+o+000, table);
r[1] = decode_one(r[1], op+o+001, table);
r[2] = decode_one(r[2], op+o+002, table);
r[3] = decode_one(r[3], op+o+003, table);
r[4] = decode_one(r[4], op+o+004, table);
r[5] = decode_one(r[5], op+o+005, table);
r[6] = decode_one(r[6], op+o+006, table);
r[7] = decode_one(r[7], op+o+007, table);
r[0] = decode_one(r[0], op+o+010, table);
r[1] = decode_one(r[1], op+o+011, table);
r[2] = decode_one(r[2], op+o+012, table);
r[3] = decode_one(r[3], op+o+013, table);
r[4] = decode_one(r[4], op+o+014, table);
r[5] = decode_one(r[5], op+o+015, table);
r[6] = decode_one(r[6], op+o+016, table);
r[7] = decode_one(r[7], op+o+017, table);
r[0] = decode_one(r[0], op+o+020, table);
r[1] = decode_one(r[1], op+o+021, table);
r[2] = decode_one(r[2], op+o+022, table);
r[3] = decode_one(r[3], op+o+023, table);
r[4] = decode_one(r[4], op+o+024, table);
r[5] = decode_one(r[5], op+o+025, table);
r[6] = decode_one(r[6], op+o+026, table);
r[7] = decode_one(r[7], op+o+027, table);
r[0] = decode_one(r[0], op+o+030, table);
r[1] = decode_one(r[1], op+o+031, table);
r[2] = decode_one(r[2], op+o+032, table);
r[3] = decode_one(r[3], op+o+033, table);
r[4] = decode_one(r[4], op+o+034, table);
r[5] = decode_one(r[5], op+o+035, table);
r[6] = decode_one(r[6], op+o+036, table);
r[7] = decode_one(r[7], op+o+037, table);
r[0] = decode_one(r[0], op+o+040, table);
r[1] = decode_one(r[1], op+o+041, table);
r[2] = decode_one(r[2], op+o+042, table);
r[3] = decode_one(r[3], op+o+043, table);
r[4] = decode_one(r[4], op+o+044, table);
r[5] = decode_one(r[5], op+o+045, table);
r[6] = decode_one(r[6], op+o+046, table);
r[7] = decode_one(r[7], op+o+047, table);
r[0] = reload_one(r[0], &ip[0]);
r[1] = reload_one(r[1], &ip[1]);
r[2] = reload_one(r[2], &ip[2]);
r[3] = reload_one(r[3], &ip[3]);
r[4] = reload_one(r[4], &ip[4]);
r[5] = reload_one(r[5], &ip[5]);
r[6] = reload_one(r[6], &ip[6]);
r[7] = reload_one(r[7], &ip[7]);
}
for (;; o+=8) {
/* weird indentation is easier to read */
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wmisleading-indentation"
if (o+0>=dlen) break; r[0] = decode_one(r[0], op+o+0, table);
if (o+1>=dlen) break; r[1] = decode_one(r[1], op+o+1, table);
if (o+2>=dlen) break; r[2] = decode_one(r[2], op+o+2, table);
if (o+3>=dlen) break; r[3] = decode_one(r[3], op+o+3, table);
if (o+4>=dlen) break; r[4] = decode_one(r[4], op+o+4, table);
if (o+5>=dlen) break; r[5] = decode_one(r[5], op+o+5, table);
if (o+6>=dlen) break; r[6] = decode_one(r[6], op+o+6, table);
if (o+7>=dlen) break; r[7] = decode_one(r[7], op+o+7, table);
# pragma GCC diagnostic pop
}
}
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs