On 10/8/24 01:51, LIU Zhiwei wrote:
It occurs to me that, rather than caching valid_frac_lmul[][], we can pre-compute
encode_vtype and lmul_eq_avl.
Do you mean cache vtype and lmul_eq_avl for different (lmul, sew) pairs instead of
valid_frac_lmul?
Or even one step further:
typedef struct VsetCache {
unsigned movi_insn;
unsigned vset_insn;
} VsetCache;
static VsetCache riscv_vset_cache[3][4];
/*
 * Emit the cached instruction sequence that switches the hardware
 * vtype/vl state to match TYPE and VSEW, and record the new state
 * in the TCGContext so redundant switches can be elided.
 */
static void set_vtype(TCGContext *s, TCGType type, MemOp vsew)
{
    const VsetCache *p = &riscv_vset_cache[type - TCG_TYPE_V64][vsew];

    s->riscv_cur_type = type;
    s->riscv_cur_vsew = vsew;

    /* Load AVL into TMP0 first, when the cache says it is required. */
    if (p->movi_insn) {
        /*
         * Bug fix: original read "tcg_out32(s->p->movi_insn)", which is
         * both a bogus member access and a missing argument.
         */
        tcg_out32(s, p->movi_insn);
    }
    tcg_out32(s, p->vset_insn);
}
/*
 * Ask the host CPU whether VTYPE is a supported configuration.
 *
 * Executes "vsetvl rd, zero, rs1" with rd != x0, which per the RISC-V V
 * spec requests AVL = VLMAX.  If the hardware rejects VTYPE it sets
 * vill and vl becomes 0, so a nonzero result means "supported".
 * NOTE(review): emitted via .inst encoding so the file assembles even
 * without V-extension assembler support — confirm the build glue.
 */
static bool vtype_check(unsigned vtype)
{
unsigned long tmp;
asm("vsetvl %0, zero, %1" : "=r"(tmp) : "r"(vtype)); /* in .inst form */
return tmp != 0;
}
/*
 * Pre-compute the vset instruction (and, when needed, the AVL-loading
 * ADDI) for one (TYPE, VSEW) pair, storing it in riscv_vset_cache.
 *
 * Prefers a fractional LMUL so that vl == avl; falls back to LMUL=1
 * with an explicit AVL when the fractional setting is out of range or
 * rejected by the hardware.
 */
static void probe_frac_lmul_1(TCGType type, MemOp vsew)
{
    VsetCache *p = &riscv_vset_cache[type - TCG_TYPE_V64][vsew];
    unsigned avl = tcg_type_size(type) >> vsew;
    int lmul = type - riscv_lg2_vlenb;
    unsigned vtype = encode_vtype(true, true, vsew, lmul & 7);
    bool lmul_eq_avl = true;

    /* Guaranteed by Zve64x. */
    assert(lmul < 3);

    /*
     * For LMUL < -3, the host vector size is so large that TYPE
     * is smaller than the minimum 1/8 fraction.
     *
     * For other fractional LMUL settings, implementations must
     * support SEW settings between SEW_MIN and LMUL * ELEN, inclusive.
     * So if ELEN = 64, LMUL = 1/2, then SEW will support e8, e16, e32,
     * but e64 may not be supported. In other words, the hardware only
     * guarantees SEW_MIN <= SEW <= LMUL * ELEN. Check.
     */
    /* Bug fix: original condition was missing a closing parenthesis. */
    if (lmul < 0 && (lmul < -3 || !vtype_check(vtype))) {
        vtype = encode_vtype(true, true, vsew, VLMUL_M1);
        lmul_eq_avl = false;
    }

    if (avl < 32) {
        /* Small AVL fits in the vsetivli immediate. */
        p->vset_insn = encode_vseti(OPC_VSETIVLI, TCG_REG_ZERO, avl, vtype);
    } else if (lmul_eq_avl) {
        /* rd != 0 and rs1 == 0 uses vlmax */
        p->vset_insn = encode_vset(OPC_VSETVLI, TCG_REG_TMP0, TCG_REG_ZERO,
                                   vtype);
    } else {
        /* Must materialize AVL in a register before the vsetvli. */
        p->movi_insn = encode_i(OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl);
        p->vset_insn = encode_vset(OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0,
                                   vtype);
    }
}
/*
 * Populate riscv_vset_cache at startup: probe every supported vector
 * type against every element size exactly once.
 */
static void probe_frac_lmul(void)
{
    /* Match riscv_lg2_vlenb to TCG_TYPE_V64. */
    QEMU_BUILD_BUG_ON(TCG_TYPE_V64 != 3);

    for (TCGType type = TCG_TYPE_V64; type <= TCG_TYPE_V256; type++) {
        for (MemOp sew = MO_8; sew <= MO_64; sew++) {
            probe_frac_lmul_1(type, sew);
        }
    }
}
So that everything is pre-computed at startup.
r~