On 10/8/24 01:51, LIU Zhiwei wrote:
>> It occurs to me that, rather than caching valid_frac_lmul[][], we can
>> pre-compute encode_vtype and lmul_eq_avl.
> Do you mean cache vtype and lmul_eq_avl for different (lmul, sew) pairs
> instead of valid_frac_lmul?
Or even one step further:

typedef struct VsetCache {
    unsigned movi_insn;
    unsigned vset_insn;
} VsetCache;

static VsetCache riscv_vset_cache[3][4];

static void set_vtype(TCGContext *s, TCGType type, MemOp vsew)
{
    const VsetCache *p = &riscv_vset_cache[type - TCG_TYPE_V64][vsew];

    s->riscv_cur_type = type;
    s->riscv_cur_vsew = vsew;

    if (p->movi_insn) {
        tcg_out32(s, p->movi_insn);
    }
    tcg_out32(s, p->vset_insn);
}
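
Something like the following (untested, not part of the patch, just to
illustrate the intended use) could then guard emission at each use site:

static void tcg_out_vset_if_needed(TCGContext *s, TCGType type, MemOp vsew)
{
    /* Skip the vsetvl entirely when the dynamic config already matches. */
    if (s->riscv_cur_type != type || s->riscv_cur_vsew != vsew) {
        set_vtype(s, type, vsew);
    }
}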

static bool vtype_check(unsigned vtype)
{
    unsigned long tmp;
    /* Requires an assembler that knows the V extension. */
    asm("vsetvl %0, zero, %1" : "=r"(tmp) : "r"(vtype));
    return tmp != 0;
}
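
If the assembler is built without the V extension, the same probe can
use the raw encoding instead (vsetvl is funct7 0x40, funct3 7 in the
OP-V major opcode 0x57):

    /* vsetvl tmp, zero, vtype */
    asm(".insn r 0x57, 7, 0x40, %0, zero, %1" : "=r"(tmp) : "r"(vtype));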

static void probe_frac_lmul_1(TCGType type, MemOp vsew)
{
    VsetCache *p = &riscv_vset_cache[type - TCG_TYPE_V64][vsew];
    unsigned avl = tcg_type_size(type) >> vsew;
    int lmul = type - riscv_lg2_vlenb;
    unsigned vtype = encode_vtype(true, true, vsew, lmul & 7);
    bool lmul_eq_avl = true;

    /* Guaranteed by Zve64x. */
    assert(lmul < 3);

    /*
     * For LMUL < -3, the host vector size is so large that TYPE
     * is smaller than the minimum 1/8 fraction.
     *
     * For other fractional LMUL settings, implementations must
     * support SEW settings between SEW_MIN and LMUL * ELEN, inclusive.
     * So if ELEN = 64, LMUL = 1/2, then SEW will support e8, e16, e32,
     * but e64 may not be supported. In other words, the hardware only
     * guarantees SEW_MIN <= SEW <= LMUL * ELEN.  Check.
     */
    if (lmul < 0 && (lmul < -3 || !vtype_check(vtype))) {
        vtype = encode_vtype(true, true, vsew, VLMUL_M1);
        lmul_eq_avl = false;
    }

    if (avl < 32) {
        p->vset_insn = encode_vseti(OPC_VSETIVLI, TCG_REG_ZERO, avl, vtype);
    } else if (lmul_eq_avl) {
        /* rd != 0 and rs1 == 0 uses vlmax */
        p->vset_insn = encode_vset(OPC_VSETVLI, TCG_REG_TMP0, TCG_REG_ZERO, vtype);
    } else {
        p->movi_insn = encode_i(OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl);
        p->vset_insn = encode_vset(OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0, vtype);
    }
}
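
For reference, encode_vtype is assumed here to pack the vtype CSR
fields per the RVV 1.0 layout (vlmul in bits [2:0], vsew in [5:3],
vta in bit 6, vma in bit 7), i.e. something like:

static unsigned encode_vtype(bool vta, bool vma, MemOp vsew, unsigned vlmul)
{
    return vma << 7 | vta << 6 | vsew << 3 | vlmul;
}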

static void probe_frac_lmul(void)
{
    /* Match riscv_lg2_vlenb to TCG_TYPE_V64. */
    QEMU_BUILD_BUG_ON(TCG_TYPE_V64 != 3);

    for (TCGType t = TCG_TYPE_V64; t <= TCG_TYPE_V256; t++) {
        for (MemOp e = MO_8; e <= MO_64; e++) {
            probe_frac_lmul_1(t, e);
        }
    }
}
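
To make the lmul arithmetic concrete, suppose the host has VLEN = 128,
i.e. riscv_lg2_vlenb = 4:

    TCG_TYPE_V64  (3): lmul = 3 - 4 = -1  ->  LMUL = 1/2 (fractional)
    TCG_TYPE_V128 (4): lmul = 4 - 4 =  0  ->  LMUL = 1
    TCG_TYPE_V256 (5): lmul = 5 - 4 =  1  ->  LMUL = 2

Only the fractional entries can fail vtype_check() for the larger SEWs;
those fall back to LMUL = 1 with an explicit AVL (lmul_eq_avl = false).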

So that everything is pre-computed at startup.


r~
