On Thu, Oct 10, 2019 at 01:33:02PM -0400, Richard Henderson wrote: > On 10/10/19 2:16 AM, Andrew Jones wrote: > >> It might be best to avoid the ifdef altogether: > >> > >> for (i = 0; i < 32; ++i) { > >> uint64_t *d = (uint64_t *)&buf[sve_zreg_offset(vq, i)]; > >> for (j = 0; j < vq * 2; ++j) { > >> d[j] = cpu_to_le64(env->vfp.zregs[i].d[j]); > >> } > >> } > >> > >> The compiler may well transform the inner loop to memcpy for little-endian > >> host, but even if it doesn't core dumping is hardly performance sensitive. > > > > True. I even had something like the above at first, but then > > overcomplicated it with the #ifdef-ing. > > Ah, I wonder if you changed things around with the ifdefs due to the pregs. > There's no trivial solution for those. It'd be nice to share the bswapping > subroutine that you add in the SVE KVM patch set, and size the temporary array > using ARM_MAX_VQ.
How about something like this squashed in? Thanks, drew diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c index b02e398430b9..7c7fd23f4d94 100644 --- a/target/arm/arch_dump.c +++ b/target/arm/arch_dump.c @@ -182,6 +182,7 @@ static int aarch64_write_elf64_sve(WriteCoreDumpFunction f, struct aarch64_note *note; ARMCPU *cpu = env_archcpu(env); uint32_t vq = sve_current_vq(env); + uint64_t tmp[ARM_MAX_VQ * 2], *r; uint32_t fpr; uint8_t *buf; int ret, i; @@ -198,31 +199,14 @@ static int aarch64_write_elf64_sve(WriteCoreDumpFunction f, note->sve.flags = cpu_to_dump16(s, 1); for (i = 0; i < 32; ++i) { -#ifdef HOST_WORDS_BIGENDIAN - uint64_t d[vq * 2]; - int j; - - for (j = 0; j < vq * 2; ++j) { - d[j] = bswap64(env->vfp.zregs[i].d[j]); - } -#else - uint64_t *d = &env->vfp.zregs[i].d[0]; -#endif - memcpy(&buf[sve_zreg_offset(vq, i)], &d[0], vq * 16); + r = sve_bswap64(tmp, &env->vfp.zregs[i].d[0], vq * 2); + memcpy(&buf[sve_zreg_offset(vq, i)], r, vq * 16); } for (i = 0; i < 17; ++i) { -#ifdef HOST_WORDS_BIGENDIAN - uint64_t d[DIV_ROUND_UP(vq * 2, 8)]; - int j; - - for (j = 0; j < DIV_ROUND_UP(vq * 2, 8); ++j) { - d[j] = bswap64(env->vfp.pregs[i].p[j]); - } -#else - uint64_t *d = &env->vfp.pregs[i].p[0]; -#endif - memcpy(&buf[sve_preg_offset(vq, i)], &d[0], vq * 16 / 8); + r = sve_bswap64(tmp, r = &env->vfp.pregs[i].p[0], + DIV_ROUND_UP(vq * 2, 8)); + memcpy(&buf[sve_preg_offset(vq, i)], r, vq * 16 / 8); } fpr = cpu_to_dump32(s, vfp_get_fpsr(env)); diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 5b9c3e4cd73d..b3092e5213e6 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -975,6 +975,31 @@ void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq); void aarch64_sve_change_el(CPUARMState *env, int old_el, int new_el, bool el0_a64); void aarch64_add_sve_properties(Object *obj); + +/* + * SVE registers are encoded in KVM's memory in an endianness-invariant format. + * The byte at offset i from the start of the in-memory representation contains + * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the + * lowest offsets are stored in the lowest memory addresses, then that nearly + * matches QEMU's representation, which is to use an array of host-endian + * uint64_t's, where the lower offsets are at the lower indices. To complete + * the translation we just need to byte swap the uint64_t's on big-endian hosts. + */ +static inline uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) +{ +#ifdef HOST_WORDS_BIGENDIAN + int i; + + for (i = 0; i < nr; ++i) { + dst[i] = bswap64(src[i]); + } + + return dst; +#else + return src; +#endif +} + #else static inline void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq) { } static inline void aarch64_sve_change_el(CPUARMState *env, int o, diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 876184b8fe4d..e2da756e65ed 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -876,30 +876,6 @@ static int kvm_arch_put_fpsimd(CPUState *cs) return 0; } -/* - * SVE registers are encoded in KVM's memory in an endianness-invariant format. - * The byte at offset i from the start of the in-memory representation contains - * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the - * lowest offsets are stored in the lowest memory addresses, then that nearly - * matches QEMU's representation, which is to use an array of host-endian - * uint64_t's, where the lower offsets are at the lower indices. To complete - * the translation we just need to byte swap the uint64_t's on big-endian hosts. - */ -static uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) -{ -#ifdef HOST_WORDS_BIGENDIAN - int i; - - for (i = 0; i < nr; ++i) { - dst[i] = bswap64(src[i]); - } - - return dst; -#else - return src; -#endif -} - /* * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits * and PREGS and the FFR have a slice size of 256 bits. However we simply hard