Different GCC versions provide different features; the macros CONFIG_LSX_OPT and CONFIG_LASX_OPT are added here to detect at configure time whether GCC supports the built-in LSX/LASX intrinsics.
Function buffer_zero_lsx() is added for the 128-bit SIMD optimization, and buffer_zero_lasx() for the 256-bit SIMD optimization. The LoongArch GCC built-in LSX/LASX intrinsics can only be used when the compiler option -mlsx/-mlasx is passed, and there is no per-function option to enable them selectively. So the optimization only takes effect when QEMU is built with --extra-cflags="-mlasx".

Signed-off-by: Bibo Mao <maob...@loongson.cn>
---
 meson.build         |  11 +++++
 util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/meson.build b/meson.build
index 6386607144..29bc362d7a 100644
--- a/meson.build
+++ b/meson.build
@@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
     void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
   '''))
 
+# For LoongArch64, detect if LSX/LASX are available.
+config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
+    #include "lsxintrin.h"
+    int foo(__m128i v) { return __lsx_bz_v(v); }
+  '''))
+
+config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
+    #include "lasxintrin.h"
+    int foo(__m256i v) { return __lasx_xbz_v(v); }
+  '''))
+
 if get_option('membarrier').disabled()
   have_membarrier = false
 elif host_os == 'windows'
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 74864f7b78..751e81dbb3 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -265,6 +265,109 @@ static biz_accel_fn const accel_table[] = {
     buffer_is_zero_int_ge256,
     buffer_is_zero_simd,
 };
+#elif defined(__loongarch__)
+#ifdef CONFIG_LSX_OPT
+#include "lsxintrin.h"
+static bool buffer_zero_lsx(const void *buf, size_t len)
+{
+    /* Unaligned loads at head/tail. */
+    __m128i v = *(__m128i *)(buf);
+    __m128i w = *(__m128i *)(buf + len - 16);
+    /* Align head/tail to 16-byte boundaries. */
+    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+
+    /* Collect a partial block at tail end. */
+    v |= e[-1]; w |= e[-2];
+    v |= e[-3]; w |= e[-4];
+    v |= e[-5]; w |= e[-6];
+    v |= e[-7]; v |= w;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        if (!__lsx_bz_v(v)) {
+            return false;
+        }
+        v = p[0]; w = p[1];
+        v |= p[2]; w |= p[3];
+        v |= p[4]; w |= p[5];
+        v |= p[6]; w |= p[7];
+        v |= w;
+        p += 8;
+    } while (p < e - 7);
+
+    return __lsx_bz_v(v);
+}
+#endif
+
+#ifdef CONFIG_LASX_OPT
+#include "lasxintrin.h"
+static bool buffer_zero_lasx(const void *buf, size_t len)
+{
+    /* Unaligned loads at head/tail. */
+    __m256i v = *(__m256i *)(buf);
+    __m256i w = *(__m256i *)(buf + len - 32);
+    /* Align head/tail to 32-byte boundaries. */
+    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+
+    /* Collect a partial block at tail end. */
+    v |= e[-1]; w |= e[-2];
+    v |= e[-3]; w |= e[-4];
+    v |= e[-5]; w |= e[-6];
+    v |= e[-7]; v |= w;
+
+    /* Loop over complete 256-byte blocks. */
+    for (; p < e - 7; p += 8) {
+        /* Test the accumulator from the previous block. */
+        if (!__lasx_xbz_v(v)) {
+            return false;
+        }
+
+        v = p[0]; w = p[1];
+        v |= p[2]; w |= p[3];
+        v |= p[4]; w |= p[5];
+        v |= p[6]; w |= p[7];
+        v |= w;
+    }
+
+    return __lasx_xbz_v(v);
+}
+#endif
+
+static biz_accel_fn const accel_table[] = {
+    buffer_is_zero_int_ge256,
+#ifdef CONFIG_LSX_OPT
+    buffer_zero_lsx,
+#endif
+#ifdef CONFIG_LASX_OPT
+    buffer_zero_lasx,
+#endif
+};
+
+static unsigned best_accel(void)
+{
+    unsigned info = cpuinfo_init();
+
+    /* CONFIG_LSX_OPT must be enabled if CONFIG_LASX_OPT is enabled. */
+#ifdef CONFIG_LASX_OPT
+    if (info & CPUINFO_LASX) {
+        return 2;
+    }
+#endif
+
+#ifdef CONFIG_LSX_OPT
+    if (info & CPUINFO_LSX) {
+        return 1;
+    }
+#endif
+
+    return 0;
+}
 #else
 #define best_accel() 0
 static biz_accel_fn const accel_table[1] = {
-- 
2.39.3
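
For reference, a minimal way to exercise this path is sketched below. It assumes a LoongArch64 host, QEMU's default "build" directory, and a GCC new enough to accept -mlasx (which should also enable LSX, so both config macros get set):

  $ ./configure --extra-cflags="-mlasx"
  $ grep -E 'CONFIG_LA?SX_OPT' build/config-host.h   # both macros should be defined
  $ make -j"$(nproc)"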