On Mon, 18 Nov 2024 15:20:17 GMT, Quan Anh Mai <qa...@openjdk.org> wrote:
>> @merykitty I guess we can always use >> [vmovdqu](https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64). >> >> And in fact that is exactly what we do: >> >> public class Test { >> static int RANGE = 1024*1024; >> >> public static void main(String[] args) { >> byte[] aB = new byte[RANGE]; >> byte[] bB = new byte[RANGE]; >> for (int i = 0; i < 100_000; i++) { >> test1(aB, bB); >> } >> } >> >> static void test1(byte[] a, byte[] b) { >> >> for (int i = 0; i < RANGE; i++) { >> >> a[i] = b[i]; >> } >> >> } >> } >> >> `../java -XX:CompileCommand=compileonly,Test::test* >> -XX:CompileCommand=printcompilation,Test::test* -XX:+TraceLoopOpts >> -XX:-TraceSuperWord -XX:+TraceNewVectors -Xbatch -XX:+AlignVector >> -XX:CompileCommand=compileonly,Test::test* >> -XX:CompileCommand=printassembly,Test::test* Test.java` >> >> >> ;; B20: # out( B20 B21 ) <- in( B19 B20 ) Loop( B20-B20 inner main of >> N178 strip mined) Freq: 8.13586e+09 >> 0x00007fc3a4bb0780: movslq %ebx,%rdi >> 0x00007fc3a4bb0783: movslq %ebx,%r14 >> 0x00007fc3a4bb0786: vmovdqu32 0x10(%r13,%r14,1),%zmm1 >> 0x00007fc3a4bb0791: vmovdqu32 %zmm1,0x10(%r9,%r14,1) >> 0x00007fc3a4bb079c: vmovdqu32 0x50(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb07a7: vmovdqu32 %zmm1,0x50(%r9,%rdi,1) >> 0x00007fc3a4bb07b2: vmovdqu32 0x90(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb07bd: vmovdqu32 %zmm1,0x90(%r9,%rdi,1) >> 0x00007fc3a4bb07c8: vmovdqu32 0xd0(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb07d3: vmovdqu32 %zmm1,0xd0(%r9,%rdi,1) >> 0x00007fc3a4bb07de: vmovdqu32 0x110(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb07e9: vmovdqu32 %zmm1,0x110(%r9,%rdi,1) >> 0x00007fc3a4bb07f4: vmovdqu32 0x150(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb07ff: vmovdqu32 %zmm1,0x150(%r9,%rdi,1) >> 0x00007fc3a4bb080a: vmovdqu32 0x190(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb0815: vmovdqu32 %zmm1,0x190(%r9,%rdi,1) >> 0x00007fc3a4bb0820: vmovdqu32 0x1d0(%r13,%rdi,1),%zmm1 >> 0x00007fc3a4bb082b: vmovdqu32 %zmm1,0x1d0(%r9,%rdi,1) ;*bastore >> {reexecute=0 rethrow=0 
return_oop=0} >> ; - >> Test::test1@14 (line 14) >> 0x00007fc3a4bb0836: add $0x200,%ebx ;*iinc >> {reexecute=0 rethrow=0 return_oop=0} >> ... > > @eme64 What I mean here is that `AlignVector` seems useless because the > accesses are going to be misaligned either way. @merykitty FYI: `src/hotspot/share/opto/vectorization.hpp: static bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }` The relevant code: src/hotspot/cpu/x86/matcher_x86.hpp: static constexpr bool misaligned_vectors_ok() { // x86 supports misaligned vectors store/load. static constexpr bool misaligned_vectors_ok() { return true; } src/hotspot/cpu/ppc/matcher_ppc.hpp: static constexpr bool misaligned_vectors_ok() { // PPC implementation uses VSX load/store instructions (if // SuperwordUseVSX) which support 4 byte but not arbitrary alignment static constexpr bool misaligned_vectors_ok() { return false; } src/hotspot/cpu/aarch64/matcher_aarch64.hpp: static constexpr bool misaligned_vectors_ok() { // aarch64 supports misaligned vectors store/load. static constexpr bool misaligned_vectors_ok() { return true; } src/hotspot/cpu/s390/matcher_s390.hpp: static constexpr bool misaligned_vectors_ok() { // z/Architecture does support misaligned store/load at minimal extra cost. static constexpr bool misaligned_vectors_ok() { return true; } src/hotspot/cpu/arm/matcher_arm.hpp: static constexpr bool misaligned_vectors_ok() { // ARM doesn't support misaligned vectors store/load. static constexpr bool misaligned_vectors_ok() { return false; } src/hotspot/cpu/riscv/matcher_riscv.hpp: static constexpr bool misaligned_vectors_ok() { // riscv supports misaligned vectors store/load. static constexpr bool misaligned_vectors_ok() { return true; } We can see that only PPC and ARM32 have such strict alignment requirements. And it turns out that PPC only needs 4-byte alignment, and ARM32 is fine with 8-byte alignment. 
So none of our platforms strictly requires full vector-width alignment. ------------- PR Comment: https://git.openjdk.org/jdk/pull/20677#issuecomment-2483505834