On Sun, Oct 8, 2023 at 9:22 AM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> Previously, I removed the movmisalign pattern to fix the execution FAILs in this commit:
> https://github.com/gcc-mirror/gcc/commit/f7bff24905a6959f85f866390db2fff1d6f95520
>
> I thought at the time that RVV doesn't allow misaligned accesses at all, so I removed that pattern.
> However, after deeper investigation, re-reading the RVV ISA, and experiments on SPIKE,
> I realized I was wrong.
>
> RVV ISA reference:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-memory-alignment-constraints
>
> "If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> either the element is transferred successfully or an address misaligned exception is raised on that element."
But you gobble the "or ..." case into the existing -mstrict-align flag - are
you sure all implementations are self-consistent in how they handle non-vector
memory instructions and vector memory instructions here?  At least the above
wording doesn't seem to impose such a requirement.

> It's obvious that the RVV ISA does allow misaligned vector loads/stores.
>
> I experimented and confirmed this on SPIKE:
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
> z  0000000000000000 ra 0000000000010158 sp 0000003ffffffb40 gp 0000000000012c48
> tp 0000000000000000 t0 00000000000110da t1 000000000000000f t2 0000000000000000
> s0 0000000000013460 s1 0000000000000000 a0 0000000000012ef5 a1 0000000000012018
> a2 0000000000012a71 a3 000000000000000d a4 0000000000000004 a5 0000000000012a71
> a6 0000000000012a71 a7 0000000000012018 s2 0000000000000000 s3 0000000000000000
> s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
> s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
> t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
> pc 0000000000010258 va/inst 00000000020660a7 sr 8000000200006620
> Store/AMO access fault!
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --misaligned --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
>
> We can see SPIKE passes the previously *FAILED* execution tests once --misaligned is specified.
>
> So, to honor the RVV ISA spec, we should add the movmisalign pattern back based on this investigation,
> since it improves multiple vectorization tests and fixes several dump FAILs.
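To make the effect concrete, the kind of loop this pattern unlocks is one over
contiguous but under-aligned elements.  A minimal C sketch (my own hypothetical
example, not from the patch or the testsuite; the type and function names are
illustrative):

  #include <stdint.h>

  /* The packed wrapper reduces the element alignment to 1 byte, so the
     array holds contiguous 32-bit values at a potentially misaligned
     address.  */
  struct __attribute__ ((packed)) u32u
  {
    uint32_t v;
  };

  uint32_t
  sum_u32 (const struct u32u *p, int n)
  {
    uint32_t res = 0;
    for (int i = 0; i < n; ++i)
      res += p[i].v;   /* vectorizing this needs misaligned vector loads */
    return res;
  }

Without a movmisalign expander the vectorizer has to peel for alignment or
give up on such accesses; with it, the loop can become a plain vle32 loop.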
>
> This patch fixes the following dump FAILs:
>
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c scan-tree-dump-not optimized "Invalid sum"
>
> Consider the following case:
>
> struct s {
>     unsigned i : 31;
>     char a : 4;
> };
>
> #define N 32
> #define ELT0 {0x7FFFFFFFUL, 0}
> #define ELT1 {0x7FFFFFFFUL, 1}
> #define ELT2 {0x7FFFFFFFUL, 2}
> #define ELT3 {0x7FFFFFFFUL, 3}
> #define RES 48
> struct s A[N]
>   = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
>
> int __attribute__ ((noipa))
> f(struct s *ptr, unsigned n) {
>     int res = 0;
>     for (int i = 0; i < n; ++i)
>       res += ptr[i].a;
>     return res;
> }
>
> -O3 -S -fno-vect-cost-model (default strict-align):
>
> f:
>         mv a4,a0
>         beq a1,zero,.L9
>         addiw a5,a1,-1
>         li a3,14
>         vsetivli zero,16,e64,m8,ta,ma
>         bleu a5,a3,.L3
>         andi a5,a0,127
>         bne a5,zero,.L3
>         srliw a3,a1,4
>         slli a3,a3,7
>         li a0,15
>         slli a0,a0,32
>         add a3,a3,a4
>         mv a5,a4
>         li a2,32
>         vmv.v.x v16,a0
>         vsetvli zero,zero,e32,m4,ta,ma
>         vmv.v.i v4,0
> .L4:
>         vsetvli zero,zero,e64,m8,ta,ma
>         vle64.v v8,0(a5)
>         addi a5,a5,128
>         vand.vv v8,v8,v16
>         vsetvli zero,zero,e32,m4,ta,ma
>         vnsrl.wx v8,v8,a2
>         vadd.vv v4,v4,v8
>         bne a5,a3,.L4
>         li a3,0
>         andi a5,a1,15
>         vmv.s.x v1,a3
>         andi a3,a1,-16
>         vredsum.vs v1,v4,v1
>         vmv.x.s a0,v1
>         mv a2,a0
>         beq a5,zero,.L15
>         slli a5,a3,3
>         add a5,a4,a5
>         lw a0,4(a5)
>         andi a0,a0,15
>         addiw a4,a3,1
>         addw a0,a0,a2
>         bgeu a4,a1,.L15
>         lw a2,12(a5)
>         andi a2,a2,15
>         addiw a4,a3,2
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,20(a5)
>         andi a2,a2,15
>         addiw a4,a3,3
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,28(a5)
>         andi a2,a2,15
>         addiw a4,a3,4
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,36(a5)
>         andi a2,a2,15
>         addiw a4,a3,5
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,44(a5)
>         andi a2,a2,15
>         addiw a4,a3,6
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,52(a5)
>         andi a2,a2,15
>         addiw a4,a3,7
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a4,60(a5)
>         andi a4,a4,15
>         addw a4,a4,a0
>         addiw a2,a3,8
>         mv a0,a4
>         bgeu a2,a1,.L15
>         lw a0,68(a5)
>         andi a0,a0,15
>         addiw a2,a3,9
>         addw a0,a0,a4
>         bgeu a2,a1,.L15
>         lw a2,76(a5)
>         andi a2,a2,15
>         addiw a4,a3,10
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,84(a5)
>         andi a2,a2,15
>         addiw a4,a3,11
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,92(a5)
>         andi a2,a2,15
>         addiw a4,a3,12
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,100(a5)
>         andi a2,a2,15
>         addiw a4,a3,13
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a4,108(a5)
>         andi a4,a4,15
>         addiw a3,a3,14
>         addw a0,a4,a0
>         bgeu a3,a1,.L15
>         lw a5,116(a5)
>         andi a5,a5,15
>         addw a0,a5,a0
>         ret
> .L9:
>         li a0,0
> .L15:
>         ret
> .L3:
>         mv a5,a4
>         slli a4,a1,32
>         srli a1,a4,29
>         add a1,a5,a1
>         li a0,0
> .L7:
>         lw a4,4(a5)
>         andi a4,a4,15
>         addi a5,a5,8
>         addw a0,a4,a0
>         bne a5,a1,.L7
>         ret
>
> -O3 -S -mno-strict-align -fno-vect-cost-model:
>
> f:
>         beq a1,zero,.L4
>         slli a1,a1,32
>         li a5,15
>         vsetvli a4,zero,e64,m1,ta,ma
>         slli a5,a5,32
>         srli a1,a1,32
>         li a6,32
>         vmv.v.x v3,a5
>         vsetvli zero,zero,e32,mf2,ta,ma
>         vmv.v.i v2,0
> .L3:
>         vsetvli a5,a1,e64,m1,ta,ma
>         vle64.v v1,0(a0)
>         vsetvli a3,zero,e64,m1,ta,ma
>         slli a2,a5,3
>         vand.vv v1,v1,v3
>         sub a1,a1,a5
>         vsetvli zero,zero,e32,mf2,ta,ma
>         add a0,a0,a2
>         vnsrl.wx v1,v1,a6
>         vsetvli zero,a5,e32,mf2,tu,ma
>         vadd.vv v2,v2,v1
>         bne a1,zero,.L3
>         li a5,0
>         vsetvli a3,zero,e32,mf2,ta,ma
>         vmv.s.x v1,a5
>         vredsum.vs v2,v2,v1
>         vmv.x.s a0,v2
>         ret
> .L4:
>         li a0,0
>         ret
>
> We can see this improves the codegen for this case a lot.
>
> NOTE: Currently, we use -mno-strict-align to enable the movmisalign pattern, which is not accurate, since
> according to the RVV ISA:
>
> "Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
>
> Ideally, we need a separate compile option to enable vector misalignment support independently of scalar
> misalignment support (for example, -mno-vector-strict-align).
>
> gcc/ChangeLog:
>
>         * config/riscv/vector.md (movmisalign<mode>): Support misalign
>         pattern for RVV VLA modes.
>
> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp: Add -mno-strict-align to run vect tests.
>
> ---
>  gcc/config/riscv/vector.md            | 13 +++++++++++++
>  gcc/testsuite/lib/target-supports.exp |  2 ++
>  2 files changed, 15 insertions(+)
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index cf5c0a40257..1bed1e40bd4 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -1326,6 +1326,19 @@
>  }
>  )
>
> +;; According to RVV ISA:
> +;; If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> +;; either the element is transferred successfully or an address misaligned exception is raised on that element.
> +(define_expand "movmisalign<mode>"
> +  [(set (match_operand:V 0 "nonimmediate_operand")
> +        (match_operand:V 1 "general_operand"))]
> +  "TARGET_VECTOR && !STRICT_ALIGNMENT"
> +  {
> +    emit_move_insn (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; -----------------------------------------------------------------
>  ;;  ---- Duplicate Operations
>  ;; -----------------------------------------------------------------
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index af52c38433d..f0b89cf4722 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -11389,11 +11389,13 @@ proc check_vect_support_and_set_flags { } {
>      if [check_effective_target_riscv_vector_hw] {
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> +        lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
>          set dg-do-what-default run
>      } else {
>          lappend DEFAULT_VECTCFLAGS "-march=rv64gcv_zvfh" "-mabi=lp64d"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> +        lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
>          set dg-do-what-default compile
>      }
>  } else {
> --
> 2.36.3
>
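Regarding the NOTE about a separate option: since the spec makes scalar and
vector misalignment support independent, a per-implementation probe may be the
only reliable way to find out what a given core does.  A rough sketch of such a
runtime probe (my own illustration, not part of the patch; it assumes the RVV C
intrinsics from <riscv_vector.h>, and that the harness running it catches the
resulting trap/SIGBUS on implementations without support):

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Attempt one element-misaligned vector load: 32-bit elements read
     from a deliberately 1-byte-offset address.  On an implementation
     without misaligned vector support, the vle32.v raises an
     address-misaligned exception here instead of returning.  */
  int
  probe_vector_misalign (void)
  {
    static uint8_t buf[64] __attribute__ ((aligned (16)));
    const int32_t *p = (const int32_t *) (buf + 1);
    vint32m1_t v = __riscv_vle32_v_i32m1 (p, 4);
    /* Reached only if the load did not trap.  */
    return __riscv_vmv_x_s_i32m1_i32 (v) == 0;
  }

A hypothetical -mno-vector-strict-align flag plus an effective-target check
built on such a probe would let the testsuite enable the misaligned paths only
where the target (hardware, or spike --misaligned) actually supports them.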