On Sun, Oct 8, 2023 at 9:22 AM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> Previously, I removed the movmisalign pattern to fix the execution FAILs in this commit:
> https://github.com/gcc-mirror/gcc/commit/f7bff24905a6959f85f866390db2fff1d6f95520
>
> I thought at the time that RVV doesn't allow misaligned accesses at all, so I removed that pattern.
> However, after deeper investigation, re-reading the RVV ISA, and experiments on SPIKE,
> I realized I was wrong.
>
> RVV ISA reference:
> https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-memory-alignment-constraints
>
> "If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> either the element is transferred successfully or an address misaligned exception is raised on that element."
But you gobble the "or ..." case into the existing -mstrict-align flag - are
you sure all implementations are self-consistent in how they handle non-vector
memory instructions and vector memory instructions here?  At least the above
wording doesn't seem to impose such a requirement.

> It's obvious that the RVV ISA does allow misaligned vector loads/stores.
>
> I experimented and confirmed this on SPIKE:
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
> z  0000000000000000 ra 0000000000010158 sp 0000003ffffffb40 gp 0000000000012c48
> tp 0000000000000000 t0 00000000000110da t1 000000000000000f t2 0000000000000000
> s0 0000000000013460 s1 0000000000000000 a0 0000000000012ef5 a1 0000000000012018
> a2 0000000000012a71 a3 000000000000000d a4 0000000000000004 a5 0000000000012a71
> a6 0000000000012a71 a7 0000000000012018 s2 0000000000000000 s3 0000000000000000
> s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
> s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
> t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
> pc 0000000000010258 va/inst 00000000020660a7 sr 8000000200006620
> Store/AMO access fault!
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --misaligned --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
>
> We can see SPIKE passes the previously *FAILED* execution tests once --misaligned is specified.
>
> So, to honor the RVV ISA spec, we should add the movmisalign pattern back based on this investigation,
> since it improves multiple vectorization tests and fixes several dump FAILs.
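To make the effect concrete, the kind of loop this pattern unlocks is one over
contiguous but under-aligned elements.  A minimal C sketch (my own hypothetical
example, not from the patch or the testsuite; the type and function names are
illustrative):

  #include <stdint.h>

  /* The packed wrapper reduces the element alignment to 1 byte, so the
     array holds contiguous 32-bit values at a potentially misaligned
     address.  */
  struct __attribute__ ((packed)) u32u
  {
    uint32_t v;
  };

  uint32_t
  sum_u32 (const struct u32u *p, int n)
  {
    uint32_t res = 0;
    for (int i = 0; i < n; ++i)
      res += p[i].v;   /* vectorizing this needs misaligned vector loads */
    return res;
  }

Without a movmisalign expander the vectorizer has to peel for alignment or
give up on such accesses; with it, the loop can become a plain vle32 loop.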
>
> This patch fixes the following dump FAILs:
>
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c scan-tree-dump-not optimized "Invalid sum"
>
> Consider the following case:
>
> struct s {
>     unsigned i : 31;
>     char a : 4;
> };
>
> #define N 32
> #define ELT0 {0x7FFFFFFFUL, 0}
> #define ELT1 {0x7FFFFFFFUL, 1}
> #define ELT2 {0x7FFFFFFFUL, 2}
> #define ELT3 {0x7FFFFFFFUL, 3}
> #define RES 48
> struct s A[N]
>   = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
>       ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
>
> int __attribute__ ((noipa))
> f(struct s *ptr, unsigned n) {
>     int res = 0;
>     for (int i = 0; i < n; ++i)
>       res += ptr[i].a;
>     return res;
> }
>
> -O3 -S -fno-vect-cost-model (default strict-align):
>
> f:
>         mv a4,a0
>         beq a1,zero,.L9
>         addiw a5,a1,-1
>         li a3,14
>         vsetivli zero,16,e64,m8,ta,ma
>         bleu a5,a3,.L3
>         andi a5,a0,127
>         bne a5,zero,.L3
>         srliw a3,a1,4
>         slli a3,a3,7
>         li a0,15
>         slli a0,a0,32
>         add a3,a3,a4
>         mv a5,a4
>         li a2,32
>         vmv.v.x v16,a0
>         vsetvli zero,zero,e32,m4,ta,ma
>         vmv.v.i v4,0
> .L4:
>         vsetvli zero,zero,e64,m8,ta,ma
>         vle64.v v8,0(a5)
>         addi a5,a5,128
>         vand.vv v8,v8,v16
>         vsetvli zero,zero,e32,m4,ta,ma
>         vnsrl.wx v8,v8,a2
>         vadd.vv v4,v4,v8
>         bne a5,a3,.L4
>         li a3,0
>         andi a5,a1,15
>         vmv.s.x v1,a3
>         andi a3,a1,-16
>         vredsum.vs v1,v4,v1
>         vmv.x.s a0,v1
>         mv a2,a0
>         beq a5,zero,.L15
>         slli a5,a3,3
>         add a5,a4,a5
>         lw a0,4(a5)
>         andi a0,a0,15
>         addiw a4,a3,1
>         addw a0,a0,a2
>         bgeu a4,a1,.L15
>         lw a2,12(a5)
>         andi a2,a2,15
>         addiw a4,a3,2
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,20(a5)
>         andi a2,a2,15
>         addiw a4,a3,3
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,28(a5)
>         andi a2,a2,15
>         addiw a4,a3,4
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,36(a5)
>         andi a2,a2,15
>         addiw a4,a3,5
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,44(a5)
>         andi a2,a2,15
>         addiw a4,a3,6
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,52(a5)
>         andi a2,a2,15
>         addiw a4,a3,7
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a4,60(a5)
>         andi a4,a4,15
>         addw a4,a4,a0
>         addiw a2,a3,8
>         mv a0,a4
>         bgeu a2,a1,.L15
>         lw a0,68(a5)
>         andi a0,a0,15
>         addiw a2,a3,9
>         addw a0,a0,a4
>         bgeu a2,a1,.L15
>         lw a2,76(a5)
>         andi a2,a2,15
>         addiw a4,a3,10
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,84(a5)
>         andi a2,a2,15
>         addiw a4,a3,11
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,92(a5)
>         andi a2,a2,15
>         addiw a4,a3,12
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a2,100(a5)
>         andi a2,a2,15
>         addiw a4,a3,13
>         addw a0,a2,a0
>         bgeu a4,a1,.L15
>         lw a4,108(a5)
>         andi a4,a4,15
>         addiw a3,a3,14
>         addw a0,a4,a0
>         bgeu a3,a1,.L15
>         lw a5,116(a5)
>         andi a5,a5,15
>         addw a0,a5,a0
>         ret
> .L9:
>         li a0,0
> .L15:
>         ret
> .L3:
>         mv a5,a4
>         slli a4,a1,32
>         srli a1,a4,29
>         add a1,a5,a1
>         li a0,0
> .L7:
>         lw a4,4(a5)
>         andi a4,a4,15
>         addi a5,a5,8
>         addw a0,a4,a0
>         bne a5,a1,.L7
>         ret
>
> -O3 -S -mno-strict-align -fno-vect-cost-model:
>
> f:
>         beq a1,zero,.L4
>         slli a1,a1,32
>         li a5,15
>         vsetvli a4,zero,e64,m1,ta,ma
>         slli a5,a5,32
>         srli a1,a1,32
>         li a6,32
>         vmv.v.x v3,a5
>         vsetvli zero,zero,e32,mf2,ta,ma
>         vmv.v.i v2,0
> .L3:
>         vsetvli a5,a1,e64,m1,ta,ma
>         vle64.v v1,0(a0)
>         vsetvli a3,zero,e64,m1,ta,ma
>         slli a2,a5,3
>         vand.vv v1,v1,v3
>         sub a1,a1,a5
>         vsetvli zero,zero,e32,mf2,ta,ma
>         add a0,a0,a2
>         vnsrl.wx v1,v1,a6
>         vsetvli zero,a5,e32,mf2,tu,ma
>         vadd.vv v2,v2,v1
>         bne a1,zero,.L3
>         li a5,0
>         vsetvli a3,zero,e32,mf2,ta,ma
>         vmv.s.x v1,a5
>         vredsum.vs v2,v2,v1
>         vmv.x.s a0,v2
>         ret
> .L4:
>         li a0,0
>         ret
>
> We can see this improves the codegen for this case a lot.
>
> NOTE: Currently, we use -mno-strict-align to enable the movmisalign pattern, which is not accurate, since
> according to the RVV ISA:
>
> "Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
>
> Ideally, we need a separate compile option to enable vector misalignment support independently of scalar
> misalignment support (for example, -mno-vector-strict-align).
>
> gcc/ChangeLog:
>
>         * config/riscv/vector.md (movmisalign<mode>): Support misalign
>         pattern for RVV VLA modes.
>
> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp: Add -mno-strict-align to run vect tests.
>
> ---
>  gcc/config/riscv/vector.md            | 13 +++++++++++++
>  gcc/testsuite/lib/target-supports.exp |  2 ++
>  2 files changed, 15 insertions(+)
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index cf5c0a40257..1bed1e40bd4 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -1326,6 +1326,19 @@
>  }
>  )
>
> +;; According to RVV ISA:
> +;; If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> +;; either the element is transferred successfully or an address misaligned exception is raised on that element.
> +(define_expand "movmisalign<mode>"
> +  [(set (match_operand:V 0 "nonimmediate_operand")
> +        (match_operand:V 1 "general_operand"))]
> +  "TARGET_VECTOR && !STRICT_ALIGNMENT"
> +  {
> +    emit_move_insn (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; -----------------------------------------------------------------
>  ;;  ---- Duplicate Operations
>  ;; -----------------------------------------------------------------
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index af52c38433d..f0b89cf4722 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -11389,11 +11389,13 @@ proc check_vect_support_and_set_flags { } {
>      if [check_effective_target_riscv_vector_hw] {
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> +        lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
>          set dg-do-what-default run
>      } else {
>          lappend DEFAULT_VECTCFLAGS "-march=rv64gcv_zvfh" "-mabi=lp64d"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
>          lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> +        lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
>          set dg-do-what-default compile
>      }
>  } else {
> --
> 2.36.3
>
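Regarding the NOTE about a separate option: since the spec makes scalar and
vector misalignment support independent, a per-implementation probe may be the
only reliable way to find out what a given core does.  A rough sketch of such a
runtime probe (my own illustration, not part of the patch; it assumes the RVV C
intrinsics from <riscv_vector.h>, and that the harness running it catches the
resulting trap/SIGBUS on implementations without support):

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Attempt one element-misaligned vector load: 32-bit elements read
     from a deliberately 1-byte-offset address.  On an implementation
     without misaligned vector support, the vle32.v raises an
     address-misaligned exception here instead of returning.  */
  int
  probe_vector_misalign (void)
  {
    static uint8_t buf[64] __attribute__ ((aligned (16)));
    const int32_t *p = (const int32_t *) (buf + 1);
    vint32m1_t v = __riscv_vle32_v_i32m1 (p, 4);
    /* Reached only if the load did not trap.  */
    return __riscv_vmv_x_s_i32m1_i32 (v) == 0;
  }

A hypothetical -mno-vector-strict-align flag plus an effective-target check
built on such a probe would let the testsuite enable the misaligned paths only
where the target (hardware, or spike --misaligned) actually supports them.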