On 1/20/25 19:07, Li, Pan2 wrote:
> Agree, the mode-switch will take care of the frm when it meets a call (covered
> by a testcase already).
>
> extern size_t normalize_vl_1 (size_t vl);
> extern size_t normalize_vl_2 (size_t vl);
>
> vfloat32m1_t
> test_float_point_dynamic_frm (vfloat32m1_t op1, vfloat32m1_t op2,
>                               unsigned count, size_t vl)
> {
>   vfloat32m1_t result = op1;
>
>   for (unsigned i = 0; i < count; i++)
>     {
>       if (i % 3 == 0)
>         {
>           result = __riscv_vfadd_vv_f32m1 (op1, result, vl);
>           vl = normalize_vl_1 (vl);
>         }
>       else
>         {
>           result = __riscv_vfadd_vv_f32m1_rm (result, op2, 1, vl);
>           vl = normalize_vl_2 (vl);
>         }
>     }
>
>   return result;
> }
>
> .L12:
>         csrr    a5,vlenb
>         add     a5,a5,sp
>         vl1re32.v       v1,0(a5)
>         vsetvli zero,a1,e32,m1,ta,ma
>         addiw   s0,s0,1
>         vfadd.vv        v8,v1,v8 // Does not pollute frm, nothing needs to be done here
>         vs1r.v  v8,0(sp)
>         call    normalize_vl_1 
>         vl1re32.v       v8,0(sp)
>         frrm    a4
>         mv      a1,a0
>         beq     s3,s0,.L8
> .L5:
>         mulw    a5,s0,s2
>         mv      a0,a1
>         bleu    a5,s1,.L12
>         fsrmi   1
>         csrr    a5,vlenb
>         slli    a5,a5,1
>         add     a5,a5,sp
>         vl1re32.v       v1,0(a5)
>         vsetvli zero,a1,e32,m1,ta,ma
>         vfadd.vv        v8,v8,v1 // Pollutes frm, will restore frm before call
>         vs1r.v  v8,0(sp)
>         fsrm    a4
>         call    normalize_vl_2
>         addiw   s0,s0,1
>         vl1re32.v       v8,0(sp)
>         frrm    a4
>         mv      a1,a0
>         bne     s3,s0,.L5
>
> While for the llround autovec, it will also perform something like restoring
> frm before leaving the function.
>
> #define TEST_UNARY_CALL_CVT(TYPE_IN, TYPE_OUT, CALL) \
>   void test_##TYPE_IN##_##TYPE_OUT##_##CALL (        \
>     TYPE_OUT *out, TYPE_IN *in, unsigned count)      \
>   {                                                  \
>     for (unsigned i = 0; i < count; i++)             \
>       out[i] = CALL (in[i]);                         \
>   }
>
> TEST_UNARY_CALL_CVT (double, int64_t, __builtin_llround)
>
> test_double_int64_t___builtin_llround:
>   frrm    a3
>   beq     a2,zero,.L8
>   fsrmi   4
>   slli    a2,a2,32
>   srli    a2,a2,32
> .L3:
>   vsetvli a5,a2,e64,m1,ta,ma
>   vle64.v v1,0(a1)
>   slli    a4,a5,3
>   sub     a2,a2,a5
>   add     a1,a1,a4
>   vfcvt.x.f.v     v1,v1
>   vse64.v v1,0(a0)
>   add     a0,a0,a4
>   bne     a2,zero,.L3
> .L8:
>   fsrm    a3

Silly question, but what exactly is the procedure calling convention rule for
FCSR/FRM? Is it a caller-saved or a callee-saved register?
The psABI CC doc is not explicit in those terms, at least [1]:

|   "The Floating-Point Control and Status Register (fcsr) must have thread
|   storage duration in accordance with C11 section 7.6 "Floating-point
|   environment <fenv.h>"."

Per your llround snippet #2 it seems like callee-saved (the function restores
the mode before it returns), but then in snippet #1 at the top, why does it
need to save the value before a function call? Can't the callee just restore
it back? I'm surely missing something here.
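
To make the question concrete to myself, here is a rough sketch of the two
conventions I am trying to tell apart, written against the plain C <fenv.h>
API rather than the raw frm CSR. extern_call() and the function names below
are made up for illustration, not taken from either snippet.

  #include <fenv.h>

  extern void extern_call (void);   /* hypothetical external callee */

  /* If FRM were caller-saved, the caller could not trust the callee and
     would have to re-establish its own rounding mode after every call.  */
  void
  if_frm_is_caller_saved (void)
  {
    int saved = fegetround ();
    extern_call ();
    fesetround (saved);
  }

  /* If FRM were callee-saved, a function that changes the mode would
     restore it before returning, so callers see it unchanged -- which is
     what the llround snippet #2 above appears to do.  */
  void
  if_frm_is_callee_saved (void)
  {
    int saved = fegetround ();
    fesetround (FE_UPWARD);
    /* ... work that depends on FE_UPWARD ... */
    fesetround (saved);
  }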

P.S. I thought I had the test reduced, but it turns out the reduced one is
fine, even if it has some abnormal control flow and FRM set/restore.
I will try to narrow it down with qemu tracing to find the actual function
which is leaking FRM.
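
In case it is useful, the kind of check I have in mind could also be done
natively with a tiny helper that reads frm around a suspect call and compares.
read_frm() and suspect() below are made-up names, and this obviously needs an
F-extension target:

  #include <stdio.h>

  /* Read the current dynamic rounding mode from the frm CSR.  */
  static inline unsigned
  read_frm (void)
  {
    unsigned frm;
    __asm__ volatile ("frrm %0" : "=r" (frm));
    return frm;
  }

  extern void suspect (void);   /* hypothetical function under test */

  void
  check_frm_around_call (void)
  {
    unsigned before = read_frm ();
    suspect ();
    unsigned after = read_frm ();
    if (before != after)
      printf ("FRM leaked: %u -> %u\n", before, after);
  }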

Thx,
-Vineet

[1] 
https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
