Agreed, the mode switching will take care of the frm when a call is encountered
(this is already covered by a testcase).

extern size_t normalize_vl_1 (size_t vl);
extern size_t normalize_vl_2 (size_t vl);

vfloat32m1_t
test_float_point_dynamic_frm (vfloat32m1_t op1, vfloat32m1_t op2,
                              unsigned count, size_t vl)
{
  vfloat32m1_t result = op1;

  for (unsigned i = 0; i < count; i++)
    {
      if (i % 3 == 0)
        {
          result = __riscv_vfadd_vv_f32m1 (op1, result, vl);
          vl = normalize_vl_1 (vl);
        }
      else
        {
          result = __riscv_vfadd_vv_f32m1_rm (result, op2, 1, vl);
          vl = normalize_vl_2 (vl);
        }
    }

  return result;
}

.L12:
        csrr    a5,vlenb
        add     a5,a5,sp
        vl1re32.v       v1,0(a5)
        vsetvli zero,a1,e32,m1,ta,ma
        addiw   s0,s0,1
        vfadd.vv        v8,v1,v8 // Does not pollute frm, nothing needs to be done here
        vs1r.v  v8,0(sp)
        call    normalize_vl_1 
        vl1re32.v       v8,0(sp)
        frrm    a4
        mv      a1,a0
        beq     s3,s0,.L8
.L5:
        mulw    a5,s0,s2
        mv      a0,a1
        bleu    a5,s1,.L12
        fsrmi   1
        csrr    a5,vlenb
        slli    a5,a5,1
        add     a5,a5,sp
        vl1re32.v       v1,0(a5)
        vsetvli zero,a1,e32,m1,ta,ma
        vfadd.vv        v8,v8,v1 // Pollutes frm, so frm is restored before the call
        vs1r.v  v8,0(sp)
        fsrm    a4
        call    normalize_vl_2
        addiw   s0,s0,1
        vl1re32.v       v8,0(sp)
        frrm    a4
        mv      a1,a0
        bne     s3,s0,.L5
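
For anyone less used to the frm CSR instructions in the dump above: frrm reads
the current dynamic rounding mode, fsrmi writes an immediate mode (1 is RTZ,
matching the rounding-mode operand of the _rm intrinsic), and fsrm writes a
saved value back.  A rough scalar C analogy of the save/set/restore-before-call
pattern the compiler generates, using <fenv.h> purely for illustration (callee
and work_needing_rtz are hypothetical stand-ins, not anything in the testcase),
would be:

#include <fenv.h>

extern void callee (void);                 /* stands in for normalize_vl_2 */
extern double work_needing_rtz (double);   /* stands in for the _rm vfadd  */

double
sketch_static_rm_then_call (double x)
{
  int saved = fegetround ();     /* like "frrm a4" */
  fesetround (FE_TOWARDZERO);    /* like "fsrmi 1" (frm 1 == RTZ) */
  x = work_needing_rtz (x);      /* arithmetic under the static mode */
  fesetround (saved);            /* like "fsrm a4": restore before the call */
  callee ();                     /* the callee only ever sees the dynamic frm */
  return x;
}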

Meanwhile, for the llround autovectorization case, the frm is likewise restored
before the function returns.

#define TEST_UNARY_CALL_CVT(TYPE_IN, TYPE_OUT, CALL) \
  void test_##TYPE_IN##_##TYPE_OUT##_##CALL (        \
    TYPE_OUT *out, TYPE_IN *in, unsigned count)      \
  {                                                  \
    for (unsigned i = 0; i < count; i++)             \
      out[i] = CALL (in[i]);                         \
  }

TEST_UNARY_CALL_CVT (double, int64_t, __builtin_llround)

test_double_int64_t___builtin_llround:
  frrm    a3
  beq     a2,zero,.L8
  fsrmi   4
  slli    a2,a2,32
  srli    a2,a2,32
.L3:
  vsetvli a5,a2,e64,m1,ta,ma
  vle64.v v1,0(a1)
  slli    a4,a5,3
  sub     a2,a2,a5
  add     a1,a1,a4
  vfcvt.x.f.v     v1,v1
  vse64.v v1,0(a0)
  add     a0,a0,a4
  bne     a2,zero,.L3
.L8:
  fsrm    a3
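
The fsrmi 4 above selects RMM (round to nearest, ties to max magnitude), which
is the rounding behaviour llround needs for the vfcvt.x.f.v, and the incoming
frm is only written back at .L8 before the function exits.  A hand-written
intrinsic version with the same shape (a sketch only, with simplified
strip-mining; the real code comes from the autovectorizer) might read:

#include <riscv_vector.h>
#include <stdint.h>

void
llround_f64_to_i64 (int64_t *out, const double *in, unsigned count)
{
  size_t avl = count;
  while (avl > 0)
    {
      size_t vl = __riscv_vsetvl_e64m1 (avl);
      vfloat64m1_t v = __riscv_vle64_v_f64m1 (in, vl);
      /* frm 4 == RMM: round to nearest, ties to max magnitude,
         i.e. the llround rounding for the float-to-int conversion.  */
      vint64m1_t r = __riscv_vfcvt_x_f_v_i64m1_rm (v, 4, vl);
      __riscv_vse64_v_i64m1 (out, r, vl);
      in += vl;
      out += vl;
      avl -= vl;
    }
}

As with the dynamic-frm testcase above, it is the compiler rather than the user
of the intrinsic that is expected to save the incoming frm once and restore it
before any call or return.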

Pan

-----Original Message-----
From: Jeff Law <jeffreya...@gmail.com> 
Sent: Tuesday, January 21, 2025 12:47 AM
To: Palmer Dabbelt <pal...@rivosinc.com>
Cc: Li, Pan2 <pan2...@intel.com>; Vineet Gupta <vine...@rivosinc.com>; 
gnu-toolch...@rivosinc.com; Robin Dapp <rdapp....@gmail.com>; 
juzhe.zh...@rivai.ai; gcc-patches@gcc.gnu.org
Subject: Re: gcc mode switching issue (was Re: RISC-V round_away () handling of 
non canonical rounding modes)



On 1/18/25 2:41 PM, Palmer Dabbelt wrote:
> 
> Ya, and thanks for the help.  For anyone else watching, the rest is over 
> here:
> https://inbox.sourceware.org/libc-alpha/78a20579-2d29-4b3b- 
> af94-434dde755...@rivosinc.com/ (inbox.sourceware.org doesn't seem to 
> handle threading across lists) -- it certainly looked like a glibc issue 
> at first, so it ended up over there.
> 
> On a sort of related note: last night/this morning I realized that if 
> calling into glibc with RMM set is UB, then aren't we going to open 
> ourselves up to issues like getting a signal in the middle of an inlined 
> __builtin_{l,}round() expansion?  Maybe fast-math/signals are far enough 
> down the rabbit hole nobody cares, though?
I believe it is considered invalid to call into the various libraries 
with a different rounding mode set -- I'm pretty sure the libraries 
don't set the rounding modes explicitly unless they're doing something 
special and know they need non-default rounding semantics.

Essentially we wouldn't want to take the penalty to ensure a rounding 
mode as we enter the math libraries for the relatively rare case that 
the user's code has set a non-standard rounding mode.

I don't *think* the RV backend changes the rounding mode anywhere unless 
the user has explicitly asked for it - at which point I think it's on 
the user to make sure it's safe to do so.

jeff
