Hello, I'd like to fix this ancient PR. The attached patch picks up the suggested changes mentioned in comment #3 to avoid changing the FPSCR.FR bit in the sdivsi3_i4 and udivsi3_i4 library functions. As mentioned in the PR, this makes integer division a bit slower when using -mdiv=call-fp, but it allows better utilization of the SH4 matrix multiplication feature. I've also added SH4A versions of the library functions which utilize the fpchg instruction.
Tested on rev 199102 with make -k check RUNTESTFLAGS="--target_board=sh-sim \{-m4/-mb/-mdiv=call-fp,-m4-single/-mb/-mdiv=call-fp,-m4a/-mb/-mdiv=call-fp, -m4a-single/-mb/-mdiv=call-fp,-m4/-ml/-mdiv=call-fp, -m4-single/-ml/-mdiv=call-fp,-m4a/-ml/-mdiv=call-fp, -m4a-single/-ml/-mdiv=call-fp}" and no new failures. OK for trunk? Cheers, Oleg libgcc/ChangeLog: PR target/6526 * config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation. testsuite/ChangeLog: PR target/6526 * gcc.target/sh/pr6526.c: New.
Index: libgcc/config/sh/lib1funcs.S =================================================================== --- libgcc/config/sh/lib1funcs.S (revision 199109) +++ libgcc/config/sh/lib1funcs.S (working copy) @@ -1003,11 +1003,17 @@ ENDFUNC(GLOBAL(mulsi3)) #endif #endif /* ! __SH5__ */ + +/*------------------------------------------------------------------------------ + 32 bit signed integer division that uses FPU double precision division. */ + #ifdef L_sdivsi3_i4 .title "SH DIVIDE" -!! 4 byte integer Divide code for the Renesas SH + #if defined (__SH4__) || defined (__SH2A__) -!! args in r4 and r5, result in fpul, clobber dr0, dr2 +/* This variant is used when FPSCR.PR = 1 (double precision) is the default + setting. + Args in r4 and r5, result in fpul, clobber dr0, dr2. */ .global GLOBAL(sdivsi3_i4) HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) @@ -1021,8 +1027,13 @@ ftrc dr0,fpul ENDFUNC(GLOBAL(sdivsi3_i4)) + #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) -!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 +/* This variant is used when FPSCR.PR = 0 (single precision) is the default + setting. + Args in r4 and r5, result in fpul, clobber r2, dr0, dr2. + For this to work, we must temporarily switch the FPU to double precision, + but we'd better not touch FPSCR.FR. See PR 6526. */ #if ! __SH5__ || __SH5__ == 32 #if __SH5__ @@ -1031,24 +1042,43 @@ .global GLOBAL(sdivsi3_i4) HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) GLOBAL(sdivsi3_i4): - sts.l fpscr,@-r15 - mov #8,r2 - swap.w r2,r2 - lds r2,fpscr - lds r4,fpul - float fpul,dr0 - lds r5,fpul - float fpul,dr2 - fdiv dr2,dr0 - ftrc dr0,fpul + +#ifndef __SH4A__ + mov.l r3,@-r15 + sts fpscr,r2 + mov #8,r3 + swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit) + or r2,r3 + lds r3,fpscr // Set FPSCR.PR = 1. 
+ lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + lds r2,fpscr rts - lds.l @r15+,fpscr + mov.l @r15+,r3 +#else +/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */ + fpchg + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + fpchg +#endif /* __SH4A__ */ + ENDFUNC(GLOBAL(sdivsi3_i4)) #endif /* ! __SH5__ || __SH5__ == 32 */ #endif /* ! __SH4__ || __SH2A__ */ -#endif +#endif /* L_sdivsi3_i4 */ +//------------------------------------------------------------------------------ #ifdef L_sdivsi3 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with sh2e/sh3e code. */ @@ -1367,54 +1397,60 @@ mov #0,r0 ENDFUNC(GLOBAL(sdivsi3)) -#endif /* ! __SHMEDIA__ */ -#endif +#endif /* ! __SHMEDIA__ */ +#endif /* L_sdivsi3 */ + +/*------------------------------------------------------------------------------ + 32 bit unsigned integer division that uses FPU double precision division. */ + #ifdef L_udivsi3_i4 + .title "SH DIVIDE" - .title "SH DIVIDE" -!! 4 byte integer Divide code for the Renesas SH #if defined (__SH4__) || defined (__SH2A__) -!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4, -!! and t bit +/* This variant is used when FPSCR.PR = 1 (double precision) is the default + setting. 
+ Args in r4 and r5, result in fpul, + clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */ .global GLOBAL(udivsi3_i4) HIDDEN_FUNC(GLOBAL(udivsi3_i4)) GLOBAL(udivsi3_i4): - mov #1,r1 - cmp/hi r1,r5 - bf trivial - rotr r1 - xor r1,r4 - lds r4,fpul - mova L1,r0 + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 + xor r1,r4 + lds r4,fpul + mova L1,r0 #ifdef FMOVD_WORKS - fmov.d @r0+,dr4 + fmov.d @r0+,dr4 #else - fmov.s @r0+,DR40 - fmov.s @r0,DR41 + fmov.s @r0+,DR40 + fmov.s @r0,DR41 #endif - float fpul,dr0 - xor r1,r5 - lds r5,fpul - float fpul,dr2 - fadd dr4,dr0 - fadd dr4,dr2 - fdiv dr2,dr0 + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 rts - ftrc dr0,fpul + ftrc dr0,fpul trivial: rts - lds r4,fpul + lds r4,fpul .align 2 #ifdef FMOVD_WORKS - .align 3 ! make double below 8 byte aligned. + .align 3 // Make the double below 8 byte aligned. #endif L1: .double 2147483648 ENDFUNC(GLOBAL(udivsi3_i4)) + #elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__) #if ! __SH5__ || __SH5__ == 32 !! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 @@ -1436,57 +1472,106 @@ ENDFUNC(GLOBAL(udivsi3_i4)) #endif /* ! __SH5__ || __SH5__ == 32 */ + #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) -!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 +/* This variant is used when FPSCR.PR = 0 (single precision) is the default + setting. + Args in r4 and r5, result in fpul, + clobber r0, r1, r4, r5, dr0, dr2, dr4. + For this to work, we must temporarily switch the FPU to double precision, + but we'd better not touch FPSCR.FR. See PR 6526. 
*/ .global GLOBAL(udivsi3_i4) HIDDEN_FUNC(GLOBAL(udivsi3_i4)) GLOBAL(udivsi3_i4): - mov #1,r1 - cmp/hi r1,r5 - bf trivial - sts.l fpscr,@-r15 - mova L1,r0 - lds.l @r0+,fpscr - rotr r1 - xor r1,r4 - lds r4,fpul + +#ifndef __SH4A__ + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 // r1 = 1 << 31 + sts.l fpscr,@-r15 + xor r1,r4 + mov.l @(0,r15),r0 + xor r1,r5 + mov.l L2,r1 + lds r4,fpul + or r0,r1 + mova L1,r0 + lds r1,fpscr #ifdef FMOVD_WORKS - fmov.d @r0+,dr4 + fmov.d @r0+,dr4 #else - fmov.s @r0+,DR40 - fmov.s @r0,DR41 + fmov.s @r0+,DR40 + fmov.s @r0,DR41 #endif - float fpul,dr0 - xor r1,r5 - lds r5,fpul - float fpul,dr2 - fadd dr4,dr0 - fadd dr4,dr2 - fdiv dr2,dr0 - ftrc dr0,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul rts - lds.l @r15+,fpscr + lds.l @r15+,fpscr #ifdef FMOVD_WORKS - .align 3 ! make double below 8 byte aligned. + .align 3 // Make the double below 8 byte aligned. #endif trivial: rts - lds r4,fpul + lds r4,fpul .align 2 -L1: -#ifndef FMOVD_WORKS - .long 0x80000 +L2: +#ifdef FMOVD_WORKS + .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1 #else - .long 0x180000 + .long 0x80000 // FPSCR.PR = 1 #endif +L1: .double 2147483648 +#else +/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. + Although on SH4A fmovd usually works, it would require either additional + two fschg instructions or an FPSCR push + pop. It's not worth the effort + for loading only one double constant. */ + mov #1,r1 + cmp/hi r1,r5 + bf/s trivial + rotr r1 // r1 = 1 << 31 + fpchg + mova L1,r0 + xor r1,r4 + fmov.s @r0+,DR40 + lds r4,fpul + fmov.s @r0,DR41 + xor r1,r5 + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + fpchg + +trivial: + rts + lds r4,fpul + + .align 2 +L1: + .double 2147483648 + +#endif /* __SH4A__ */ + + ENDFUNC(GLOBAL(udivsi3_i4)) #endif /* ! 
__SH4__ */ -#endif +#endif /* L_udivsi3_i4 */ #ifdef L_udivsi3 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with Index: gcc/testsuite/gcc.target/sh/pr6526.c =================================================================== --- gcc/testsuite/gcc.target/sh/pr6526.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr6526.c (revision 0) @@ -0,0 +1,64 @@ +/* Check that the XF registers are not clobbered by an integer division + that is done using double precision FPU division. */ +/* { dg-do run { target "sh*-*-*" } } */ +/* { dg-options "-O1 -mdiv=call-fp" } */ +/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */ + +#include <assert.h> +#include <stdlib.h> + +extern void __set_fpscr (int); + +void +write_xf0 (float* f) +{ + __asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory"); +} + +void +read_xf0 (float* f) +{ + __asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory"); +} + +int __attribute__ ((noinline)) +test_00 (int a, int b) +{ + return a / b; +} + +unsigned int __attribute__ ((noinline)) +test_01 (unsigned a, unsigned b) +{ + return a / b; +} + +int __attribute__ ((noinline)) +test_02 (int x) +{ + return x & 0; +} + +int +main (void) +{ + float test_value; + int r = 0; + + /* Set FPSCR.FR to 1. */ + __set_fpscr (0x200000); + + test_value = 123; + write_xf0 (&test_value); + r += test_00 (40, 4); + read_xf0 (&test_value); + assert (test_value == 123); + + test_value = 321; + write_xf0 (&test_value); + r += test_01 (50, 5); + read_xf0 (&test_value); + assert (test_value == 321); + + return test_02 (r); +}