On Tue, Aug 09, 2005 at 02:58:51PM -0700, Richard Henderson wrote: > On Tue, Aug 09, 2005 at 02:30:46PM -0700, H. J. Lu wrote: > > There is a minor problem. How can I add crtfastmath.o for SSE targets > > only? > > You don't. You either add code to detect sse, or you make the > spec depend on -mfpmath=sse. >
Here is the patch to enable FTZ/DAZ for SSE via fast math. There are no regressions on Linux/x86_64 or Linux/ia32. The performance of one FP benchmark on EM64T is more than doubled with -ffast-math. H.J. --- 2005-08-09 H.J. Lu <[EMAIL PROTECTED]> * config.gcc (i[34567]86-*-linux*): Add i386/t-crtfm to tmake_file. (x86_64-*-linux*): Likewise. * config/i386/crtfastmath.c: New file. * config/i386/t-crtfm: Likewise. * config/i386/linux.h (ENDFILE_SPEC): New. * config/i386/linux64.h (ENDFILE_SPEC): Likewise. * config/i386/t-linux64 (EXTRA_MULTILIB_PARTS): Add crtfastmath.o. --- gcc/config.gcc.sse 2005-08-06 07:22:06.000000000 -0700 +++ gcc/config.gcc 2005-08-09 15:09:16.313927259 -0700 @@ -1001,7 +1001,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfree i[34567]86-*-knetbsd*-gnu) tm_file="${tm_file} knetbsd-gnu.h i386/knetbsd-gnu.h" ;; i[34567]86-*-kfreebsd*-gnu) tm_file="${tm_file} kfreebsd-gnu.h i386/kfreebsd-gnu.h" ;; esac - tmake_file="${tmake_file} i386/t-crtstuff" + tmake_file="${tmake_file} i386/t-crtstuff i386/t-crtfm" ;; x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu) tm_file="${tm_file} i386/unix.h i386/att.h dbxelf.h elfos.h svr4.h linux.h \ @@ -1010,7 +1010,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu x86_64-*-kfreebsd*-gnu) tm_file="${tm_file} kfreebsd-gnu.h" ;; x86_64-*-knetbsd*-gnu) tm_file="${tm_file} knetbsd-gnu.h" ;; esac - tmake_file="${tmake_file} i386/t-linux64" + tmake_file="${tmake_file} i386/t-linux64 i386/t-crtfm" ;; i[34567]86-*-gnu*) ;; --- gcc/config/i386/crtfastmath.c.sse 2005-08-09 15:09:39.634095529 -0700 +++ gcc/config/i386/crtfastmath.c 2005-08-09 15:29:01.796141023 -0700 @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2005 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. 
+ * + * In addition to the permissions in the GNU General Public License, the + * Free Software Foundation gives you unlimited permission to link the + * compiled version of this file with other programs, and to distribute + * those programs without any restriction coming from the use of this + * file. (The General Public License restrictions do apply in other + * respects; for example, they cover modification of the file, and + * distribution when not linked into another program.) + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * As a special exception, if you link this library with files + * compiled with GCC to produce an executable, this does not cause + * the resulting executable to be covered by the GNU General Public License. + * This exception does not however invalidate any other reasons why + * the executable file might be covered by the GNU General Public License. + */ + +#define MXCSR_DAZ (1 << 6) /* Enable denormals are zero mode */ +#define MXCSR_FTZ (1 << 15) /* Enable flush to zero mode */ + +static void __attribute__((constructor)) +set_fast_math (void) +{ + /* Check if SSE is available. 
*/ + unsigned int eax, ebx, ecx, edx; + asm volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "0" (1)); + + if (edx & (1 << 25)) + { + unsigned int mxcsr = __builtin_ia32_stmxcsr (); + mxcsr |= MXCSR_DAZ | MXCSR_FTZ; + __builtin_ia32_ldmxcsr (mxcsr); + } +} --- gcc/config/i386/linux.h.sse 2004-11-28 17:04:42.000000000 -0800 +++ gcc/config/i386/linux.h 2005-08-09 14:22:44.554244342 -0700 @@ -121,6 +121,12 @@ Boston, MA 02111-1307, USA. */ %{!dynamic-linker:-dynamic-linker %(dynamic_linker)}} \ %{static:-static}}}" +/* Similar to standard Linux, but adding -ffast-math support. */ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + %{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" + /* A C statement (sans semicolon) to output to the stdio stream FILE the assembler definition of uninitialized global DECL named NAME whose size is SIZE bytes and alignment is ALIGN bytes. --- gcc/config/i386/linux64.h.sse 2004-11-28 17:04:42.000000000 -0800 +++ gcc/config/i386/linux64.h 2005-08-09 15:46:50.138601985 -0700 @@ -64,6 +64,12 @@ Boston, MA 02111-1307, USA. */ %{!m32:%{!dynamic-linker:-dynamic-linker /lib64/ld-linux-x86-64.so.2}}} \ %{static:-static}}" +/* Similar to standard Linux, but adding -ffast-math support. 
*/ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + %{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" + #define MULTILIB_DEFAULTS { "m64" } #undef NEED_INDICATE_EXEC_STACK --- gcc/config/i386/t-crtfm.sse 2005-08-09 15:09:33.236146774 -0700 +++ gcc/config/i386/t-crtfm 2005-08-09 15:53:19.144246878 -0700 @@ -0,0 +1,6 @@ +EXTRA_PARTS += crtfastmath.o + +$(T)crtfastmath.o: $(srcdir)/config/i386/crtfastmath.c $(GCC_PASSES) + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) -msse -c \ + $(srcdir)/config/i386/crtfastmath.c \ + -o $(T)crtfastmath$(objext) --- gcc/config/i386/t-linux64.sse 2003-03-03 12:03:59.000000000 -0800 +++ gcc/config/i386/t-linux64 2005-08-09 15:36:22.796680353 -0700 @@ -11,7 +11,8 @@ MULTILIB_OSDIRNAMES = ../lib64 ../lib LIBGCC = stmp-multilib INSTALL_LIBGCC = install-multilib -EXTRA_MULTILIB_PARTS=crtbegin.o crtend.o crtbeginS.o crtendS.o crtbeginT.o +EXTRA_MULTILIB_PARTS=crtbegin.o crtend.o crtbeginS.o crtendS.o \ + crtbeginT.o crtfastmath.o # The pushl in CTOR initialization interferes with frame pointer elimination. # crtend*.o cannot be compiled without -fno-asynchronous-unwind-tables,