[Changes from version 0 of the patch: - Added a comment to config/sparc/niagara4.md documenting the discrepancy between the documented instruction latency numbers and the implemented ones. - Added comments to sparc_option_override documenting the changes in the cache parameters. - Use the default value of PARAM_L1_CACHE_LINE_SIZE instead of restating it explicitly. - Added a comment to config/sparc/sparc.md to avoid confusion about the use of a vis3_ variable in the <vis3_addsub_ss_patname>v8qi3 insn. - Fixed the ChangeLog entries by adding missing files and using the present tense. - Removed trailing whitespace from the patch. - Removed spurious XXX marks from the patch.]
This patch adds support for -mcpu=niagara7, corresponding to the SPARC M7 CPU as documented in the Oracle SPARC Architecture 2015 and the M7 Processor Supplement. The patch also includes intrinsics support for all the VIS 4.0 instructions. This patch has been tested on the sparc64-*-linux-gnu, sparcv9-*-linux-gnu and sparc-sun-solaris2.11 targets. gcc/ChangeLog: * config/sparc/sparc.md (cpu): Add niagara7 cpu type. Include the M7 SPARC DFA scheduler. New attribute v3pipe. Annotate insns with v3pipe where appropriate. Define cpu_feature vis4. Add lzd instruction type and set it on clzdi_sp64 and clzsi_sp64. Add (V8QI "8") to vbits. Add insns {add,sub}v8qi3. Add insns ss{add,sub}v8qi3. Add insns us{add,sub}{v8qi,v4hi}3. Add insns {min,max}{v8qi,v4hi,v2si}3. Add insns {minu,maxu}{v8qi,v4hi,v2si}3. Add insns fpcmp{le,gt}8_vis. Add insns fpcmpu{le,gt}{16,32}_vis. * config/sparc/niagara4.md: Add a comment explaining the discrepancy between the documented latency numbers and the implemented ones. * config/sparc/niagara7.md: New file. * configure.ac (HAVE_AS_SPARC5_VIS4): Define if the assembler supports SPARC5 and VIS 4.0 instructions. * configure: Regenerate. * config.in: Likewise. * config.gcc: niagara7 is a supported cpu in sparc*-*-* targets. * config/sparc/sol2.h (ASM_CPU32_DEFAULT_SPEC): Set for TARGET_CPU_niagara7. (ASM_CPU64_DEFAULT_SPEC): Likewise. (CPP_CPU_SPEC): Handle niagara7. (ASM_CPU_SPEC): Likewise. * config/sparc/sparc-opts.h (processor_type): Add PROCESSOR_NIAGARA7. * config/sparc/sparc.h (TARGET_CPU_niagara7): Define. (AS_NIAGARA7_FLAG): Define. (ASM_CPU64_DEFAULT_SPEC): Set for niagara7. (CPP_CPU64_DEFAULT_SPEC): Likewise. (CPP_CPU_SPEC): Handle niagara7. (ASM_CPU_SPEC): Likewise. * config/sparc/sparc.c (niagara7_costs): Define. (sparc_option_override): Handle niagara7 and adjust cache-related parameters with better values for niagara cpus. Also support VIS4. (sparc32_initialize_trampoline): Likewise. (sparc64_initialize_trampoline): Likewise. (sparc_use_sched_lookahead): Likewise. (sparc_issue_rate): Likewise. (sparc_register_move_cost): Likewise. (dump_target_flag_bits): Support VIS4. (sparc_vis_init_builtins): Likewise. (sparc_builtins): Likewise. * config/sparc/sparc-c.c (sparc_target_macros): Define __VIS__ for VIS 4.0. * config/sparc/driver-sparc.c (cpu_names): Add SPARC-M7 and UltraSparc M7. * config/sparc/sparc.opt (sparc_processor_type): New value niagara7. (mvis4): New option. * config/sparc/visintrin.h (__attribute__): Prototypes for the VIS4 builtins. * doc/invoke.texi (SPARC Options): Document -mcpu=niagara7 and -mvis4. * doc/extend.texi (SPARC VIS Built-in Functions): Document the VIS4 builtins. gcc/testsuite/ChangeLog: * gcc.target/sparc/vis4misc.c: New file. * gcc.target/sparc/fpcmp.c: Likewise. * gcc.target/sparc/fpcmpu.c: Likewise. diff --git a/gcc/config.gcc b/gcc/config.gcc index ae6e9ae..e47535b 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -4259,7 +4259,7 @@ case "${target}" in | sparclite | f930 | f934 | sparclite86x \ | sparclet | tsc701 \ | v9 | ultrasparc | ultrasparc3 | niagara | niagara2 \ - | niagara3 | niagara4) + | niagara3 | niagara4 | niagara7) # OK ;; *) diff --git a/gcc/config.in b/gcc/config.in index 39d1e75..2deb8ed 100644 --- a/gcc/config.in +++ b/gcc/config.in @@ -628,6 +628,11 @@ #undef HAVE_AS_SPARC4 #endif +/* Define if your assembler supports SPARC5 and VIS 4.0 instructions. */ +#ifndef USED_FOR_TARGET +#undef HAVE_AS_SPARC5_VIS4 +#endif + /* Define if your assembler and linker support GOTDATA_OP relocs.
*/ #ifndef USED_FOR_TARGET diff --git a/gcc/config/sparc/driver-sparc.c b/gcc/config/sparc/driver-sparc.c index 7e9ee24..b81763e 100644 --- a/gcc/config/sparc/driver-sparc.c +++ b/gcc/config/sparc/driver-sparc.c @@ -57,6 +57,7 @@ static const struct cpu_names { { "UltraSPARC-T2+", "niagara2" }, { "SPARC-T3", "niagara3" }, { "SPARC-T4", "niagara4" }, + { "SPARC-M7", "niagara7" }, #else { "SuperSparc", "supersparc" }, { "HyperSparc", "hypersparc" }, @@ -73,6 +74,7 @@ static const struct cpu_names { { "UltraSparc T2", "niagara2" }, { "UltraSparc T3", "niagara3" }, { "UltraSparc T4", "niagara4" }, + { "UltraSparc M7", "niagara7" }, { "LEON", "leon3" }, #endif { NULL, NULL } diff --git a/gcc/config/sparc/niagara4.md b/gcc/config/sparc/niagara4.md index a826fb4..925fc6c 100644 --- a/gcc/config/sparc/niagara4.md +++ b/gcc/config/sparc/niagara4.md @@ -75,6 +75,13 @@ (eq_attr "fptype" "double"))) "n4_slot1") +;; The latency numbers for VIS instructions in the reservations below +;; reflect empirical results, and don't match with the documented +;; latency numbers in the T4 Processor Supplement. This is because +;; the HW chaps didn't feel it necessary to document the complexity in +;; the PRM, and just assigned a latency of 11 to all/most of the VIS +;; instructions. + (define_insn_reservation "n4_vis_move_11cycle" 11 (and (eq_attr "cpu" "niagara4") (and (eq_attr "type" "vismv") diff --git a/gcc/config/sparc/niagara7.md b/gcc/config/sparc/niagara7.md new file mode 100644 index 0000000..56a4edb --- /dev/null +++ b/gcc/config/sparc/niagara7.md @@ -0,0 +1,136 @@ +;; Scheduling description for Niagara-7 +;; Copyright (C) 2016 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. 
+ +(define_automaton "niagara7_0") + +(define_cpu_unit "n7_slot0,n7_slot1,n7_slot2" "niagara7_0") +(define_reservation "n7_single_issue" "n7_slot0 + n7_slot1 + n7_slot2") + +(define_cpu_unit "n7_load_store" "niagara7_0") + +(define_insn_reservation "n7_single" 1 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "multi,savew,flushw,trap")) + "n7_single_issue") + +(define_insn_reservation "n7_iflush" 27 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "iflush")) + "(n7_slot0 | n7_slot1), nothing*26") + +(define_insn_reservation "n7_integer" 1 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "ialu,ialuX,shift,cmove,compare")) + "(n7_slot0 | n7_slot1)") + +(define_insn_reservation "n7_imul" 12 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "imul")) + "n7_slot1, nothing*11") + +(define_insn_reservation "n7_idiv" 35 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "idiv")) + "n7_slot1, nothing*34") + +(define_insn_reservation "n7_load" 5 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "load,fpload,sload")) + "(n7_slot0 + n7_load_store), nothing*4") + +(define_insn_reservation "n7_store" 1 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "store,fpstore")) + "(n7_slot0 | n7_slot2) + n7_load_store") + +(define_insn_reservation "n7_cti" 1 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return")) + "n7_slot1") + +(define_insn_reservation "n7_fp" 11 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul")) + "n7_slot1, nothing*10") + +(define_insn_reservation "n7_array" 12 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "array,edge,edgen")) + "n7_slot1, nothing*11") + +(define_insn_reservation "n7_fpdivs" 24 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "fpdivs,fpsqrts")) + "n7_slot1, nothing*23") + +(define_insn_reservation "n7_fpdivd" 37 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "fpdivd,fpsqrtd")) + "n7_slot1, nothing*36") + +(define_insn_reservation "n7_lzd" 12 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "lzd")) + "(n7_slot0 | n7_slot1), nothing*11") + +;; There is an internal unit called the "V3 pipe", which was originally +;; intended to process some of the short cryptographic instructions. +;; However, starting with the T4, several of the VIS instructions +;; (notably the non-FP ones) were moved to the V3 pipe. +;; Consequently, these instructions feature a latency of 3 instead of +;; 11 or 12 cycles, provided their consumers also execute in the V3 +;; pipe. +;; +;; This is modelled here with a bypass.
+ +(define_insn_reservation "n7_vis_fga" 11 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "fga,gsr")) + "n7_slot1, nothing*10") + +(define_insn_reservation "n7_vis_fgm" 11 + (and (eq_attr "cpu" "niagara7") + (eq_attr "type" "fgm_pack,fgm_mul,pdist")) + "n7_slot1, nothing*10") + +(define_insn_reservation "n7_vis_move_v3pipe" 11 + (and (eq_attr "cpu" "niagara7") + (and (eq_attr "type" "vismv") + (eq_attr "v3pipe" "true"))) + "n7_slot1") + +(define_insn_reservation "n7_vis_move_11cycle" 11 + (and (eq_attr "cpu" "niagara7") + (and (eq_attr "type" "vismv") + (eq_attr "v3pipe" "false"))) + "n7_slot1, nothing*10") + +(define_insn_reservation "n7_vis_logical_v3pipe" 11 + (and (eq_attr "cpu" "niagara7") + (and (eq_attr "type" "visl,pdistn") + (eq_attr "v3pipe" "true"))) + "n7_slot1, nothing*2") + +(define_insn_reservation "n7_vis_logical_11cycle" 11 + (and (eq_attr "cpu" "niagara7") + (and (eq_attr "type" "visl") + (eq_attr "v3pipe" "false"))) + "n7_slot1, nothing*10") + +(define_bypass 3 "*_v3pipe" "*_v3pipe") diff --git a/gcc/config/sparc/sol2.h b/gcc/config/sparc/sol2.h index a54e1ec..2a843c5 100644 --- a/gcc/config/sparc/sol2.h +++ b/gcc/config/sparc/sol2.h @@ -165,13 +165,22 @@ along with GCC; see the file COPYING3. If not see #define ASM_CPU64_DEFAULT_SPEC AS_SPARC64_FLAG AS_NIAGARA4_FLAG #endif +#if TARGET_CPU_DEFAULT == TARGET_CPU_niagara7 +#undef CPP_CPU64_DEFAULT_SPEC +#define CPP_CPU64_DEFAULT_SPEC "" +#undef ASM_CPU32_DEFAULT_SPEC +#define ASM_CPU32_DEFAULT_SPEC AS_SPARC32_FLAG AS_NIAGARA7_FLAG +#undef ASM_CPU64_DEFAULT_SPEC +#define ASM_CPU64_DEFAULT_SPEC AS_SPARC64_FLAG AS_NIAGARA7_FLAG +#endif + #undef CPP_CPU_SPEC #define CPP_CPU_SPEC "\ %{mcpu=sparclet|mcpu=tsc701:-D__sparclet__} \ %{mcpu=sparclite|mcpu-f930|mcpu=f934:-D__sparclite__} \ %{mcpu=v8:" DEF_ARCH32_SPEC("-D__sparcv8") "} \ %{mcpu=supersparc:-D__supersparc__ " DEF_ARCH32_SPEC("-D__sparcv8") "} \ -%{mcpu=v9|mcpu=ultrasparc|mcpu=ultrasparc3|mcpu=niagara|mcpu=niagara2|mcpu=niagara3|mcpu=niagara4:" DEF_ARCH32_SPEC("-D__sparcv8") "} \ +%{mcpu=v9|mcpu=ultrasparc|mcpu=ultrasparc3|mcpu=niagara|mcpu=niagara2|mcpu=niagara3|mcpu=niagara4|mcpu=niagara7:" DEF_ARCH32_SPEC("-D__sparcv8") "} \ %{!mcpu*:%(cpp_cpu_default)} \ " @@ -280,7 +289,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); %{mcpu=niagara2:" DEF_ARCH32_SPEC("-xarch=v8plusb") DEF_ARCH64_SPEC("-xarch=v9b") "} \ %{mcpu=niagara3:" DEF_ARCH32_SPEC("-xarch=v8plus" AS_NIAGARA3_FLAG) DEF_ARCH64_SPEC("-xarch=v9" AS_NIAGARA3_FLAG) "} \ %{mcpu=niagara4:" DEF_ARCH32_SPEC(AS_SPARC32_FLAG AS_NIAGARA4_FLAG) DEF_ARCH64_SPEC(AS_SPARC64_FLAG AS_NIAGARA4_FLAG) "} \ -%{!mcpu=niagara4:%{!mcpu=niagara3:%{!mcpu=niagara2:%{!mcpu=niagara:%{!mcpu=ultrasparc3:%{!mcpu=ultrasparc:%{!mcpu=v9:%{mcpu*:" DEF_ARCH32_SPEC("-xarch=v8") DEF_ARCH64_SPEC("-xarch=v9") "}}}}}}}} \ +%{mcpu=niagara7:" DEF_ARCH32_SPEC(AS_SPARC32_FLAG AS_NIAGARA7_FLAG) DEF_ARCH64_SPEC(AS_SPARC64_FLAG AS_NIAGARA7_FLAG) "} \ +%{!mcpu=niagara7:%{!mcpu=niagara4:%{!mcpu=niagara3:%{!mcpu=niagara2:%{!mcpu=niagara:%{!mcpu=ultrasparc3:%{!mcpu=ultrasparc:%{!mcpu=v9:%{mcpu*:" DEF_ARCH32_SPEC("-xarch=v8") DEF_ARCH64_SPEC("-xarch=v9") "}}}}}}}}} \ %{!mcpu*:%(asm_cpu_default)} \ " diff --git a/gcc/config/sparc/sparc-c.c b/gcc/config/sparc/sparc-c.c index d3fd60e..d9f9c15 100644 --- a/gcc/config/sparc/sparc-c.c +++ b/gcc/config/sparc/sparc-c.c @@ -40,7 +40,12 @@ sparc_target_macros (void) cpp_assert (parse_in, "machine=sparc"); } - if (TARGET_VIS3) + if (TARGET_VIS4) + { + cpp_define (parse_in, "__VIS__=0x400"); + 
cpp_define (parse_in, "__VIS=0x400"); + } + else if (TARGET_VIS3) { cpp_define (parse_in, "__VIS__=0x300"); cpp_define (parse_in, "__VIS=0x300"); diff --git a/gcc/config/sparc/sparc-opts.h b/gcc/config/sparc/sparc-opts.h index ce48f18..40d23a2 100644 --- a/gcc/config/sparc/sparc-opts.h +++ b/gcc/config/sparc/sparc-opts.h @@ -45,6 +45,7 @@ enum processor_type { PROCESSOR_NIAGARA2, PROCESSOR_NIAGARA3, PROCESSOR_NIAGARA4, + PROCESSOR_NIAGARA7, PROCESSOR_NATIVE }; diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 082af3c..1d2ecaa 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -423,6 +423,30 @@ struct processor_costs niagara4_costs = { 0, /* shift penalty */ }; +static const +struct processor_costs niagara7_costs = { + COSTS_N_INSNS (5), /* int load */ + COSTS_N_INSNS (5), /* int signed load */ + COSTS_N_INSNS (5), /* int zeroed load */ + COSTS_N_INSNS (5), /* float load */ + COSTS_N_INSNS (11), /* fmov, fneg, fabs */ + COSTS_N_INSNS (11), /* fadd, fsub */ + COSTS_N_INSNS (11), /* fcmp */ + COSTS_N_INSNS (11), /* fmov, fmovr */ + COSTS_N_INSNS (11), /* fmul */ + COSTS_N_INSNS (24), /* fdivs */ + COSTS_N_INSNS (37), /* fdivd */ + COSTS_N_INSNS (24), /* fsqrts */ + COSTS_N_INSNS (37), /* fsqrtd */ + COSTS_N_INSNS (12), /* imul */ + COSTS_N_INSNS (12), /* imulX */ + 0, /* imul bit factor */ + COSTS_N_INSNS (51), /* idiv, average of 42 - 61 cycle range */ + COSTS_N_INSNS (35), /* idivX, average of 26 - 44 cycle range */ + COSTS_N_INSNS (1), /* movcc/movr */ + 0, /* shift penalty */ +}; + static const struct processor_costs *sparc_costs = &cypress_costs; #ifdef HAVE_AS_RELAX_OPTION @@ -1175,6 +1199,8 @@ dump_target_flag_bits (const int flags) fprintf (stderr, "VIS2 "); if (flags & MASK_VIS3) fprintf (stderr, "VIS3 "); + if (flags & MASK_VIS4) + fprintf (stderr, "VIS4 "); if (flags & MASK_CBCOND) fprintf (stderr, "CBCOND "); if (flags & MASK_DEPRECATED_V8_INSNS) @@ -1238,6 +1264,7 @@ sparc_option_override (void) { TARGET_CPU_niagara2, PROCESSOR_NIAGARA2 }, { TARGET_CPU_niagara3, PROCESSOR_NIAGARA3 }, { TARGET_CPU_niagara4, PROCESSOR_NIAGARA4 }, + { TARGET_CPU_niagara7, PROCESSOR_NIAGARA7 }, { -1, PROCESSOR_V7 } }; const struct cpu_default *def; @@ -1287,6 +1314,9 @@ sparc_option_override (void) /* UltraSPARC T4 */ { "niagara4", MASK_ISA, MASK_V9|MASK_POPC|MASK_VIS2|MASK_VIS3|MASK_FMAF|MASK_CBCOND }, + /* UltraSPARC M7 */ + { "niagara7", MASK_ISA, + MASK_V9|MASK_POPC|MASK_VIS2|MASK_VIS3|MASK_VIS4|MASK_FMAF|MASK_CBCOND }, }; const struct cpu_table *cpu; unsigned int i; @@ -1416,6 +1446,9 @@ sparc_option_override (void) #ifndef HAVE_AS_SPARC4 & ~MASK_CBCOND #endif +#ifndef HAVE_AS_SPARC5_VIS4 + & ~MASK_VIS4 +#endif #ifndef HAVE_AS_LEON & ~(MASK_LEON | MASK_LEON3) #endif @@ -1434,10 +1467,15 @@ sparc_option_override (void) if (TARGET_VIS3) target_flags |= MASK_VIS2 | MASK_VIS; - /* Don't allow -mvis, -mvis2, -mvis3, or -mfmaf if FPU is + /* -mvis4 implies -mvis3, -mvis2 and -mvis. */ + if (TARGET_VIS4) + target_flags |= MASK_VIS3 | MASK_VIS2 | MASK_VIS; + + /* Don't allow -mvis, -mvis2, -mvis3, -mvis4 or -mfmaf if FPU is disabled. */ if (! TARGET_FPU) - target_flags &= ~(MASK_VIS | MASK_VIS2 | MASK_VIS3 | MASK_FMAF); + target_flags &= ~(MASK_VIS | MASK_VIS2 | MASK_VIS3 | MASK_VIS4 | MASK_FMAF); /* -mvis assumes UltraSPARC+, so we are sure v9 instructions are available.
@@ -1471,7 +1509,8 @@ sparc_option_override (void) || sparc_cpu == PROCESSOR_NIAGARA || sparc_cpu == PROCESSOR_NIAGARA2 || sparc_cpu == PROCESSOR_NIAGARA3 - || sparc_cpu == PROCESSOR_NIAGARA4)) + || sparc_cpu == PROCESSOR_NIAGARA4 + || sparc_cpu == PROCESSOR_NIAGARA7)) align_functions = 32; /* Validate PCC_STRUCT_RETURN. */ @@ -1535,6 +1574,9 @@ sparc_option_override (void) case PROCESSOR_NIAGARA4: sparc_costs = &niagara4_costs; break; + case PROCESSOR_NIAGARA7: + sparc_costs = &niagara7_costs; + break; case PROCESSOR_NATIVE: gcc_unreachable (); }; @@ -1566,6 +1608,29 @@ sparc_option_override (void) if (TARGET_DEBUG_OPTIONS) dump_target_flags ("Final target_flags", target_flags); + /* PARAM_SIMULTANEOUS_PREFETCHES is the number of prefetches that + can run at the same time. More importantly, it is the threshold + defining when additional prefetches will be dropped by the + hardware. + + The UltraSPARC-III features a documented prefetch queue with a + size of 8. Additional prefetches issued in the cpu are + dropped. + + Niagara processors are different. In these processors prefetches + are handled much like regular loads. The L1 miss buffer is 32 + entries, but prefetches start getting affected when 30 entries + become occupied. That occupation could be a mix of regular loads + and prefetches though. And that buffer is shared by all threads. + Once the threshold is reached, if the core is running a single + thread the prefetch will retry. If more than one thread is + running, the prefetch will be dropped. + + All this makes it very difficult to determine how many + prefetches can be issued simultaneously, even in a + single-threaded program. Experimental results show that setting + this parameter to 32 works well when the number of threads is not + high. */ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, ((sparc_cpu == PROCESSOR_ULTRASPARC || sparc_cpu == PROCESSOR_NIAGARA @@ -1574,20 +1639,55 @@ sparc_option_override (void) || sparc_cpu == PROCESSOR_NIAGARA4) ? 2 : (sparc_cpu == PROCESSOR_ULTRASPARC3 - ? 8 : 3)), + ? 8 : (sparc_cpu == PROCESSOR_NIAGARA7 + ? 32 : 3))), global_options.x_param_values, global_options_set.x_param_values); - maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, + + /* For PARAM_L1_CACHE_LINE_SIZE we use the default 32 bytes (see + params.def), so no maybe_set_param_value is needed. + + The Oracle SPARC Architecture (previously the UltraSPARC + Architecture) specification states that when a PREFETCH[A] + instruction is executed an implementation-specific amount of data + is prefetched, and that it is at least 64 bytes long (aligned to + at least 64 bytes). + + However, this is not correct. The M7 (and implementations prior + to that) does not guarantee a 64B prefetch into a cache if the + line size is smaller. A single cache line is all that is ever + prefetched. So for the M7, where the L1D$ has 32B lines and the + L2D$ and L3 have 64B lines, a prefetch will prefetch 64B into the + L2 and L3, but only 32B are brought into the L1D$. (Assuming it + is a read_n prefetch, which is the only type that allocates to + the L1.) */ + + /* PARAM_L1_CACHE_SIZE is the size of the L1D$ (most SPARC chips use + Harvard level-1 caches) in kilobytes. Both UltraSPARC and + Niagara processors feature an L1D$ of 16KB.
*/ + maybe_set_param_value (PARAM_L1_CACHE_SIZE, ((sparc_cpu == PROCESSOR_ULTRASPARC || sparc_cpu == PROCESSOR_ULTRASPARC3 || sparc_cpu == PROCESSOR_NIAGARA || sparc_cpu == PROCESSOR_NIAGARA2 || sparc_cpu == PROCESSOR_NIAGARA3 - || sparc_cpu == PROCESSOR_NIAGARA4) - ? 64 : 32), + || sparc_cpu == PROCESSOR_NIAGARA4 + || sparc_cpu == PROCESSOR_NIAGARA7) + ? 16 : 64), global_options.x_param_values, global_options_set.x_param_values); + + /* PARAM_L2_CACHE_SIZE is the size of the L2 in kilobytes. Note + that 512 is the default in params.def. */ + maybe_set_param_value (PARAM_L2_CACHE_SIZE, + (sparc_cpu == PROCESSOR_NIAGARA4 + ? 128 : (sparc_cpu == PROCESSOR_NIAGARA7 + ? 256 : 512)), + global_options.x_param_values, + global_options_set.x_param_values); + + /* Disable save slot sharing for call-clobbered registers by default. The IRA sharing algorithm works on single registers only and this pessimizes for double floating-point registers. */ @@ -9178,7 +9278,8 @@ sparc32_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) && sparc_cpu != PROCESSOR_NIAGARA && sparc_cpu != PROCESSOR_NIAGARA2 && sparc_cpu != PROCESSOR_NIAGARA3 - && sparc_cpu != PROCESSOR_NIAGARA4) + && sparc_cpu != PROCESSOR_NIAGARA4 + && sparc_cpu != PROCESSOR_NIAGARA7) emit_insn (gen_flushsi (validize_mem (adjust_address (m_tramp, SImode, 8)))); /* Call __enable_execute_stack after writing onto the stack to make sure @@ -9223,7 +9324,8 @@ sparc64_initialize_trampoline (rtx m_tramp, rtx fnaddr, rtx cxt) && sparc_cpu != PROCESSOR_NIAGARA && sparc_cpu != PROCESSOR_NIAGARA2 && sparc_cpu != PROCESSOR_NIAGARA3 - && sparc_cpu != PROCESSOR_NIAGARA4) + && sparc_cpu != PROCESSOR_NIAGARA4 + && sparc_cpu != PROCESSOR_NIAGARA7) emit_insn (gen_flushdi (validize_mem (adjust_address (m_tramp, DImode, 8)))); /* Call __enable_execute_stack after writing onto the stack to make sure @@ -9419,7 +9521,8 @@ sparc_use_sched_lookahead (void) || sparc_cpu == PROCESSOR_NIAGARA2 || sparc_cpu == PROCESSOR_NIAGARA3) return 0; - if (sparc_cpu == PROCESSOR_NIAGARA4) + if (sparc_cpu == PROCESSOR_NIAGARA4 + || sparc_cpu == PROCESSOR_NIAGARA7) return 2; if (sparc_cpu == PROCESSOR_ULTRASPARC || sparc_cpu == PROCESSOR_ULTRASPARC3) @@ -9442,6 +9545,7 @@ sparc_issue_rate (void) default: return 1; case PROCESSOR_NIAGARA4: + case PROCESSOR_NIAGARA7: case PROCESSOR_V9: /* Assume V9 processors are capable of at least dual-issue. */ return 2; @@ -10007,6 +10111,34 @@ enum sparc_builtins SPARC_BUILTIN_XMULX, SPARC_BUILTIN_XMULXHI, + /* VIS 4.0 builtins.
*/ + SPARC_BUILTIN_FPADD8, + SPARC_BUILTIN_FPADDS8, + SPARC_BUILTIN_FPADDUS8, + SPARC_BUILTIN_FPADDUS16, + SPARC_BUILTIN_FPCMPLE8, + SPARC_BUILTIN_FPCMPGT8, + SPARC_BUILTIN_FPCMPULE16, + SPARC_BUILTIN_FPCMPUGT16, + SPARC_BUILTIN_FPCMPULE32, + SPARC_BUILTIN_FPCMPUGT32, + SPARC_BUILTIN_FPMAX8, + SPARC_BUILTIN_FPMAX16, + SPARC_BUILTIN_FPMAX32, + SPARC_BUILTIN_FPMAXU8, + SPARC_BUILTIN_FPMAXU16, + SPARC_BUILTIN_FPMAXU32, + SPARC_BUILTIN_FPMIN8, + SPARC_BUILTIN_FPMIN16, + SPARC_BUILTIN_FPMIN32, + SPARC_BUILTIN_FPMINU8, + SPARC_BUILTIN_FPMINU16, + SPARC_BUILTIN_FPMINU32, + SPARC_BUILTIN_FPSUB8, + SPARC_BUILTIN_FPSUBS8, + SPARC_BUILTIN_FPSUBUS8, + SPARC_BUILTIN_FPSUBUS16, + SPARC_BUILTIN_MAX }; @@ -10483,6 +10615,83 @@ sparc_vis_init_builtins (void) def_builtin_const ("__builtin_vis_xmulxhi", CODE_FOR_xmulxhi_vis, SPARC_BUILTIN_XMULXHI, di_ftype_di_di); } + + if (TARGET_VIS4) + { + def_builtin_const ("__builtin_vis_fpadd8", CODE_FOR_addv8qi3, + SPARC_BUILTIN_FPADD8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpadds8", CODE_FOR_ssaddv8qi3, + SPARC_BUILTIN_FPADDS8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpaddus8", CODE_FOR_usaddv8qi3, + SPARC_BUILTIN_FPADDUS8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpaddus16", CODE_FOR_usaddv4hi3, + SPARC_BUILTIN_FPADDUS16, v4hi_ftype_v4hi_v4hi); + + + if (TARGET_ARCH64) + { + def_builtin_const ("__builtin_vis_fpcmple8", CODE_FOR_fpcmple8di_vis, + SPARC_BUILTIN_FPCMPLE8, di_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpcmpgt8", CODE_FOR_fpcmpgt8di_vis, + SPARC_BUILTIN_FPCMPGT8, di_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpcmpule16", CODE_FOR_fpcmpule16di_vis, + SPARC_BUILTIN_FPCMPULE16, di_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpcmpugt16", CODE_FOR_fpcmpugt16di_vis, + SPARC_BUILTIN_FPCMPUGT16, di_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpcmpule32", CODE_FOR_fpcmpule32di_vis, + SPARC_BUILTIN_FPCMPULE32, di_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpcmpugt32", CODE_FOR_fpcmpugt32di_vis, + SPARC_BUILTIN_FPCMPUGT32, di_ftype_v2si_v2si); + } + else + { + def_builtin_const ("__builtin_vis_fpcmple8", CODE_FOR_fpcmple8si_vis, + SPARC_BUILTIN_FPCMPLE8, si_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpcmpgt8", CODE_FOR_fpcmpgt8si_vis, + SPARC_BUILTIN_FPCMPGT8, si_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpcmpule16", CODE_FOR_fpcmpule16si_vis, + SPARC_BUILTIN_FPCMPULE16, si_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpcmpugt16", CODE_FOR_fpcmpugt16si_vis, + SPARC_BUILTIN_FPCMPUGT16, si_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpcmpule32", CODE_FOR_fpcmpule32si_vis, + SPARC_BUILTIN_FPCMPULE32, si_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpcmpugt32", CODE_FOR_fpcmpugt32si_vis, + SPARC_BUILTIN_FPCMPUGT32, si_ftype_v2si_v2si); + } + + def_builtin_const ("__builtin_vis_fpmax8", CODE_FOR_maxv8qi3, + SPARC_BUILTIN_FPMAX8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpmax16", CODE_FOR_maxv4hi3, + SPARC_BUILTIN_FPMAX16, v4hi_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpmax32", CODE_FOR_maxv2si3, + SPARC_BUILTIN_FPMAX32, v2si_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpmaxu8", CODE_FOR_maxuv8qi3, + SPARC_BUILTIN_FPMAXU8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpmaxu16", CODE_FOR_maxuv4hi3, + SPARC_BUILTIN_FPMAXU16, v4hi_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpmaxu32", CODE_FOR_maxuv2si3, + SPARC_BUILTIN_FPMAXU32, 
v2si_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpmin8", CODE_FOR_minv8qi3, + SPARC_BUILTIN_FPMIN8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpmin16", CODE_FOR_minv4hi3, + SPARC_BUILTIN_FPMIN16, v4hi_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpmin32", CODE_FOR_minv2si3, + SPARC_BUILTIN_FPMIN32, v2si_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpminu8", CODE_FOR_minuv8qi3, + SPARC_BUILTIN_FPMINU8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpminu16", CODE_FOR_minuv4hi3, + SPARC_BUILTIN_FPMINU16, v4hi_ftype_v4hi_v4hi); + def_builtin_const ("__builtin_vis_fpminu32", CODE_FOR_minuv2si3, + SPARC_BUILTIN_FPMINU32, v2si_ftype_v2si_v2si); + def_builtin_const ("__builtin_vis_fpsub8", CODE_FOR_subv8qi3, + SPARC_BUILTIN_FPSUB8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpsubs8", CODE_FOR_sssubv8qi3, + SPARC_BUILTIN_FPSUBS8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpsubus8", CODE_FOR_ussubv8qi3, + SPARC_BUILTIN_FPSUBUS8, v8qi_ftype_v8qi_v8qi); + def_builtin_const ("__builtin_vis_fpsubus16", CODE_FOR_ussubv4hi3, + SPARC_BUILTIN_FPSUBUS16, v4hi_ftype_v4hi_v4hi); + } } /* Implement TARGET_BUILTIN_DECL hook. */ @@ -11042,7 +11251,8 @@ sparc_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, || sparc_cpu == PROCESSOR_NIAGARA || sparc_cpu == PROCESSOR_NIAGARA2 || sparc_cpu == PROCESSOR_NIAGARA3 - || sparc_cpu == PROCESSOR_NIAGARA4) + || sparc_cpu == PROCESSOR_NIAGARA4 + || sparc_cpu == PROCESSOR_NIAGARA7) return 12; return 6; diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index ebfe87d..d91496a 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -142,6 +142,7 @@ extern enum cmodel sparc_cmodel; #define TARGET_CPU_niagara2 14 #define TARGET_CPU_niagara3 15 #define TARGET_CPU_niagara4 16 +#define TARGET_CPU_niagara7 19 #if TARGET_CPU_DEFAULT == TARGET_CPU_v9 \ || TARGET_CPU_DEFAULT == TARGET_CPU_ultrasparc \ @@ -149,7 +150,8 @@ extern enum cmodel sparc_cmodel; || TARGET_CPU_DEFAULT == TARGET_CPU_niagara \ || TARGET_CPU_DEFAULT == TARGET_CPU_niagara2 \ || TARGET_CPU_DEFAULT == TARGET_CPU_niagara3 \ - || TARGET_CPU_DEFAULT == TARGET_CPU_niagara4 + || TARGET_CPU_DEFAULT == TARGET_CPU_niagara4 \ + || TARGET_CPU_DEFAULT == TARGET_CPU_niagara7 #define CPP_CPU32_DEFAULT_SPEC "" #define ASM_CPU32_DEFAULT_SPEC "" @@ -186,6 +188,10 @@ extern enum cmodel sparc_cmodel; #define CPP_CPU64_DEFAULT_SPEC "-D__sparc_v9__" #define ASM_CPU64_DEFAULT_SPEC AS_NIAGARA4_FLAG #endif +#if TARGET_CPU_DEFAULT == TARGET_CPU_niagara7 +#define CPP_CPU64_DEFAULT_SPEC "-D__sparc_v9__" +#define ASM_CPU64_DEFAULT_SPEC AS_NIAGARA7_FLAG +#endif #else @@ -288,6 +294,7 @@ extern enum cmodel sparc_cmodel; %{mcpu=niagara2:-D__sparc_v9__} \ %{mcpu=niagara3:-D__sparc_v9__} \ %{mcpu=niagara4:-D__sparc_v9__} \ +%{mcpu=niagara7:-D__sparc_v9__} \ %{!mcpu*:%(cpp_cpu_default)} \ " #define CPP_ARCH32_SPEC "" @@ -339,6 +346,7 @@ extern enum cmodel sparc_cmodel; %{mcpu=niagara2:%{!mv8plus:-Av9b}} \ %{mcpu=niagara3:%{!mv8plus:-Av9" AS_NIAGARA3_FLAG "}} \ %{mcpu=niagara4:%{!mv8plus:" AS_NIAGARA4_FLAG "}} \ +%{mcpu=niagara7:%{!mv8plus:" AS_NIAGARA7_FLAG "}} \ %{!mcpu*:%(asm_cpu_default)} \ " @@ -1777,6 +1785,12 @@ extern int sparc_indent_opcode; #define AS_NIAGARA4_FLAG "-Av9" AS_NIAGARA3_FLAG #endif +#ifdef HAVE_AS_SPARC5_VIS4 +#define AS_NIAGARA7_FLAG "-xarch=sparc5" +#else +#define AS_NIAGARA7_FLAG AS_NIAGARA4_FLAG +#endif + #ifdef HAVE_AS_LEON #define AS_LEON_FLAG "-Aleon" #define AS_LEONV7_FLAG "-Aleon" diff --git 
a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 56d4f63..29e4966 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -234,7 +234,8 @@ niagara, niagara2, niagara3, - niagara4" + niagara4, + niagara7" (const (symbol_ref "sparc_cpu_attr"))) ;; Attribute for the instruction set. @@ -247,7 +248,7 @@ (symbol_ref "TARGET_SPARCLET") (const_string "sparclet")] (const_string "v7")))) -(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3" (const_string "none")) +(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4" (const_string "none")) (define_attr "enabled" "" (cond [(eq_attr "cpu_feature" "none") (const_int 1) @@ -255,7 +256,8 @@ (eq_attr "cpu_feature" "fpunotv9") (symbol_ref "TARGET_FPU && ! TARGET_V9") (eq_attr "cpu_feature" "v9") (symbol_ref "TARGET_V9") (eq_attr "cpu_feature" "vis") (symbol_ref "TARGET_VIS") - (eq_attr "cpu_feature" "vis3") (symbol_ref "TARGET_VIS3")] + (eq_attr "cpu_feature" "vis3") (symbol_ref "TARGET_VIS3") + (eq_attr "cpu_feature" "vis4") (symbol_ref "TARGET_VIS4")] (const_int 0))) ;; Insn type. @@ -274,7 +276,7 @@ fga,visl,vismv,fgm_pack,fgm_mul,pdist,pdistn,edge,edgen,gsr,array, cmove, ialuX, - multi,savew,flushw,iflush,trap" + multi,savew,flushw,iflush,trap,lzd" (const_string "ialu")) ;; True if branch/call has empty delay slot and will emit a nop in it @@ -476,6 +478,10 @@ (const_string "true") ] (const_string "false"))) +;; True if the instruction executes in the V3 pipeline, in M7 and +;; later processors. +(define_attr "v3pipe" "false,true" (const_string "false")) + (define_delay (eq_attr "type" "call") [(eq_attr "in_call_delay" "true") (nil) (nil)]) @@ -504,6 +510,7 @@ (include "niagara.md") (include "niagara2.md") (include "niagara4.md") +(include "niagara7.md") ;; Operand and operator predicates and constraints @@ -1457,6 +1464,7 @@ fzeros\t%0 fones\t%0" [(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl") + (set_attr "v3pipe" "*,*,*,*,true,true,*,*,*,true,true") (set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")]) (define_insn "*movsi_lo_sum" @@ -1622,6 +1630,7 @@ fzero\t%0 fone\t%0" [(set_attr "type" "store,store,store,load,*,*,*,*,fpstore,fpload,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") + (set_attr "v3pipe" "false, false, false, false,false,false,false,false,false,false,false,false,false,false,false,false,false,false, true, true") (set_attr "length" "*,2,*,*,2,2,2,2,*,*,2,2,*,2,2,2,*,*,*,*") (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis")]) @@ -1645,6 +1654,7 @@ fzero\t%0 fone\t%0" [(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl") + (set_attr "v3pipe" "*, *, *, *, *, *, *, *, *, true, true") (set_attr "fptype" "*,*,*,*,*,*,double,*,*,double,double") (set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")]) @@ -2208,6 +2218,7 @@ } } [(set_attr "type" "visl,visl,fpmove,*,*,*,vismv,vismv,fpload,load,fpstore,store") + (set_attr "v3pipe" "true, true, *, *, *, *, true, true, *, *, *, *") (set_attr "cpu_feature" "vis,vis,fpu,*,*,*,vis3,vis3,fpu,*,fpu,*")]) ;; The following 3 patterns build SFmode constants in integer registers. 
@@ -2276,6 +2287,7 @@ # #" [(set_attr "type" "visl,visl,fpmove,*,*,*,fpload,store,fpstore,load,store,*,*,*,*") + (set_attr "v3pipe" "true, true, *, *, *, *, *, *, *, *, *, *, *, *, *") (set_attr "length" "*,*,*,2,2,2,*,*,*,*,*,2,2,2,2") (set_attr "fptype" "double,double,double,*,*,*,*,*,*,*,*,*,*,*,*") (set_attr "cpu_feature" "vis,vis,v9,fpunotv9,vis3,vis3,fpu,v9,fpu,*,*,fpu,*,*,fpu")]) @@ -2299,6 +2311,7 @@ stx\t%r1, %0 #" [(set_attr "type" "visl,visl,fpmove,vismv,vismv,load,store,*,load,store,*") + (set_attr "v3pipe" "true, true, *, *, *, *, *, *, *, *, *") (set_attr "length" "*,*,*,*,*,*,*,*,*,*,2") (set_attr "fptype" "double,double,double,double,double,*,*,*,*,*,*") (set_attr "cpu_feature" "vis,vis,fpu,vis3,vis3,fpu,fpu,*,*,*,*")]) @@ -2980,6 +2993,7 @@ lduw\t%1, %0 movstouw\t%1, %0" [(set_attr "type" "shift,load,*") + (set_attr "v3pipe" "*,*,true") (set_attr "cpu_feature" "*,*,vis3")]) (define_insn_and_split "*zero_extendsidi2_insn_sp32" @@ -3294,6 +3308,7 @@ ldsw\t%1, %0 movstosw\t%1, %0" [(set_attr "type" "shift,sload,*") + (set_attr "v3pipe" "*,*,true") (set_attr "us3load_type" "*,3cycle,*") (set_attr "cpu_feature" "*,*,vis3")]) @@ -6770,7 +6785,8 @@ [(set (match_operand:DI 0 "register_operand" "=r") (clz:DI (match_operand:DI 1 "register_operand" "r")))] "TARGET_VIS3 && TARGET_ARCH64" - "lzd\t%1, %0") + "lzd\t%1, %0" + [(set_attr "type" "lzd")]) (define_insn "clzdi_v8plus" [(set (match_operand:DI 0 "register_operand" "=r") @@ -6811,7 +6827,8 @@ (truncate:SI (clz:DI (match_operand:DI 1 "register_operand" "r"))))] "TARGET_VIS3 && TARGET_ARCH64" - "lzd\t%1, %0") + "lzd\t%1, %0" + [(set_attr "type" "lzd")]) (define_insn "clzsi_v8plus" [(set (match_operand:SI 0 "register_operand" "=r") @@ -7777,7 +7794,7 @@ (define_mode_iterator VM64 [V1DI V2SI V4HI V8QI]) (define_mode_iterator VMALL [V1SI V2HI V4QI V1DI V2SI V4HI V8QI]) -(define_mode_attr vbits [(V2SI "32") (V4HI "16") (V1SI "32s") (V2HI "16s")]) +(define_mode_attr vbits [(V2SI "32") (V4HI "16") (V1SI "32s") (V2HI "16s") (V8QI "8")]) (define_mode_attr vconstr [(V1SI "f") (V2HI "f") (V4QI "f") (V1DI "e") (V2SI "e") (V4HI "e") (V8QI "e")]) (define_mode_attr vfptype [(V1SI "single") (V2HI "single") (V4QI "single") @@ -7812,6 +7829,7 @@ movstouw\t%1, %0 movwtos\t%1, %0" [(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,*,vismv,vismv") + (set_attr "v3pipe" "true,true,true,false,false,false,false,false,false,true,true") (set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,*,vis3,vis3")]) (define_insn "*mov<VM64:mode>_insn_sp64" @@ -7834,6 +7852,7 @@ movxtod\t%1, %0 mov\t%1, %0" [(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,vismv,vismv,*") + (set_attr "v3pipe" "true, true, true, false, false, false, false, false, false, false, false") (set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,vis3,vis3,*")]) (define_insn "*mov<VM64:mode>_insn_sp32" @@ -7857,6 +7876,7 @@ # #" [(set_attr "type" "visl,visl,vismv,*,*,fpload,fpstore,store,load,store,*,*") + (set_attr "v3pipe" "true, true, true, false, false, false, false, false, false, false, false, false") (set_attr "length" "*,*,*,2,2,*,*,*,*,*,2,2") (set_attr "cpu_feature" "vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*")]) @@ -7936,7 +7956,8 @@ "TARGET_VIS" "fp<plusminus_insn><vbits>\t%1, %2, %0" [(set_attr "type" "fga") - (set_attr "fptype" "<vfptype>")]) + (set_attr "fptype" "<vfptype>") + (set_attr "v3pipe" "true")]) (define_mode_iterator VL [V1SI V2HI V4QI V1DI V2SI V4HI V8QI]) (define_mode_attr vlsuf [(V1SI "s") (V2HI "s") (V4QI "s") @@ -7952,6 +7973,7 @@ "TARGET_VIS" 
"f<vlinsn><vlsuf>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) (define_insn "*not_<code><mode>3" @@ -7961,6 +7983,7 @@ "TARGET_VIS" "f<vlninsn><vlsuf>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) ;; (ior (not (op1)) (not (op2))) is the canonical form of NAND. @@ -7971,6 +7994,7 @@ "TARGET_VIS" "fnand<vlsuf>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) (define_code_iterator vlnotop [ior and]) @@ -7982,6 +8006,7 @@ "TARGET_VIS" "f<vlinsn>not1<vlsuf>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) (define_insn "*<code>_not2<mode>_vis" @@ -7991,6 +8016,7 @@ "TARGET_VIS" "f<vlinsn>not2<vlsuf>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) (define_insn "one_cmpl<mode>2" @@ -7999,6 +8025,7 @@ "TARGET_VIS" "fnot1<vlsuf>\t%1, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") (set_attr "fptype" "<vfptype>")]) ;; Hard to generate VIS instructions. We have builtins for these. @@ -8225,7 +8252,8 @@ "TARGET_VIS" "faligndata\t%1, %2, %0" [(set_attr "type" "fga") - (set_attr "fptype" "double")]) + (set_attr "fptype" "double") + (set_attr "v3pipe" "true")]) (define_insn "alignaddrsi_vis" [(set (match_operand:SI 0 "register_operand" "=r") @@ -8235,7 +8263,8 @@ (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))] "TARGET_VIS" "alignaddr\t%r1, %r2, %0" - [(set_attr "type" "gsr")]) + [(set_attr "type" "gsr") + (set_attr "v3pipe" "true")]) (define_insn "alignaddrdi_vis" [(set (match_operand:DI 0 "register_operand" "=r") @@ -8245,7 +8274,8 @@ (plus:DI (match_dup 1) (match_dup 2)))] "TARGET_VIS" "alignaddr\t%r1, %r2, %0" - [(set_attr "type" "gsr")]) + [(set_attr "type" "gsr") + (set_attr "v3pipe" "true")]) (define_insn "alignaddrlsi_vis" [(set (match_operand:SI 0 "register_operand" "=r") @@ -8256,7 +8286,8 @@ (const_int 7)))] "TARGET_VIS" "alignaddrl\t%r1, %r2, %0" - [(set_attr "type" "gsr")]) + [(set_attr "type" "gsr") + (set_attr "v3pipe" "true")]) (define_insn "alignaddrldi_vis" [(set (match_operand:DI 0 "register_operand" "=r") @@ -8267,7 +8298,8 @@ (const_int 7)))] "TARGET_VIS" "alignaddrl\t%r1, %r2, %0" - [(set_attr "type" "gsr")]) + [(set_attr "type" "gsr") + (set_attr "v3pipe" "true")]) (define_insn "pdist_vis" [(set (match_operand:DI 0 "register_operand" "=e") @@ -8360,6 +8392,17 @@ "TARGET_VIS" "fcmp<code><GCM:gcm_name>\t%1, %2, %0" [(set_attr "type" "visl") + (set_attr "v3pipe" "true") + (set_attr "fptype" "double")]) + +(define_insn "fpcmp<code>8<P:mode>_vis" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P [(gcond:V8QI (match_operand:V8QI 1 "register_operand" "e") + (match_operand:V8QI 2 "register_operand" "e"))] + UNSPEC_FCMP))] + "TARGET_VIS4" + "fpcmp<code>8\t%1, %2, %0" + [(set_attr "type" "visl") (set_attr "fptype" "double")]) (define_expand "vcond<mode><mode>" @@ -8427,7 +8470,8 @@ (plus:DI (match_dup 1) (match_dup 2)))] "TARGET_VIS2" "bmask\t%r1, %r2, %0" - [(set_attr "type" "array")]) + [(set_attr "type" "array") + (set_attr "v3pipe" "true")]) (define_insn "bmasksi_vis" [(set (match_operand:SI 0 "register_operand" "=r") @@ -8437,7 +8481,8 @@ (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))] "TARGET_VIS2" "bmask\t%r1, %r2, %0" - [(set_attr "type" "array")]) + [(set_attr "type" "array") + (set_attr "v3pipe" "true")]) (define_insn "bshuffle<VM64:mode>_vis" [(set (match_operand:VM64 0 
"register_operand" "=e") @@ -8448,7 +8493,8 @@ "TARGET_VIS2" "bshuffle\t%1, %2, %0" [(set_attr "type" "fga") - (set_attr "fptype" "double")]) + (set_attr "fptype" "double") + (set_attr "v3pipe" "true")]) ;; The rtl expanders will happily convert constant permutations on other ;; modes down to V8QI. Rely on this to avoid the complexity of the byte @@ -8550,7 +8596,8 @@ UNSPEC_CMASK8))] "TARGET_VIS3" "cmask8\t%r0" - [(set_attr "type" "fga")]) + [(set_attr "type" "fga") + (set_attr "v3pipe" "true")]) (define_insn "cmask16<P:mode>_vis" [(set (reg:DI GSR_REG) @@ -8559,7 +8606,8 @@ UNSPEC_CMASK16))] "TARGET_VIS3" "cmask16\t%r0" - [(set_attr "type" "fga")]) + [(set_attr "type" "fga") + (set_attr "v3pipe" "true")]) (define_insn "cmask32<P:mode>_vis" [(set (reg:DI GSR_REG) @@ -8568,7 +8616,8 @@ UNSPEC_CMASK32))] "TARGET_VIS3" "cmask32\t%r0" - [(set_attr "type" "fga")]) + [(set_attr "type" "fga") + (set_attr "v3pipe" "true")]) (define_insn "fchksm16_vis" [(set (match_operand:V4HI 0 "register_operand" "=e") @@ -8601,6 +8650,7 @@ "TARGET_VIS3" "pdistn\t%1, %2, %0" [(set_attr "type" "pdistn") + (set_attr "v3pipe" "true") (set_attr "fptype" "double")]) (define_insn "fmean16_vis" @@ -8628,6 +8678,14 @@ "fp<plusminus_insn>64\t%1, %2, %0" [(set_attr "type" "fga")]) +(define_insn "<plusminus_insn>v8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=e") + (plusminus:V8QI (match_operand:V8QI 1 "register_operand" "e") + (match_operand:V8QI 2 "register_operand" "e")))] + "TARGET_VIS4" + "fp<plusminus_insn>8\t%1, %2, %0" + [(set_attr "type" "fga")]) + (define_mode_iterator VASS [V4HI V2SI V2HI V1SI]) (define_code_iterator vis3_addsub_ss [ss_plus ss_minus]) (define_code_attr vis3_addsub_ss_insn @@ -8641,8 +8699,63 @@ (match_operand:VASS 2 "register_operand" "<vconstr>")))] "TARGET_VIS3" "<vis3_addsub_ss_insn><vbits>\t%1, %2, %0" + [(set_attr "type" "fga") + (set_attr "v3pipe" "true")]) + +(define_mode_iterator VMMAX [V8QI V4HI V2SI]) +(define_code_iterator vis4_minmax [smin smax]) +(define_code_attr vis4_minmax_insn + [(smin "fpmin") (smax "fpmax")]) +(define_code_attr vis4_minmax_patname + [(smin "min") (smax "max")]) + +(define_insn "<vis4_minmax_patname><mode>3" + [(set (match_operand:VMMAX 0 "register_operand" "=<vconstr>") + (vis4_minmax:VMMAX (match_operand:VMMAX 1 "register_operand" "<vconstr>") + (match_operand:VMMAX 2 "register_operand" "<vconstr>")))] + "TARGET_VIS4" + "<vis4_minmax_insn><vbits>\t%1, %2, %0" + [(set_attr "type" "fga")]) + +(define_code_iterator vis4_uminmax [umin umax]) +(define_code_attr vis4_uminmax_insn + [(umin "fpminu") (umax "fpmaxu")]) +(define_code_attr vis4_uminmax_patname + [(umin "minu") (umax "maxu")]) + +(define_insn "<vis4_uminmax_patname><mode>3" + [(set (match_operand:VMMAX 0 "register_operand" "=<vconstr>") + (vis4_uminmax:VMMAX (match_operand:VMMAX 1 "register_operand" "<vconstr>") + (match_operand:VMMAX 2 "register_operand" "<vconstr>")))] + "TARGET_VIS4" + "<vis4_uminmax_insn><vbits>\t%1, %2, %0" [(set_attr "type" "fga")]) +;; The use of vis3_addsub_ss_patname in the VIS4 instruction below is +;; intended. 
+(define_insn "<vis3_addsub_ss_patname>v8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=e") + (vis3_addsub_ss:V8QI (match_operand:V8QI 1 "register_operand" "e") + (match_operand:V8QI 2 "register_operand" "e")))] + "TARGET_VIS4" + "<vis3_addsub_ss_insn>8\t%1, %2, %0" + [(set_attr "type" "fga")]) + +(define_mode_iterator VAUS [V4HI V8QI]) +(define_code_iterator vis4_addsub_us [us_plus us_minus]) +(define_code_attr vis4_addsub_us_insn + [(us_plus "fpaddus") (us_minus "fpsubus")]) +(define_code_attr vis4_addsub_us_patname + [(us_plus "usadd") (us_minus "ussub")]) + +(define_insn "<vis4_addsub_us_patname><mode>3" + [(set (match_operand:VAUS 0 "register_operand" "=<vconstr>") + (vis4_addsub_us:VAUS (match_operand:VAUS 1 "register_operand" "<vconstr>") + (match_operand:VAUS 2 "register_operand" "<vconstr>")))] + "TARGET_VIS4" + "<vis4_addsub_us_insn><vbits>\t%1, %2, %0" + [(set_attr "type" "fga")]) + (define_insn "fucmp<code>8<P:mode>_vis" [(set (match_operand:P 0 "register_operand" "=r") (unspec:P [(gcond:V8QI (match_operand:V8QI 1 "register_operand" "e") @@ -8650,7 +8763,18 @@ UNSPEC_FUCMP))] "TARGET_VIS3" "fucmp<code>8\t%1, %2, %0" - [(set_attr "type" "visl")]) + [(set_attr "type" "visl") + (set_attr "v3pipe" "true")]) + +(define_insn "fpcmpu<code><GCM:gcm_name><P:mode>_vis" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P [(gcond:GCM (match_operand:GCM 1 "register_operand" "e") + (match_operand:GCM 2 "register_operand" "e"))] + UNSPEC_FUCMP))] + "TARGET_VIS4" + "fpcmpu<code><GCM:gcm_name>\t%1, %2, %0" + [(set_attr "type" "visl") + (set_attr "fptype" "double")]) (define_insn "*naddsf3" [(set (match_operand:SF 0 "register_operand" "=f") diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt index 25eaa1a..13d4151 100644 --- a/gcc/config/sparc/sparc.opt +++ b/gcc/config/sparc/sparc.opt @@ -73,6 +73,10 @@ mvis3 Target Report Mask(VIS3) Use UltraSPARC Visual Instruction Set version 3.0 extensions. +mvis4 +Target Report Mask(VIS4) +Use UltraSPARC Visual Instruction Set version 4.0 extensions. + mcbcond Target Report Mask(CBCOND) Use UltraSPARC Compare-and-Branch extensions. @@ -194,6 +198,9 @@ Enum(sparc_processor_type) String(niagara3) Value(PROCESSOR_NIAGARA3) EnumValue Enum(sparc_processor_type) String(niagara4) Value(PROCESSOR_NIAGARA4) +EnumValue +Enum(sparc_processor_type) String(niagara7) Value(PROCESSOR_NIAGARA7) + mcmodel= Target RejectNegative Joined Var(sparc_cmodel_string) Use given SPARC-V9 code model. 
diff --git a/gcc/config/sparc/visintrin.h b/gcc/config/sparc/visintrin.h index 51ef739..8b80cc1 100644 --- a/gcc/config/sparc/visintrin.h +++ b/gcc/config/sparc/visintrin.h @@ -704,6 +704,192 @@ __vis_xmulxhi (__i64 __A, __i64 __B) #endif /* __VIS__ >= 0x300 */ +#if __VIS__ >= 0x400 + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpadd8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpadd8 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpadds8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpadds8 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpaddus8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpaddus8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpaddus16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpaddus16 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmple8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpcmple8 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmpgt8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpcmpgt8 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmpule16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpcmpule16 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmpugt16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpcmpugt16 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmpule32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpcmpule32 (__A, __B); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpcmpugt32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpcmpugt32 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmax8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpmax8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmax16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpmax16 (__A, __B); +} + +extern __inline __v2si +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmax32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpmax32 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmaxu8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpmaxu8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmaxu16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpmaxu16 (__A, __B); +} + +extern __inline __v2si +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmaxu32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpmaxu32 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmin8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpmin8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmin16 (__v4hi __A, 
__v4hi __B) +{ + return __builtin_vis_fpmin16 (__A, __B); +} + +extern __inline __v2si +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpmin32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpmin32 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpminu8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpminu8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpminu16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpminu16 (__A, __B); +} + +extern __inline __v2si +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpminu32 (__v2si __A, __v2si __B) +{ + return __builtin_vis_fpminu32 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpsub8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpsub8 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpsubs8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpsubs8 (__A, __B); +} + +extern __inline __v8qi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpsubus8 (__v8qi __A, __v8qi __B) +{ + return __builtin_vis_fpsubus8 (__A, __B); +} + +extern __inline __v4hi +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__vis_fpsubus16 (__v4hi __A, __v4hi __B) +{ + return __builtin_vis_fpsubus16 (__A, __B); +} + +#endif /* __VIS__ >= 0x400 */ + #endif /* __VIS__ */ #endif /* _VISINTRIN_H_INCLUDED */ diff --git a/gcc/configure b/gcc/configure index 9e5cd64..98bb074 100755 --- a/gcc/configure +++ b/gcc/configure @@ -25124,6 +25124,43 @@ $as_echo "#define HAVE_AS_SPARC4 1" >>confdefs.h fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for SPARC5 and VIS 4.0 instructions" >&5 +$as_echo_n "checking assembler for SPARC5 and VIS 4.0 instructions... " >&6; } +if test "${gcc_cv_as_sparc_sparc5+set}" = set; then : + $as_echo_n "(cached) " >&6 +else + gcc_cv_as_sparc_sparc5=no + if test x$gcc_cv_as != x; then + $as_echo '.text + .register %g2, #scratch + .register %g3, #scratch + .align 4 + subxc %g1, %g2, %g3 + fpadd8 %f0, %f2, %f4' > conftest.s + if { ac_try='$gcc_cv_as $gcc_cv_as_flags -xarch=sparc5 -o conftest.o conftest.s >&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } + then + gcc_cv_as_sparc_sparc5=yes + else + echo "configure: failed program was" >&5 + cat conftest.s >&5 + fi + rm -f conftest.o conftest.s + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_sparc_sparc5" >&5 +$as_echo "$gcc_cv_as_sparc_sparc5" >&6; } +if test $gcc_cv_as_sparc_sparc5 = yes; then + +$as_echo "#define HAVE_AS_SPARC5_VIS4 1" >>confdefs.h + +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for LEON instructions" >&5 $as_echo_n "checking assembler for LEON instructions... 
" >&6; } if test "${gcc_cv_as_sparc_leon+set}" = set; then : diff --git a/gcc/configure.ac b/gcc/configure.ac index 046c94e..6607e76 100644 --- a/gcc/configure.ac +++ b/gcc/configure.ac @@ -3928,6 +3928,18 @@ foo: [AC_DEFINE(HAVE_AS_SPARC4, 1, [Define if your assembler supports SPARC4 instructions.])]) + gcc_GAS_CHECK_FEATURE([SPARC5 and VIS 4.0 instructions], + gcc_cv_as_sparc_sparc5,, + [-xarch=sparc5], + [.text + .register %g2, #scratch + .register %g3, #scratch + .align 4 + subxc %g1, %g2, %g3 + fpadd8 %f0, %f2, %f4],, + [AC_DEFINE(HAVE_AS_SPARC5_VIS4, 1, + [Define if your assembler supports SPARC5 and VIS 4.0 instructions.])]) + gcc_GAS_CHECK_FEATURE([LEON instructions], gcc_cv_as_sparc_leon,, [-Aleon], diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 2d4f028..edd3af5 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -18056,6 +18056,45 @@ int64_t __builtin_vis_xmulx (int64_t, int64_t); int64_t __builtin_vis_xmulxhi (int64_t, int64_t); @end smallexample +When you use the @option{-mvis4} switch, the VIS version 4.0 built-in +functions also become available: + +@smallexample +v8qi __builtin_vis_fpadd8 (v8qi, v8qi); +v8qi __builtin_vis_fpadds8 (v8qi, v8qi); +v8qi __builtin_vis_fpaddus8 (v8qi, v8qi); +v4hi __builtin_vis_fpaddus16 (v4hi, v4hi); + +v8qi __builtin_vis_fpsub8 (v8qi, v8qi); +v8qi __builtin_vis_fpsubs8 (v8qi, v8qi); +v8qi __builtin_vis_fpsubus8 (v8qi, v8qi); +v4hi __builtin_vis_fpsubus16 (v4hi, v4hi); + +long __builtin_vis_fpcmple8 (v8qi, v8qi); +long __builtin_vis_fpcmpgt8 (v8qi, v8qi); +long __builtin_vis_fpcmpule16 (v4hi, v4hi); +long __builtin_vis_fpcmpugt16 (v4hi, v4hi); +long __builtin_vis_fpcmpule32 (v2si, v2si); +long __builtin_vis_fpcmpugt32 (v2si, v2si); + +v8qi __builtin_vis_fpmax8 (v8qi, v8qi); +v4hi __builtin_vis_fpmax16 (v4hi, v4hi); +v2si __builtin_vis_fpmax32 (v2si, v2si); + +v8qi __builtin_vis_fpmaxu8 (v8qi, v8qi); +v4hi __builtin_vis_fpmaxu16 (v4hi, v4hi); +v2si __builtin_vis_fpmaxu32 (v2si, v2si); + + +v8qi __builtin_vis_fpmin8 (v8qi, v8qi); +v4hi __builtin_vis_fpmin16 (v4hi, v4hi); +v2si __builtin_vis_fpmin32 (v2si, v2si); + +v8qi __builtin_vis_fpminu8 (v8qi, v8qi); +v4hi __builtin_vis_fpminu16 (v4hi, v4hi); +v2si __builtin_vis_fpminu32 (v2si, v2si); +@end smallexample + @node SPU Built-in Functions @subsection SPU Built-in Functions diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ce162a0..f5a9a6d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -22175,7 +22175,7 @@ for machine type @var{cpu_type}. Supported values for @var{cpu_type} are @samp{leon}, @samp{leon3}, @samp{leon3v7}, @samp{sparclite}, @samp{f930}, @samp{f934}, @samp{sparclite86x}, @samp{sparclet}, @samp{tsc701}, @samp{v9}, @samp{ultrasparc}, @samp{ultrasparc3}, @samp{niagara}, @samp{niagara2}, -@samp{niagara3} and @samp{niagara4}. +@samp{niagara3}, @samp{niagara4} and @samp{niagara7}. Native Solaris and GNU/Linux toolchains also support the value @samp{native}, which selects the best architecture option for the host processor. @@ -22203,7 +22203,7 @@ f930, f934, sparclite86x tsc701 @item v9 -ultrasparc, ultrasparc3, niagara, niagara2, niagara3, niagara4 +ultrasparc, ultrasparc3, niagara, niagara2, niagara3, niagara4, niagara7 @end table By default (unless configured otherwise), GCC generates code for the V7 @@ -22245,7 +22245,9 @@ Sun UltraSPARC T1 chips. With @option{-mcpu=niagara2}, the compiler additionally optimizes it for Sun UltraSPARC T2 chips. With @option{-mcpu=niagara3}, the compiler additionally optimizes it for Sun UltraSPARC T3 chips. 
With @option{-mcpu=niagara4}, the compiler -additionally optimizes it for Sun UltraSPARC T4 chips. +additionally optimizes it for Sun UltraSPARC T4 chips. With +@option{-mcpu=niagara7}, the compiler additionally optimizes it for +Oracle SPARC M7 chips. @item -mtune=@var{cpu_type} @opindex mtune @@ -22255,12 +22257,13 @@ option @option{-mcpu=@var{cpu_type}} does. The same values for @option{-mcpu=@var{cpu_type}} can be used for @option{-mtune=@var{cpu_type}}, but the only useful values are those -that select a particular CPU implementation. Those are @samp{cypress}, -@samp{supersparc}, @samp{hypersparc}, @samp{leon}, @samp{leon3}, -@samp{leon3v7}, @samp{f930}, @samp{f934}, @samp{sparclite86x}, @samp{tsc701}, -@samp{ultrasparc}, @samp{ultrasparc3}, @samp{niagara}, @samp{niagara2}, -@samp{niagara3} and @samp{niagara4}. With native Solaris and GNU/Linux -toolchains, @samp{native} can also be used. +that select a particular CPU implementation. Those are +@samp{cypress}, @samp{supersparc}, @samp{hypersparc}, @samp{leon}, +@samp{leon3}, @samp{leon3v7}, @samp{f930}, @samp{f934}, +@samp{sparclite86x}, @samp{tsc701}, @samp{ultrasparc}, +@samp{ultrasparc3}, @samp{niagara}, @samp{niagara2}, @samp{niagara3}, +@samp{niagara4} and @samp{niagara7}. With native Solaris and +GNU/Linux toolchains, @samp{native} can also be used. @item -mv8plus @itemx -mno-v8plus @@ -22298,6 +22301,16 @@ default is @option{-mvis3} when targeting a cpu that supports such instructions, such as niagara-3 and later. Setting @option{-mvis3} also sets @option{-mvis2} and @option{-mvis}. +@item -mvis4 +@itemx -mno-vis4 +@opindex mvis4 +@opindex mno-vis4 +With @option{-mvis4}, GCC generates code that takes advantage of +version 4.0 of the UltraSPARC Visual Instruction Set extensions. The +default is @option{-mvis4} when targeting a cpu that supports such +instructions, such as niagara-7 and later. Setting @option{-mvis4} +also sets @option{-mvis3}, @option{-mvis2} and @option{-mvis}. 
+ @item -mcbcond @itemx -mno-cbcond @opindex mcbcond diff --git a/gcc/testsuite/gcc.target/sparc/fpcmp.c b/gcc/testsuite/gcc.target/sparc/fpcmp.c new file mode 100644 index 0000000..1255d67 --- /dev/null +++ b/gcc/testsuite/gcc.target/sparc/fpcmp.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mvis4" } */ + +typedef unsigned char vec8 __attribute__((vector_size(8))); + +long test_fpcmple8 (vec8 a, vec8 b) +{ + return __builtin_vis_fpcmple8 (a, b); +} + +long test_fpcmpgt8 (vec8 a, vec8 b) +{ + return __builtin_vis_fpcmpgt8 (a, b); +} + +/* { dg-final { scan-assembler "fpcmple8\t%" } } */ +/* { dg-final { scan-assembler "fpcmpgt8\t%" } } */ + diff --git a/gcc/testsuite/gcc.target/sparc/fpcmpu.c b/gcc/testsuite/gcc.target/sparc/fpcmpu.c new file mode 100644 index 0000000..816a22d --- /dev/null +++ b/gcc/testsuite/gcc.target/sparc/fpcmpu.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-mvis4" } */ + + +typedef short vec16 __attribute__((vector_size(8))); +typedef int vec32 __attribute__((vector_size(8))); + +long test_fpcmpule16 (vec16 a, vec16 b) +{ + return __builtin_vis_fpcmpule16 (a, b); +} + +long test_fpcmpugt16 (vec16 a, vec16 b) +{ + return __builtin_vis_fpcmpugt16 (a, b); +} + +long test_fpcmpule32 (vec32 a, vec32 b) +{ + return __builtin_vis_fpcmpule32 (a, b); +} + +long test_fpcmpugt32 (vec32 a, vec32 b) +{ + return __builtin_vis_fpcmpugt32 (a, b); +} + +/* { dg-final { scan-assembler "fpcmpule16\t%" } } */ +/* { dg-final { scan-assembler "fpcmpugt16\t%" } } */ +/* { dg-final { scan-assembler "fpcmpule32\t%" } } */ +/* { dg-final { scan-assembler "fpcmpugt32\t%" } } */ diff --git a/gcc/testsuite/gcc.target/sparc/vis4misc.c b/gcc/testsuite/gcc.target/sparc/vis4misc.c new file mode 100644 index 0000000..b520b12 --- /dev/null +++ b/gcc/testsuite/gcc.target/sparc/vis4misc.c @@ -0,0 +1,126 @@ +/* { dg-do compile } */ +/* { dg-options "-mvis4" } */ +typedef int __v2si __attribute__((vector_size(8))); +typedef short __v4hi __attribute__((vector_size(8))); +typedef unsigned char __v8qi __attribute__((vector_size(8))); + +__v8qi test_fpadd8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpadd8 (x, y); +} + +__v8qi test_fpadds8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpadds8 (x, y); +} + +__v8qi test_fpaddus8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpaddus8 (x, y); +} + +__v4hi test_fpaddus16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpaddus16 (x, y); +} + +__v8qi test_fpsub8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpsub8 (x, y); +} + +__v8qi test_fpsubs8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpsubs8 (x, y); +} + +__v8qi test_fpsubus8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpsubus8 (x, y); +} + +__v4hi test_fpsubus16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpsubus16 (x, y); +} + +__v8qi test_fpmax8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpmax8 (x, y); +} + +__v4hi test_fpmax16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpmax16 (x, y); +} + +__v2si test_fpmax32 (__v2si x, __v2si y) +{ + return __builtin_vis_fpmax32 (x, y); +} + +__v8qi test_fpmaxu8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpmaxu8 (x, y); +} + +__v4hi test_fpmaxu16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpmaxu16 (x, y); +} + +__v2si test_fpmaxu32 (__v2si x, __v2si y) +{ + return __builtin_vis_fpmaxu32 (x, y); +} + +__v8qi test_fpmin8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpmin8 (x, y); +} + +__v4hi test_fpmin16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpmin16 (x, y); +} + +__v2si test_fpmin32 (__v2si x, 
__v2si y) +{ + return __builtin_vis_fpmin32 (x, y); +} + +__v8qi test_fpminu8 (__v8qi x, __v8qi y) +{ + return __builtin_vis_fpminu8 (x, y); +} + +__v4hi test_fpminu16 (__v4hi x, __v4hi y) +{ + return __builtin_vis_fpminu16 (x, y); +} + +__v2si test_fpminu32 (__v2si x, __v2si y) +{ + return __builtin_vis_fpminu32 (x, y); +} + +/* { dg-final { scan-assembler "fpadd8\t%" } } */ +/* { dg-final { scan-assembler "fpadds8\t%" } } */ +/* { dg-final { scan-assembler "fpaddus8\t%" } } */ +/* { dg-final { scan-assembler "fpaddus16\t%" } } */ +/* { dg-final { scan-assembler "fpsub8\t%" } } */ +/* { dg-final { scan-assembler "fpsubs8\t%" } } */ +/* { dg-final { scan-assembler "fpsubus8\t%" } } */ +/* { dg-final { scan-assembler "fpsubus16\t%" } } */ +/* { dg-final { scan-assembler "fpmax8\t%" } } */ +/* { dg-final { scan-assembler "fpmax16\t%" } } */ +/* { dg-final { scan-assembler "fpmax32\t%" } } */ +/* { dg-final { scan-assembler "fpmaxu8\t%" } } */ +/* { dg-final { scan-assembler "fpmaxu16\t%" } } */ +/* { dg-final { scan-assembler "fpmaxu32\t%" } } */ +/* { dg-final { scan-assembler "fpmin8\t%" } } */ +/* { dg-final { scan-assembler "fpmin16\t%" } } */ +/* { dg-final { scan-assembler "fpmin32\t%" } } */ +/* { dg-final { scan-assembler "fpminu8\t%" } } */ +/* { dg-final { scan-assembler "fpminu16\t%" } } */ +/* { dg-final { scan-assembler "fpminu32\t%" } } */