> While it could be possible to rework output_set_got such that we can
> individually annotate the instructions, it's simpler to just
> admit that all processors currently being manufactured do want
> deep branch prediction.  At which point all of the complication
> simply goes away.

Note that most modern CPUs special-case a call to the next instruction, so
they will work well with !X86_TUNE_DEEP_BRANCH_PREDICTION code.

Honza
> ---
>  gcc/config/i386/i386.c |  105 +++++++----------------------------------------
>  gcc/config/i386/i386.h |    3 -
>  2 files changed, 16 insertions(+), 92 deletions(-)
> 
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 014401b..332e65b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -55,7 +55,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "params.h"
>  #include "cselib.h"
>  #include "debug.h"
> -#include "dwarf2out.h"
>  #include "sched-int.h"
>  #include "sbitmap.h"
>  #include "fibheap.h"
> @@ -1847,10 +1846,6 @@ static unsigned int 
> initial_ix86_tune_features[X86_TUNE_LAST] = {
>    m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
>    | m_CORE2I7 | m_GENERIC,
>  
> -  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
> -  m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
> -  | m_CORE2I7 | m_GENERIC,
> -
>    /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
>       on simulation result. But after P4 was made, no performance benefit
>       was observed with branch hints.  It also increases the code size.
> @@ -8323,31 +8318,11 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  
>    xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
>  
> -  if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
> +  if (!flag_pic)
>      {
>        xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
>  
> -      if (!flag_pic)
> -     output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
> -      else
> -     {
> -       output_asm_insn ("call\t%a2", xops);
> -#ifdef DWARF2_UNWIND_INFO
> -       /* The call to next label acts as a push.  */
> -       if (dwarf2out_do_frame ())
> -         {
> -           rtx insn;
> -           start_sequence ();
> -           insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> -                                          gen_rtx_PLUS (Pmode,
> -                                                        stack_pointer_rtx,
> -                                                        GEN_INT (-4))));
> -           RTX_FRAME_RELATED_P (insn) = 1;
> -           dwarf2out_frame_debug (insn, true);
> -           end_sequence ();
> -         }
> -#endif
> -     }
> +      output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
>  
>  #if TARGET_MACHO
>        /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
> @@ -8358,29 +8333,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  
>        targetm.asm_out.internal_label (asm_out_file, "L",
>                                     CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
> -
> -      if (flag_pic)
> -     {
> -       output_asm_insn ("pop%z0\t%0", xops);
> -#ifdef DWARF2_UNWIND_INFO
> -       /* The pop is a pop and clobbers dest, but doesn't restore it
> -          for unwind info purposes.  */
> -       if (dwarf2out_do_frame ())
> -         {
> -           rtx insn;
> -           start_sequence ();
> -           insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
> -           dwarf2out_frame_debug (insn, true);
> -           insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> -                                          gen_rtx_PLUS (Pmode,
> -                                                        stack_pointer_rtx,
> -                                                        GEN_INT (4))));
> -           RTX_FRAME_RELATED_P (insn) = 1;
> -           dwarf2out_frame_debug (insn, true);
> -           end_sequence ();
> -         }
> -#endif
> -     }
>      }
>    else
>      {
> @@ -8388,12 +8340,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>        get_pc_thunk_name (name, REGNO (dest));
>        pic_labels_used |= 1 << REGNO (dest);
>  
> -#ifdef DWARF2_UNWIND_INFO
> -      /* Ensure all queued register saves are flushed before the
> -      call.  */
> -      if (dwarf2out_do_frame ())
> -     dwarf2out_flush_queued_reg_saves ();
> -#endif
>        xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
>        xops[2] = gen_rtx_MEM (QImode, xops[2]);
>        output_asm_insn ("call\t%X2", xops);
> @@ -8408,13 +8354,8 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  #endif
>      }
>  
> -  if (TARGET_MACHO)
> -    return "";
> -
> -  if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
> +  if (!TARGET_MACHO)
>      output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
> -  else
> -    output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
>  
>    return "";
>  }
> @@ -10138,7 +10079,11 @@ ix86_expand_prologue (void)
>              insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
>       }
>        else
> -        insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> +     {
> +          insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> +       RTX_FRAME_RELATED_P (insn) = 1;
> +       add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
> +     }
>      }
>  
>    /* In the pic_reg_used case, make sure that the got load isn't deleted
> @@ -28979,12 +28924,7 @@ machopic_output_stub (FILE *file, const char *symb, 
> const char *stub)
>    if (MACHOPIC_ATT_STUB)
>      switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
>    else if (MACHOPIC_PURE)
> -    {
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -     switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
> -      else
> -    switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
> -    }
> +    switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
>    else
>      switch_to_section (darwin_sections[machopic_symbol_stub_section]);
>  
> @@ -28998,19 +28938,11 @@ machopic_output_stub (FILE *file, const char *symb, 
> const char *stub)
>    else if (MACHOPIC_PURE)
>      {
>        /* PIC stub.  */
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -     {
> -       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> -       rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> -       output_set_got (tmp, NULL_RTX);       /* "CALL 
> ___<cpu>.get_pc_thunk.cx".  */
> -       fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label, 
> lazy_ptr_name, label);
> -     }
> -      else
> -     {
> -       /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %eax". 
>  */
> -       fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label);
> -       fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name, 
> label);
> -     }
> +      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> +      rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> +      output_set_got (tmp, NULL_RTX);        /* "CALL 
> ___<cpu>.get_pc_thunk.cx".  */
> +      fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
> +            label, lazy_ptr_name, label);
>        fprintf (file, "\tjmp\t*%%ecx\n");
>      }
>    else
> @@ -29039,13 +28971,8 @@ machopic_output_stub (FILE *file, const char *symb, 
> const char *stub)
>       compatibility with existing dylibs.  */
>    if (MACHOPIC_PURE)
>      {
> -      /* PIC stubs.  */
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -     /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> -     switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
> -      else
> -     /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ebx".  
> */
> -  switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
> +      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> +      switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
>      }
>    else
>      /* 16-byte -mdynamic-no-pic stub.  */
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 8badcbb..1452226 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -249,7 +249,6 @@ enum ix86_tune_indices {
>    X86_TUNE_PUSH_MEMORY,
>    X86_TUNE_ZERO_EXTEND_WITH_AND,
>    X86_TUNE_UNROLL_STRLEN,
> -  X86_TUNE_DEEP_BRANCH_PREDICTION,
>    X86_TUNE_BRANCH_PREDICTION_HINTS,
>    X86_TUNE_DOUBLE_WITH_ADD,
>    X86_TUNE_USE_SAHF,
> @@ -323,8 +322,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>  #define TARGET_ZERO_EXTEND_WITH_AND \
>       ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
>  #define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
> -#define TARGET_DEEP_BRANCH_PREDICTION \
> -     ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
>  #define TARGET_BRANCH_PREDICTION_HINTS \
>       ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
>  #define TARGET_DOUBLE_WITH_ADD       
> ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
> -- 
> 1.7.5.4

Reply via email to