On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org <bin.lan.cn=windriver....@lists.openembedded.org> wrote: > > There is the following warning when building linux-yocto with > default configuration on x86-64 with gcc-14.2: > AR built-in.a > AR vmlinux.a > LD vmlinux.o > vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to > !ENDBR: stpcpy+0x0 > > This change set removes the warning. > > PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] > > Signed-off-by: Bin Lan <bin.lan...@windriver.com> > --- > meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + > ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ > 2 files changed, 448 insertions(+) > create mode 100644 > meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc > b/meta/recipes-devtools/gcc/gcc-14.2.inc > index 4f505bef68..a25bc019e5 100644 > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ > file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ > file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ > file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ > + > file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \
Tiny nit: is this tab vs spaces? Can you fix this? > file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ > " > > diff --git > a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > > b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > new file mode 100644 > index 0000000000..5bede60816 > --- /dev/null > +++ > b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > @@ -0,0 +1,447 @@ > +From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001 > +From: liuhongt <hongtao....@intel.com> > +Date: Mon, 12 Aug 2024 14:35:31 +0800 > +Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the > + pass after pass_endbr_and_patchable_area. > + > +gcc/ChangeLog: > + > + PR target/116174 > + * config/i386/i386.cc (ix86_align_loops): Move this to .. > + * config/i386/i386-features.cc (ix86_align_loops): .. here. > + (class pass_align_tight_loops): New class. > + (make_pass_align_tight_loops): New function. > + * config/i386/i386-passes.def: Insert pass_align_tight_loops > + after pass_insert_endbr_and_patchable_area. > + * config/i386/i386-protos.h (make_pass_align_tight_loops): New > + declare. > + > +gcc/testsuite/ChangeLog: > + > + * gcc.target/i386/pr116174.c: New test. > + > +(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) > + > +Upstream-Status: Backport > [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d] > + > +Signed-off-by: Bin Lan <bin.lan...@windriver.com> > +--- > + gcc/config/i386/i386-features.cc | 191 +++++++++++++++++++++++ > + gcc/config/i386/i386-passes.def | 3 + > + gcc/config/i386/i386-protos.h | 1 + > + gcc/config/i386/i386.cc | 146 ----------------- > + gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ > + 5 files changed, 207 insertions(+), 146 deletions(-) > + create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c > + > +diff --git a/gcc/config/i386/i386-features.cc > b/gcc/config/i386/i386-features.cc > +index e3e004d55267..7de19d423637 100644 > +--- a/gcc/config/i386/i386-features.cc > ++++ b/gcc/config/i386/i386-features.cc > +@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency > (gcc::context *ctxt) > + return new pass_remove_partial_avx_dependency (ctxt); > + } > + > ++/* When a hot loop can be fit into one cacheline, > ++ force align the loop without considering the max skip. */ > ++static void > ++ix86_align_loops () > ++{ > ++ basic_block bb; > ++ > ++ /* Don't do this when we don't know cache line size. */ > ++ if (ix86_cost->prefetch_block == 0) > ++ return; > ++ > ++ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > ++ profile_count count_threshold = cfun->cfg->count_max / > param_align_threshold; > ++ FOR_EACH_BB_FN (bb, cfun) > ++ { > ++ rtx_insn *label = BB_HEAD (bb); > ++ bool has_fallthru = 0; > ++ edge e; > ++ edge_iterator ei; > ++ > ++ if (!LABEL_P (label)) > ++ continue; > ++ > ++ profile_count fallthru_count = profile_count::zero (); > ++ profile_count branch_count = profile_count::zero (); > ++ > ++ FOR_EACH_EDGE (e, ei, bb->preds) > ++ { > ++ if (e->flags & EDGE_FALLTHRU) > ++ has_fallthru = 1, fallthru_count += e->count (); > ++ else > ++ branch_count += e->count (); > ++ } > ++ > ++ if (!fallthru_count.initialized_p () || !branch_count.initialized_p > ()) > ++ continue; > ++ > ++ if (bb->loop_father > ++ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > ++ && (has_fallthru > ++ ? 
(!(single_succ_p (bb) > ++ && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > ++ && optimize_bb_for_speed_p (bb) > ++ && branch_count + fallthru_count > count_threshold > ++ && (branch_count > fallthru_count * > param_align_loop_iterations)) > ++ /* In case there'no fallthru for the loop. > ++ Nops inserted won't be executed. */ > ++ : (branch_count > count_threshold > ++ || (bb->count > bb->prev_bb->count * 10 > ++ && (bb->prev_bb->count > ++ <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > ++ { > ++ rtx_insn* insn, *end_insn; > ++ HOST_WIDE_INT size = 0; > ++ bool padding_p = true; > ++ basic_block tbb = bb; > ++ unsigned cond_branch_num = 0; > ++ bool detect_tight_loop_p = false; > ++ > ++ for (unsigned int i = 0; i != bb->loop_father->num_nodes; > ++ i++, tbb = tbb->next_bb) > ++ { > ++ /* Only handle continuous cfg layout. */ > ++ if (bb->loop_father != tbb->loop_father) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ > ++ FOR_BB_INSNS (tbb, insn) > ++ { > ++ if (!NONDEBUG_INSN_P (insn)) > ++ continue; > ++ size += ix86_min_insn_size (insn); > ++ > ++ /* We don't know size of inline asm. > ++ Don't align loop for call. */ > ++ if (asm_noperands (PATTERN (insn)) >= 0 > ++ || CALL_P (insn)) > ++ { > ++ size = -1; > ++ break; > ++ } > ++ } > ++ > ++ if (size == -1 || size > ix86_cost->prefetch_block) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ > ++ FOR_EACH_EDGE (e, ei, tbb->succs) > ++ { > ++ /* It could be part of the loop. */ > ++ if (e->dest == bb) > ++ { > ++ detect_tight_loop_p = true; > ++ break; > ++ } > ++ } > ++ > ++ if (detect_tight_loop_p) > ++ break; > ++ > ++ end_insn = BB_END (tbb); > ++ if (JUMP_P (end_insn)) > ++ { > ++ /* For decoded icache: > ++ 1. Up to two branches are allowed per Way. > ++ 2. A non-conditional branch is the last micro-op in a > Way. > ++ */ > ++ if (onlyjump_p (end_insn) > ++ && (any_uncondjump_p (end_insn) > ++ || single_succ_p (tbb))) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ else if (++cond_branch_num >= 2) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ } > ++ > ++ } > ++ > ++ if (padding_p && detect_tight_loop_p) > ++ { > ++ emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 > (size)), > ++ GEN_INT (0)), label); > ++ /* End of function. */ > ++ if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > ++ break; > ++ /* Skip bb which already fits into one cacheline. 
*/ > ++ bb = tbb; > ++ } > ++ } > ++ } > ++ > ++ loop_optimizer_finalize (); > ++ free_dominance_info (CDI_DOMINATORS); > ++} > ++ > ++namespace { > ++ > ++const pass_data pass_data_align_tight_loops = > ++{ > ++ RTL_PASS, /* type */ > ++ "align_tight_loops", /* name */ > ++ OPTGROUP_NONE, /* optinfo_flags */ > ++ TV_MACH_DEP, /* tv_id */ > ++ 0, /* properties_required */ > ++ 0, /* properties_provided */ > ++ 0, /* properties_destroyed */ > ++ 0, /* todo_flags_start */ > ++ 0, /* todo_flags_finish */ > ++}; > ++ > ++class pass_align_tight_loops : public rtl_opt_pass > ++{ > ++public: > ++ pass_align_tight_loops (gcc::context *ctxt) > ++ : rtl_opt_pass (pass_data_align_tight_loops, ctxt) > ++ {} > ++ > ++ /* opt_pass methods: */ > ++ bool gate (function *) final override > ++ { > ++ return optimize && optimize_function_for_speed_p (cfun); > ++ } > ++ > ++ unsigned int execute (function *) final override > ++ { > ++ timevar_push (TV_MACH_DEP); > ++#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > ++ ix86_align_loops (); > ++#endif > ++ timevar_pop (TV_MACH_DEP); > ++ return 0; > ++ } > ++}; // class pass_align_tight_loops > ++ > ++} // anon namespace > ++ > ++rtl_opt_pass * > ++make_pass_align_tight_loops (gcc::context *ctxt) > ++{ > ++ return new pass_align_tight_loops (ctxt); > ++} > ++ > + /* This compares the priority of target features in function DECL1 > + and DECL2. It returns positive value if DECL1 is higher priority, > + negative value if DECL2 is higher priority and 0 if they are the > +diff --git a/gcc/config/i386/i386-passes.def > b/gcc/config/i386/i386-passes.def > +index 7d96766f7b96..e500f15c9971 100644 > +--- a/gcc/config/i386/i386-passes.def > ++++ b/gcc/config/i386/i386-passes.def > +@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3. If not see > + INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); > + > + INSERT_PASS_BEFORE (pass_shorten_branches, 1, > pass_insert_endbr_and_patchable_area); > ++ /* pass_align_tight_loops must be after > pass_insert_endbr_and_patchable_area. > ++ PR116174. */ > ++ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > + > + INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > +diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > +index 46214a63974d..36c7b1aed42b 100644 > +--- a/gcc/config/i386/i386-protos.h > ++++ b/gcc/config/i386/i386-protos.h > +@@ -419,6 +419,7 @@ extern rtl_opt_pass > *make_pass_insert_endbr_and_patchable_area > + (gcc::context *); > + extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > + (gcc::context *); > ++extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > + > + extern bool ix86_has_no_direct_extern_access; > + > +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > +index 6f89891d3cb5..288c69467d62 100644 > +--- a/gcc/config/i386/i386.cc > ++++ b/gcc/config/i386/i386.cc > +@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load () > + } > + } > + > +-/* When a hot loop can be fit into one cacheline, > +- force align the loop without considering the max skip. */ > +-static void > +-ix86_align_loops () > +-{ > +- basic_block bb; > +- > +- /* Don't do this when we don't know cache line size. 
*/ > +- if (ix86_cost->prefetch_block == 0) > +- return; > +- > +- loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > +- profile_count count_threshold = cfun->cfg->count_max / > param_align_threshold; > +- FOR_EACH_BB_FN (bb, cfun) > +- { > +- rtx_insn *label = BB_HEAD (bb); > +- bool has_fallthru = 0; > +- edge e; > +- edge_iterator ei; > +- > +- if (!LABEL_P (label)) > +- continue; > +- > +- profile_count fallthru_count = profile_count::zero (); > +- profile_count branch_count = profile_count::zero (); > +- > +- FOR_EACH_EDGE (e, ei, bb->preds) > +- { > +- if (e->flags & EDGE_FALLTHRU) > +- has_fallthru = 1, fallthru_count += e->count (); > +- else > +- branch_count += e->count (); > +- } > +- > +- if (!fallthru_count.initialized_p () || !branch_count.initialized_p > ()) > +- continue; > +- > +- if (bb->loop_father > +- && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > +- && (has_fallthru > +- ? (!(single_succ_p (bb) > +- && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > +- && optimize_bb_for_speed_p (bb) > +- && branch_count + fallthru_count > count_threshold > +- && (branch_count > fallthru_count * > param_align_loop_iterations)) > +- /* In case there'no fallthru for the loop. > +- Nops inserted won't be executed. */ > +- : (branch_count > count_threshold > +- || (bb->count > bb->prev_bb->count * 10 > +- && (bb->prev_bb->count > +- <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > +- { > +- rtx_insn* insn, *end_insn; > +- HOST_WIDE_INT size = 0; > +- bool padding_p = true; > +- basic_block tbb = bb; > +- unsigned cond_branch_num = 0; > +- bool detect_tight_loop_p = false; > +- > +- for (unsigned int i = 0; i != bb->loop_father->num_nodes; > +- i++, tbb = tbb->next_bb) > +- { > +- /* Only handle continuous cfg layout. */ > +- if (bb->loop_father != tbb->loop_father) > +- { > +- padding_p = false; > +- break; > +- } > +- > +- FOR_BB_INSNS (tbb, insn) > +- { > +- if (!NONDEBUG_INSN_P (insn)) > +- continue; > +- size += ix86_min_insn_size (insn); > +- > +- /* We don't know size of inline asm. > +- Don't align loop for call. */ > +- if (asm_noperands (PATTERN (insn)) >= 0 > +- || CALL_P (insn)) > +- { > +- size = -1; > +- break; > +- } > +- } > +- > +- if (size == -1 || size > ix86_cost->prefetch_block) > +- { > +- padding_p = false; > +- break; > +- } > +- > +- FOR_EACH_EDGE (e, ei, tbb->succs) > +- { > +- /* It could be part of the loop. */ > +- if (e->dest == bb) > +- { > +- detect_tight_loop_p = true; > +- break; > +- } > +- } > +- > +- if (detect_tight_loop_p) > +- break; > +- > +- end_insn = BB_END (tbb); > +- if (JUMP_P (end_insn)) > +- { > +- /* For decoded icache: > +- 1. Up to two branches are allowed per Way. > +- 2. A non-conditional branch is the last micro-op in a > Way. > +- */ > +- if (onlyjump_p (end_insn) > +- && (any_uncondjump_p (end_insn) > +- || single_succ_p (tbb))) > +- { > +- padding_p = false; > +- break; > +- } > +- else if (++cond_branch_num >= 2) > +- { > +- padding_p = false; > +- break; > +- } > +- } > +- > +- } > +- > +- if (padding_p && detect_tight_loop_p) > +- { > +- emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 > (size)), > +- GEN_INT (0)), label); > +- /* End of function. */ > +- if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > +- break; > +- /* Skip bb which already fits into one cacheline. */ > +- bb = tbb; > +- } > +- } > +- } > +- > +- loop_optimizer_finalize (); > +- free_dominance_info (CDI_DOMINATORS); > +-} > +- > + /* Implement machine specific optimizations. 
We implement padding of > returns > + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ > + static void > +@@ -23611,8 +23467,6 @@ ix86_reorg (void) > + #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > + if (TARGET_FOUR_JUMP_LIMIT) > + ix86_avoid_jump_mispredicts (); > +- > +- ix86_align_loops (); > + #endif > + } > + } > +diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c > b/gcc/testsuite/gcc.target/i386/pr116174.c > +new file mode 100644 > +index 000000000000..8877d0b51af1 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/i386/pr116174.c > +@@ -0,0 +1,12 @@ > ++/* { dg-do compile { target *-*-linux* } } */ > ++/* { dg-options "-O2 -fcf-protection=branch" } */ > ++ > ++char * > ++foo (char *dest, const char *src) > ++{ > ++ while ((*dest++ = *src++) != '\0') > ++ /* nothing */; > ++ return --dest; > ++} > ++ > ++/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ > +-- > +2.43.5 > -- > 2.34.1 > > > >
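For anyone who wants to sanity-check the backport locally before it merges: the quickest check I know of is to compile the testcase the patch adds and confirm that the endbr instruction is emitted directly after .cfi_startproc, with no alignment padding in between, which is exactly what the new dg-final scan-assembler pattern asserts. A rough sketch, untested as written; x86_64-poky-linux-gcc is just a stand-in name, any x86-64 gcc 14.2 with this patch applied should behave the same:

  $ cat pr116174.c
  char *
  foo (char *dest, const char *src)
  {
    while ((*dest++ = *src++) != '\0')
      /* nothing */;
    return --dest;
  }

  $ # stand-in compiler name; use whichever gcc carries the backport
  $ x86_64-poky-linux-gcc -O2 -fcf-protection=branch -S pr116174.c -o - \
      | grep -A1 cfi_startproc
          .cfi_startproc
          endbr64

With pass_align_tight_loops reordered after pass_insert_endbr_and_patchable_area, the objtool "data relocation to !ENDBR" warning from the linux-yocto build should disappear as well.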