From 0c6a334d7de8fc990bb4bb874bdf6ebe257ca0c5 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
Date: Sat, 25 Oct 2025 15:41:37 -0700
Subject: [PATCH] Implement hierarchical discriminators for AutoFDO

This patch implements hierarchical discriminators to improve AutoFDO
profile accuracy. The discriminator is extended from 16 bits to 32 bits
with three fields:

  - Base (12 bits): Traditional same-line disambiguation
  - Pass1 (12 bits): Optimization context (loop versioning, inlining)
  - Pass2 (8 bits): Code duplication (loop unrolling, peeling)

gcc/ChangeLog:

	* Makefile.in: Add hierarchical_discriminator.o to OBJS and update
	dependencies.
	* auto-profile.cc (struct decl_lineno): Use 32-bit line_offset (int)
	and discriminator (unsigned int) fields instead of combined
	afdo_loc field.
	(get_line_offset_from_combined_loc): New.
	(get_discriminator_from_combined_loc): Likewise.
	(get_base_discriminator): Likewise.
	(get_combined_location): Update to support 64-bit combined location
	format (32-bit line offset + 32-bit hierarchical discriminator).
	Update warning messages and comments.
	(dump_afdo_loc): Change signature to accept separate line_offset
	and discriminator parameters.
	(dump_inline_stack): Update to use separate line_offset and
	discriminator fields from decl_lineno.
	(get_inline_stack): Populate separate line_offset and discriminator
	fields in decl_lineno struct.
	(make_profile_lookup_key): New.
	(function_instance::get_function_instance_by_decl): Update to use
	separate line_offset and discriminator fields from decl_lineno.
	(dump_stmt): Likewise.
	(function_instance::lookup_count): Likewise.
	(function_instance::match): Likewise.
	(function_instance::dump): Likewise.
	(function_instance::dump_inline_stack): Likewise.
	(function_instance::get_count_info): Likewise.
	(walk_block): Likewise.
	(autofdo_source_profile::offline_unrealized_inlines): Likewise.
	(function_instance::read_function_instance): Read line_offset and
	discriminator as two separate 32-bit values using gcov_read_unsigned().
	Aggregate profile counts by (line_offset, base) during reading,
	stripping pass1 and pass2 discriminators. Apply same aggregation
	to callsites.
	(autofdo_source_profile::get_count_info): Update to use
	separate line_offset and discriminator fields from decl_lineno.
	* cfgloopmanip.cc (duplicate_loop_body_to_header_edge):
	Assign pass2 discriminators for loop unrolling and peeling iterations.
	* cfgloopmanip.h (DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR): New.
	* gimple-loop-versioning.cc (loop_versioning::version_loop): Assign
	pass1 discriminators (1 for vectorized, 2 for scalar) for loop
	versioning.
	* hierarchical_discriminator.cc: New file.
	* hierarchical_discriminator.h: Likewise.
	* input.cc (location_with_discriminator_components): New.
	(get_discriminator_components_from_loc): Likewise.
	* input.h (DISCR_BASE_BITS): New.
	(DISCR_PASS1_BITS): Likewise.
	(DISCR_PASS2_BITS): Likewise.
	(DISCR_BASE_MASK): Likewise.
	(DISCR_PASS1_MASK): Likewise.
	(DISCR_PASS2_MASK): Likewise.
	(DISCR_BASE_SHIFT): Likewise.
	(DISCR_PASS1_SHIFT): Likewise.
	(DISCR_PASS2_SHIFT): Likewise.
	(DISCR_BASE_MAX): Likewise.
	(DISCR_PASS1_MAX): Likewise.
	(DISCR_PASS2_MAX): Likewise.
	(location_with_discriminator_components): Likewise.
	(get_discriminator_components_from_loc): Likewise.
	* tree-ssa-loop-ivcanon.cc (try_peel_loop): Pass
	DLTHE_RECORD_COPY_NUMBER flag for loop peeling to enable pass2
	discriminator assignment.
	* tree-vect-loop-manip.cc (vect_loop_versioning): Assign pass1
	discriminators for vectorized and scalar loop versions.

gcc/testsuite/ChangeLog:

2025-10-30  Kugan Vivekanandarajah  <kvivekananda@nvidia.com>

	* gcc.dg/hierarchical-discriminator-unroll.c: New test.
	* gcc.dg/hierarchical-discriminator-vect-version.c: New test.

Bootstrapped and regression tested on aarch64-linux-gnu with no new
regressions.

Signed-off-by: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
---
 gcc/Makefile.in                               |   1 +
 gcc/auto-profile.cc                           | 406 ++++++++++++------
 gcc/cfgloopmanip.cc                           |  42 ++
 gcc/cfgloopmanip.h                            |   4 +
 gcc/gimple-loop-versioning.cc                 |   8 +
 gcc/hierarchical_discriminator.cc             |  97 +++++
 gcc/hierarchical_discriminator.h              |  75 ++++
 gcc/input.cc                                  |  34 ++
 gcc/input.h                                   |  36 ++
 .../hierarchical-discriminator-unroll.c       |  35 ++
 .../hierarchical-discriminator-vect-version.c |  28 ++
 gcc/tree-ssa-loop-ivcanon.cc                  |   7 +-
 gcc/tree-vect-loop-manip.cc                   |   9 +
 13 files changed, 652 insertions(+), 130 deletions(-)
 create mode 100644 gcc/hierarchical_discriminator.cc
 create mode 100644 gcc/hierarchical_discriminator.h
 create mode 100644 gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c
 create mode 100644 gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 5c24a9aab00..26278c5d143 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1456,6 +1456,7 @@ OBJS = \
 	dce.o \
 	ddg.o \
 	debug.o \
+	hierarchical_discriminator.o \
 	dep-fusion.o \
 	df-core.o \
 	df-problems.o \
diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index cf7a2191336..3eae14aad53 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -178,9 +178,10 @@ private:
 struct decl_lineno
 {
   tree decl;
-  /* Relative locations stored in auto-profile.  */
-  unsigned int afdo_loc;
-  /* Actual location afdo_loc was computed from used to output diagnostics.  */
+  /* Relative line offset from function start.  */
+  int line_offset;
+  /* Hierarchical discriminator taken from the location.  */
+  unsigned int discriminator;
   location_t location;
 };
 
@@ -466,8 +467,10 @@ public:
   count_info *lookup_count (location_t loc, inline_stack &stack,
 			    cgraph_node *node);
 private:
-  /* Callsite, represented as (decl_lineno, callee_function_name_index).  */
-  typedef std::pair<unsigned, unsigned> callsite;
+  /* Callsite, represented as (decl_lineno, callee_function_name_index).
+     decl_lineno is now 64-bit to support hierarchical discriminators:
+     upper 32 bits: line offset, lower 32 bits: hierarchical discriminator.  */
+  typedef std::pair<uint64_t, unsigned> callsite;
 
   /* Map from callsite to callee function_instance.  */
   typedef std::map<callsite, function_instance *> callsite_map;
@@ -480,8 +483,10 @@ private:
   {
   }
 
-  /* Map from source location (decl_lineno) to profile (count_info).  */
-  typedef std::map<unsigned, count_info> position_count_map;
+  /* Map from source location (decl_lineno) to profile (count_info).
+     Key is 64-bit to support hierarchical discriminators:
+     upper 32 bits: line offset, lower 32 bits: hierarchical discriminator.  */
+  typedef std::map<uint64_t, count_info> position_count_map;
 
   /* function_instance name index in the string_table.  */
   unsigned name_;
@@ -495,7 +500,9 @@ private:
   /* Map from callsite location to callee function_instance.  */
   callsite_map callsites;
 
-  /* Map from source location to count_info.  */
+  /* Map from source location to count_info.
+     During profile reading, entries are aggregated by (line_offset, base)
+     to strip pass1 and pass2 discriminators.  */
   position_count_map pos_counts;
 
   /* True if function was removed from indir target list.  */
@@ -659,29 +666,47 @@ get_original_name (const char *name, bool alloc = true)
   return ret;
 }
 
-/* Return the combined location, which is a 32bit integer in which
-   higher 16 bits stores the line offset of LOC to the start lineno
-   of DECL, The lower 16 bits stores the discriminator.  */
+/* Extract line offset from a 64-bit combined location.  */
 
-static unsigned
+static inline int
+get_line_offset_from_combined_loc (uint64_t combined_loc)
+{
+  return (int)(combined_loc >> 32);
+}
+
+/* Extract 32-bit hierarchical discriminator from a 64-bit combined
+   location.  */
+
+static inline unsigned int
+get_discriminator_from_combined_loc (uint64_t combined_loc)
+{
+  return (unsigned int)(combined_loc & 0xFFFFFFFFULL);
+}
+
+/* Extract base discriminator (bits 0-11) from a 32-bit hierarchical
+   discriminator.  */
+
+static inline unsigned int
+get_base_discriminator (unsigned int discriminator)
+{
+  return discriminator & DISCR_BASE_MASK;
+}
+
+/* Return the combined location, which is a 64-bit integer in which
+   upper 32 bits stores the line offset of LOC to the start lineno
+   of DECL, and the lower 32 bits stores the hierarchical discriminator.
+   This supports the new hierarchical discriminator layout:
+   Base (12 bits) | Pass1 (12 bits) | Pass2 (8 bits).  */
+
+static uint64_t
 get_combined_location (location_t loc, tree decl)
 {
-  bool warned = false;
-  /* TODO: allow more bits for line and less bits for discriminator.  */
-  if ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) >= (1<<15)
-      || (LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) <= -(1<<15))
-    warned = warning_at (loc, OPT_Wauto_profile,
-			 "auto-profile cannot encode offset %i "
-			 "that exceeds 16 bytes",
-			 LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl));
-  if (warned)
-    inform (DECL_SOURCE_LOCATION (decl), "location offset is related to");
-  if ((unsigned)get_discriminator_from_loc (loc) >= (1u << 16))
-    warning_at (loc, OPT_Wauto_profile,
-		"auto-profile cannot encode discriminators "
-		"that exceeds 16 bytes");
-  return ((unsigned)(LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
-	 | get_discriminator_from_loc (loc);
+  /* The offset is computed as an int, so it always fits the 32-bit line
+     offset field of the combined location and needs no range check.  */
+  int line_offset = LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl);
+  unsigned int discriminator = get_discriminator_from_loc (loc);
+
+  return (((uint64_t)(unsigned int)line_offset) << 32) | discriminator;
 }
 
 /* Return the function decl of a given lexical BLOCK.  */
@@ -695,15 +738,15 @@ get_function_decl_from_block (tree block)
   return BLOCK_ABSTRACT_ORIGIN (block);
 }
 
-/* Dump LOC to F.  */
+/* Dump line offset and discriminator to F.  */
 
 static void
-dump_afdo_loc (FILE *f, unsigned loc)
+dump_afdo_loc (FILE *f, int line_offset, unsigned int discriminator)
 {
-  if (loc & 65535)
-    fprintf (f, "%i.%i", loc >> 16, loc & 65535);
+  if (discriminator)
+    fprintf (f, "%i.%u", line_offset, discriminator);
   else
-    fprintf (f, "%i", loc >> 16);
+    fprintf (f, "%i", line_offset);
 }
 
 /* Return assembler name as in symbol table and DW_AT_linkage_name.  */
@@ -737,7 +780,7 @@ dump_inline_stack (FILE *f, inline_stack *stack)
       fprintf (f, "%s%s:",
 	       first ? "" : "; ",
 	       raw_symbol_name (p.decl));
-      dump_afdo_loc (f, p.afdo_loc);
+      dump_afdo_loc (f, p.line_offset, p.discriminator);
       first = false;
     }
   fprintf (f, "\n");
@@ -764,12 +807,15 @@ get_inline_stack (location_t locus, inline_stack *stack,
             continue;
 
           tree decl = get_function_decl_from_block (block);
-          stack->safe_push (
-	      {decl, get_combined_location (locus, decl), locus});
+	  int line_offset = LOCATION_LINE (locus) - DECL_SOURCE_LINE (decl);
+	  unsigned int discriminator = get_discriminator_from_loc (locus);
+	  stack->safe_push ({decl, line_offset, discriminator, locus});
           locus = tmp_locus;
         }
     }
-  stack->safe_push ({fn, get_combined_location (locus, fn), locus});
+  int line_offset = LOCATION_LINE (locus) - DECL_SOURCE_LINE (fn);
+  unsigned int discriminator = get_discriminator_from_loc (locus);
+  stack->safe_push ({fn, line_offset, discriminator, locus});
 }
 
 /* Same as get_inline_stack for a given node which may be
@@ -802,7 +848,7 @@ get_inline_stack_in_node (location_t locus, inline_stack *stack,
    LOC to the start lineno of DECL, The lower 16 bits stores the
    discriminator.  */
 
-static unsigned
+static uint64_t
 get_relative_location_for_locus (tree fn, tree block, location_t locus)
 {
   if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
@@ -818,7 +864,7 @@ get_relative_location_for_locus (tree fn, tree block, location_t locus)
 
 /* Return combined location of STMT in function FN.  */
 
-static unsigned
+static uint64_t
 get_relative_location_for_stmt (tree fn, gimple *stmt)
 {
   return get_relative_location_for_locus
@@ -826,6 +872,16 @@ get_relative_location_for_stmt (tree fn, gimple *stmt)
 	   gimple_location (stmt));
 }
 
+/* Create a lookup key from line_offset and discriminator.
+   Strips pass1 and pass2 from discriminator, keeping only base.  */
+
+static uint64_t
+make_profile_lookup_key (int line_offset, unsigned int discriminator)
+{
+  unsigned int base = get_base_discriminator (discriminator);
+  return (((uint64_t)(unsigned int)line_offset) << 32) | base;
+}
+
 /* Member functions for string_table.  */
 
 /* Deconstructor.  */
@@ -973,11 +1029,11 @@ function_instance::get_function_instance_by_decl (unsigned lineno,
 	  dump_printf_loc (MSG_NOTE | MSG_PRIORITY_INTERNALS,
 			   dump_user_location_t::from_location_t (location),
 			   "auto-profile has mismatched function name %s"
-			   " insteed of %s at loc %i:%i",
+			   " instead of %s at loc %i:%u",
 			   afdo_string_table->get_name (iter.first.second),
 			   raw_symbol_name (decl),
-			   lineno >> 16,
-			   lineno & 65535);
+			   get_line_offset_from_combined_loc (lineno),
+			   get_discriminator_from_combined_loc (lineno));
     }
 
   return NULL;
@@ -1239,9 +1295,9 @@ dump_stmt (gimple *stmt, count_info *info, function_instance *inlined_fn,
       else
 	{
 	  gcc_checking_assert (stack.length () == 1);
-	  fprintf (dump_file, "%5i", stack[0].afdo_loc >> 16);
-	  if (stack[0].afdo_loc & 65535)
-	    fprintf (dump_file, ".%-5i", stack[0].afdo_loc & 65535);
+	  fprintf (dump_file, "%5i", stack[0].line_offset);
+	  if (stack[0].discriminator)
+	    fprintf (dump_file, ".%-5u", stack[0].discriminator);
 	  else
 	    fprintf (dump_file, "      ");
 	  if (info)
@@ -1263,17 +1319,21 @@ function_instance::lookup_count (location_t loc, inline_stack &stack,
   gcc_checking_assert (stack.length () < 2);
   if (stack.length ())
     {
-      int c = pos_counts.count (stack[0].afdo_loc);
+      uint64_t lookup_key
+	= make_profile_lookup_key (stack[0].line_offset,
+				   stack[0].discriminator);
+      int c = pos_counts.count (lookup_key);
       if (c > 1
 	  && warning (OPT_Wauto_profile,
 		      "duplicated count information"
 		      " in auto-profile of %q+F"
-		      " with relative location %i discriminator %i",
-		      node->decl, stack[0].afdo_loc >> 16,
-		      stack[0].afdo_loc & 65535))
-	  inform (loc, "corresponding source location");
+		      " with relative location %i discriminator %u",
+		      node->decl,
+		      stack[0].line_offset,
+		      stack[0].discriminator))
+	inform (loc, "corresponding source location");
       if (c)
-	return &pos_counts[stack[0].afdo_loc];
+	return &pos_counts[lookup_key];
     }
   return NULL;
 }
@@ -1349,9 +1409,9 @@ function_instance::match (cgraph_node *node,
       if (stack.length () && dump_file)
 	{
 	  gcc_checking_assert (stack.length () == 1);
-	  fprintf (dump_file, "%5i", stack[0].afdo_loc >> 16);
-	  if (stack[0].afdo_loc & 65535)
-	    fprintf (dump_file, "  .%-5i arg", stack[0].afdo_loc & 65535);
+	  fprintf (dump_file, "%5i", stack[0].line_offset);
+	  if (stack[0].discriminator)
+	    fprintf (dump_file, "  .%-5u arg", stack[0].discriminator);
 	  else
 	    fprintf (dump_file, "        arg ");
 	  print_generic_expr (dump_file, arg);
@@ -1425,8 +1485,13 @@ function_instance::match (cgraph_node *node,
 		{
 		  int c = 0;
 		  int cnodis = 0;
+		  uint64_t lookup_key
+		    = make_profile_lookup_key (stack[0].line_offset,
+					       stack[0].discriminator);
+		  uint64_t lookup_key_no_disc
+		    = make_profile_lookup_key (stack[0].line_offset, 0);
 		  for (auto const &iter : callsites)
-		    if (iter.first.first == stack[0].afdo_loc)
+		    if (iter.first.first == lookup_key)
 		      {
 			if (!c)
 			  inlined_fn = iter.second;
@@ -1434,7 +1499,7 @@ function_instance::match (cgraph_node *node,
 		      }
 		    /* Discriminators are sometimes lost; try to find the
 		       call without discriminator info.  */
-		    else if (iter.first.first == (stack[0].afdo_loc & ~65535))
+		    else if (iter.first.first == lookup_key_no_disc)
 		      {
 			if (!cnodis)
 			  inlined_fn_nodisc = iter.second;
@@ -1444,22 +1509,24 @@ function_instance::match (cgraph_node *node,
 		      && warning (OPT_Wauto_profile,
 				  "duplicated callsite in auto-profile of %q+F"
 				  " with relative location %i,"
-				  " discriminator %i",
-				  node->decl, stack[0].afdo_loc >> 16,
-				  stack[0].afdo_loc & 65535))
+				  " discriminator %u",
+				  node->decl,
+				  stack[0].line_offset,
+				  stack[0].discriminator))
 		    inform (gimple_location (stmt), "corresponding call");
 		  if (inlined_fn && info && info->targets.size ()
 		      && warning (OPT_Wauto_profile,
 				  "both call targets and inline callsite"
 				  " information is present in auto-profile"
 				  " of function %q+F with relative location"
-				  " %i, discriminator %i",
-				  node->decl, stack[0].afdo_loc >> 16,
-				  stack[0].afdo_loc & 65535))
+				  " %i, discriminator %u",
+				  node->decl,
+				  stack[0].line_offset,
+				  stack[0].discriminator))
 		    inform (gimple_location (stmt), "corresponding call");
 		  tree callee = gimple_call_fndecl (stmt);
 		  cgraph_node *callee_node;
-		  unsigned int loc = stack[0].afdo_loc;
+		  uint64_t loc = lookup_key;
 		  bool lost_discriminator = false;
 		  if (!inlined_fn && inlined_fn_nodisc)
 		    {
@@ -1479,32 +1546,32 @@ function_instance::match (cgraph_node *node,
 				       	(gimple_location (call),
 					 &stack2, node);
 				if (stack2.length ())
-				  lineno_to_call.get_or_insert
-				    (stack2[0].afdo_loc >> 16).safe_push (call);
-			      }
+			      lineno_to_call.get_or_insert
+				(stack2[0].line_offset).safe_push (call);
+			    }
 			  lineno_to_call_computed = true;
 			}
 		      /* If we can determine lost discriminator uniquely,
 			 use it.  */
 		      if (lineno_to_call.get
-			      (stack[0].afdo_loc >> 16)->length () == 1)
+			  (stack[0].line_offset)->length () == 1)
 			{
 			  if (warning (OPT_Wauto_profile,
 				       "auto-profile of %q+F seem to contain"
-				       " lost discriminator %i for"
+				       " lost discriminator %u for"
 				       " call of %s at relative location %i",
 				       node->decl,
-				       loc & 65535,
+				       stack[0].discriminator,
 				       afdo_string_table->get_name
 					 (inlined_fn_nodisc->name ()),
-				       loc >> 16))
+				       stack[0].line_offset))
 			    inform (gimple_location (stmt),
 				    "corresponding call");
 			  inlined_fn = inlined_fn_nodisc;
 			  if (dump_file)
-			    fprintf (dump_file, "   Lost discriminator %i\n",
-				     loc & 65535);
-			  loc = loc & ~65535;
+			    fprintf (dump_file, "   Lost discriminator %u\n",
+				     stack[0].discriminator);
+			  loc = lookup_key_no_disc;
 			}
 		      lost_discriminator = true;
 		    }
@@ -1523,8 +1590,7 @@ function_instance::match (cgraph_node *node,
 						   && iter != callsites.end ()
 						   && iter->second
 						      == inlined_fn);
-			      callsite key2 = {stack[0].afdo_loc,
-						inlined_fn->name ()};
+			      callsite key2 = {loc, inlined_fn->name ()};
 			      callsites.erase (iter);
 			      callsites[key2] = inlined_fn;
 			    }
@@ -1538,9 +1604,10 @@ function_instance::match (cgraph_node *node,
 			warning_at (gimple_location (stmt), OPT_Wauto_profile,
 				    "auto-profile of %q+F contains multiple"
 				    " targets for a direct call with relative"
-				    " location %i, discriminator %i",
-				    node->decl, stack[0].afdo_loc >> 16,
-				    stack[0].afdo_loc & 65535);
+				    " location %i, discriminator %u",
+				    node->decl,
+				    stack[0].line_offset,
+				    stack[0].discriminator);
 		      /* We do not need target profile for direct calls.  */
 		      if (info)
 			info->targets.clear ();
@@ -1551,13 +1618,14 @@ function_instance::match (cgraph_node *node,
 			  && inlined_fn->get_call_location ()
 				  != UNKNOWN_LOCATION)
 			{
-			  if (warning (OPT_Wauto_profile,
-				       "function contains two calls of the same"
-				       " relative location +%i,"
-				       " discrimnator %i,"
-				       " that leads to lost auto-profile",
-				       loc >> 16,
-				       loc & 65535))
+			  if (warning
+			      (OPT_Wauto_profile,
+			       "function contains two calls of the same"
+			       " relative location +%i,"
+			       " discriminator %u,"
+			       " that leads to lost auto-profile",
+			       get_line_offset_from_combined_loc (loc),
+			       get_discriminator_from_combined_loc (loc)))
 			    {
 			      inform (gimple_location (stmt),
 				      "location of the first call");
@@ -1585,7 +1653,7 @@ function_instance::match (cgraph_node *node,
 			      gcc_checking_assert (iter != callsites.end ()
 						   && iter->second
 						      == inlined_fn);
-			      callsite key2 = {stack[0].afdo_loc,
+			      callsite key2 = {loc,
 					       newn ? *newn
 					       : inlined_fn->name ()};
 			      callsites.erase (iter);
@@ -1621,12 +1689,16 @@ function_instance::match (cgraph_node *node,
 	if (warned)
 	  inform (DECL_SOURCE_LOCATION (node->decl),
 		  "count %" PRIu64
-		  " with relative location +%i, discriminator %i",
-		  iter.second.count, iter.first >> 16, iter.first & 65535);
+		  " with relative location +%i, discriminator %u",
+		  iter.second.count,
+		  get_line_offset_from_combined_loc (iter.first),
+		  get_discriminator_from_combined_loc (iter.first));
 	if (dump_file)
 	  {
 	    fprintf (dump_file, "Removing targets of ");
-	    dump_afdo_loc (dump_file, iter.first);
+	    dump_afdo_loc (dump_file,
+			   get_line_offset_from_combined_loc (iter.first),
+			   get_discriminator_from_combined_loc (iter.first));
 	    fprintf (dump_file, "\n");
 	  }
 	iter.second.targets.clear ();
@@ -1636,22 +1708,22 @@ function_instance::match (cgraph_node *node,
      (prologue, epilogue).
      TODO: If present, perhaps it can be used to determine entry block
      and exit block counts.  */
-  unsigned int end_location = get_combined_location
+  uint64_t end_location = get_combined_location
     (DECL_STRUCT_FUNCTION (node->decl)->function_end_locus, node->decl);
-  unsigned int start_location = get_combined_location
+  uint64_t start_location = get_combined_location
     (DECL_STRUCT_FUNCTION (node->decl)->function_start_locus, node->decl);
   /* When outputting code to builtins location we use line number 0.
      create_gcov is stupid and happily computes offsets across files.
      Silently ignore it.  */
-  unsigned int zero_location
-	  = ((unsigned)(1-DECL_SOURCE_LINE (node->decl))) << 16;
+  uint64_t zero_location
+	  = ((uint64_t)(unsigned)(1-DECL_SOURCE_LINE (node->decl))) << 32;
   for (position_count_map::const_iterator iter = pos_counts.begin ();
        iter != pos_counts.end ();)
     if (!counts.contains (&iter->second))
       {
 	if (iter->first != end_location
 	    && iter->first != start_location
-	    && (iter->first & 65535) != zero_location
+	    && iter->first != zero_location
 	    && iter->first
 	    /* FIXME: dwarf5 does not represent inline stack of debug
 	       statements and consequently create_gcov is sometimes
@@ -1667,10 +1739,11 @@ function_instance::match (cgraph_node *node,
 			    node->decl);
 	    if (warned)
 	      inform (DECL_SOURCE_LOCATION (node->decl),
-		      "count %" PRIu64 " with relative location +%i,"
-		      " discriminator %i",
-		      iter->second.count, iter->first >> 16,
-		      iter->first & 65535);
-	    if ((iter->first >> 16) > (end_location >> 16) && warned)
+		      "count %" PRIu64 " with relative location +%i,"
+		      " discriminator %u",
+		      iter->second.count,
+		      get_line_offset_from_combined_loc (iter->first),
+		      get_discriminator_from_combined_loc (iter->first));
+	    if ((iter->first >> 32) > (end_location >> 32) && warned)
 	      inform (DECL_SOURCE_LOCATION (node->decl),
 		      "location is after end of function");
@@ -1678,7 +1751,9 @@ function_instance::match (cgraph_node *node,
 	if (dump_file)
 	  {
 	    fprintf (dump_file, "Removing unmatched count ");
-	    dump_afdo_loc (dump_file, iter->first);
+	    dump_afdo_loc (dump_file,
+			   get_line_offset_from_combined_loc (iter->first),
+			   get_discriminator_from_combined_loc (iter->first));
 	    fprintf (dump_file, ":%" PRIu64, iter->second.count);
 	    for (auto &titer : iter->second.targets)
 	      fprintf (dump_file, " %s:%" PRIu64,
@@ -1708,11 +1783,14 @@ function_instance::match (cgraph_node *node,
 	    if (warned)
 	      inform (DECL_SOURCE_LOCATION (node->decl),
 		      "call of %s with total count %" PRId64
-		      ", relative location +%i, discriminator %i",
+		      ", relative location +%i, discriminator %u",
 		      afdo_string_table->get_name (iter->first.second),
 		      iter->second->total_count (),
-		      iter->first.first >> 16, iter->first.first & 65535);
-	    if ((iter->first.first >> 16) > (end_location >> 16) && warned)
+		      get_line_offset_from_combined_loc (iter->first.first),
+		      get_discriminator_from_combined_loc (iter->first.first));
+	    if (get_line_offset_from_combined_loc (iter->first.first)
+		> get_line_offset_from_combined_loc (end_location)
+		&& warned)
 	      inform (DECL_SOURCE_LOCATION (node->decl),
 		      "location is after end of function");
 	    if (dump_file)
@@ -1860,7 +1938,9 @@ function_instance::dump (FILE *f, int indent, bool nested) const
   for (auto const &iter : pos_counts)
     {
       fprintf (f, "%*s", indent + 2, "");
-      dump_afdo_loc (f, iter.first);
+      dump_afdo_loc (f,
+		     get_line_offset_from_combined_loc (iter.first),
+		     get_discriminator_from_combined_loc (iter.first));
       fprintf (f, ": %" PRIu64, (int64_t)iter.second.count);
 
       for (auto const &titer : iter.second.targets)
@@ -1872,7 +1952,9 @@ function_instance::dump (FILE *f, int indent, bool nested) const
   for (auto const &iter : callsites)
     {
       fprintf (f, "%*s", indent + 2, "");
-      dump_afdo_loc (f, iter.first.first);
+      dump_afdo_loc (f,
+		     get_line_offset_from_combined_loc (iter.first.first),
+		     get_discriminator_from_combined_loc (iter.first.first));
       fprintf (f, ": %s", afdo_string_table->get_name (iter.first.second));
       iter.second->dump (f, indent + 2, true);
       gcc_checking_assert ((int)iter.first.second == iter.second->name ());
@@ -1905,7 +1987,9 @@ function_instance::dump_inline_stack (FILE *f) const
   for (callsite &s: stack)
     {
       fprintf (f, "%s:", afdo_string_table->get_name (s.second));
-      dump_afdo_loc (f, s.first);
+      dump_afdo_loc (f,
+		     get_line_offset_from_combined_loc (s.first),
+		     get_discriminator_from_combined_loc (s.first));
       fprintf (f, " ");
     }
   fprintf (f, "%s", afdo_string_table->get_name (name ()));
@@ -1919,16 +2003,32 @@ function_instance::debug () const
   dump (stderr);
 }
 
-/* Return profile info for LOC in INFO.  */
+/* Return profile info for LOC in INFO.
+
+   For hierarchical discriminators, we aggregate counts across all
+   pass1/pass2 values that share the same line offset and base
+   discriminator.  This is necessary because during profile-guided
+   optimization, the code structure may differ from the training run
+   (e.g., different unrolling decisions), but we still want to use the
+   profile data for the same logical source location.  */
 
 bool
 function_instance::get_count_info (location_t loc, count_info *info) const
 {
-  position_count_map::const_iterator iter = pos_counts.find (loc);
-  if (iter == pos_counts.end ())
-    return false;
-  *info = iter->second;
-  return true;
+  /* Direct lookup using combined location which contains (line_offset, base).
+     Profile data was aggregated during reading by (line_offset, base),
+     stripping only pass1 and pass2 discriminators.
+     At afdo_offline pass, discriminators only have base component
+     (pass1=0, pass2=0).  */
+
+  position_count_map::const_iterator it = pos_counts.find (loc);
+  if (it != pos_counts.end ())
+    {
+      *info = it->second;
+      return true;
+    }
+
+  return false;
 }
 
 /* Read the inlined indirect call target profile for STMT and store it in
@@ -2216,7 +2316,7 @@ walk_block (tree fn, function_instance *s, tree block)
 {
   if (inlined_function_outer_scope_p (block))
     {
-      unsigned loc = get_relative_location_for_locus
+      uint64_t loc = get_relative_location_for_locus
 		      (fn, BLOCK_SUPERCONTEXT (block),
 		       BLOCK_SOURCE_LOCATION (block));
       function_instance *ns
@@ -2230,7 +2330,9 @@ walk_block (tree fn, function_instance *s, tree block)
 	      fprintf (dump_file, " Failed to find inlined instance:");
 	      s->dump_inline_stack (dump_file);
 	      fprintf (dump_file, ":");
-	      dump_afdo_loc (dump_file, loc);
+	      dump_afdo_loc (dump_file,
+			     get_line_offset_from_combined_loc (loc),
+			     get_discriminator_from_combined_loc (loc));
 	      fprintf (dump_file, " %s\n",
 		       raw_symbol_name (BLOCK_ABSTRACT_ORIGIN (block)));
 	    }
@@ -2327,7 +2429,8 @@ autofdo_source_profile::offline_unrealized_inlines ()
    NUM_POS_COUNTS: 4 bytes
    NUM_CALLSITES: 4 byte
    POS_COUNT_1:
-     POS_1_OFFSET: 4 bytes
+     POS_1_OFFSET: 8 bytes (64-bit: upper 32 bits = line offset,
+			    lower 32 bits = hierarchical discriminator)
      NUM_TARGETS: 4 bytes
      COUNT: 8 bytes
      TARGET_1:
@@ -2341,7 +2444,8 @@ autofdo_source_profile::offline_unrealized_inlines ()
    ...
    POS_COUNT_N
    CALLSITE_1:
-     CALLSITE_1_OFFSET: 4 bytes
+     CALLSITE_1_OFFSET: 8 bytes (64-bit: upper 32 bits = line offset,
+				 lower 32 bits = hierarchical discriminator)
      FUNCTION_INSTANCE_PROFILE (nested)
    CALLSITE_2
    ...
@@ -2361,12 +2465,38 @@ function_instance::read_function_instance (function_instance_stack *stack,
 
   for (unsigned i = 0; i < num_pos_counts; i++)
     {
-      unsigned offset = gcov_read_unsigned ();
+      /* Read separate 32-bit line_offset and 32-bit discriminator.
+	 Discriminator format: Base(12) | Pass1(12) | Pass2(8).  */
+      unsigned int line_offset = gcov_read_unsigned ();
+      unsigned int discriminator = gcov_read_unsigned ();
+
       unsigned num_targets = gcov_read_unsigned ();
       gcov_type count = gcov_read_counter ();
-      s->pos_counts[offset].count = count;
+
+      if (dump_file)
+	fprintf (dump_file,
+		 "  READ pos_count: line_offset=%i discriminator=%u"
+		 "(base=%u pass1=%u pass2=%u) count=%" PRId64 "\n",
+		 (int) line_offset, discriminator,
+		 discriminator & DISCR_BASE_MASK,
+		 (discriminator >> DISCR_PASS1_SHIFT) & DISCR_PASS1_MASK,
+		 (discriminator >> DISCR_PASS2_SHIFT) & DISCR_PASS2_MASK,
+		 (int64_t) count);
+
+      /* Aggregate by (line_offset, base) to strip pass1 and pass2.
+	 Extract base discriminator from the profile entry.  */
+      unsigned int base = get_base_discriminator (discriminator);
+
+      /* Create aggregated key: (line_offset << 32) | base
+	 This strips pass1 and pass2, keeping only line and base
+	 discriminator.  */
+      uint64_t agg_offset
+	= (((uint64_t)(unsigned int)line_offset) << 32) | base;
+
+      /* Accumulate counts for the same (line_offset, base) key.  */
+      s->pos_counts[agg_offset].count += count;
       afdo_profile_info->sum_max = std::max (afdo_profile_info->sum_max,
-					     count);
+					     s->pos_counts[agg_offset].count);
 
       for (unsigned j = 0; j < stack->length (); j++)
         (*stack)[j]->total_count_ += count;
@@ -2375,15 +2505,30 @@ function_instance::read_function_instance (function_instance_stack *stack,
 	  /* Only indirect call target histogram is supported now.  */
 	  gcov_read_unsigned ();
 	  gcov_type target_idx = gcov_read_counter ();
-	  s->pos_counts[offset].targets[target_idx] = gcov_read_counter ();
+	  gcov_type target_count = gcov_read_counter ();
+	  /* Accumulate target counts for the same aggregated key.  */
+	  s->pos_counts[agg_offset].targets[target_idx] += target_count;
         }
     }
+
+
   for (unsigned i = 0; i < num_callsites; i++)
     {
-      unsigned offset = gcov_read_unsigned ();
+      /* Read separate 32-bit line_offset and 32-bit discriminator.  */
+      unsigned int line_offset = gcov_read_unsigned ();
+      unsigned int discriminator = gcov_read_unsigned ();
+
+      /* Aggregate callsite offset by (line_offset, base) to strip
+	 pass1 and pass2.
+	 This matches how we aggregate pos_counts.  */
+      unsigned int base = get_base_discriminator (discriminator);
+      uint64_t agg_offset
+       	= (((uint64_t)(unsigned int)line_offset) << 32) | base;
+
       function_instance *callee_function_instance
           = read_function_instance (stack, -1);
-      s->callsites[std::make_pair (offset, callee_function_instance->name ())]
+      s->callsites[std::make_pair (agg_offset,
+				   callee_function_instance->name ())]
           = callee_function_instance;
     }
   stack->pop ();
@@ -2457,7 +2602,9 @@ autofdo_source_profile::get_count_info (location_t gimple_loc,
   function_instance *s = get_function_instance_by_inline_stack (stack);
   if (s == NULL)
     return false;
-  return s->get_count_info (stack[0].afdo_loc, info);
+  uint64_t lookup_key = make_profile_lookup_key (stack[0].line_offset,
+						  stack[0].discriminator);
+  return s->get_count_info (lookup_key, info);
 }
 
 /* Update value profile INFO for STMT from the inlined indirect callsite.
@@ -2564,7 +2711,7 @@ autofdo_source_profile::get_callsite_total_count (
     struct cgraph_edge *edge) const
 {
   inline_stack stack;
-  stack.safe_push ({edge->callee->decl, 0, UNKNOWN_LOCATION});
+  stack.safe_push ({edge->callee->decl, 0, 0, UNKNOWN_LOCATION});
 
   get_inline_stack_in_node (gimple_location (edge->call_stmt), &stack,
 			    edge->caller);
@@ -2574,7 +2721,8 @@ autofdo_source_profile::get_callsite_total_count (
 	fprintf (dump_file, "Looking up afdo profile for call %s -> %s stack:",
 		 edge->caller->dump_name (), edge->callee->dump_name ());
       else
-	fprintf (dump_file, "Looking up afdo profile for call %s -> %s transitively %s stack:",
+	fprintf (dump_file,
+		 "Looking up afdo profile for call %s -> %s transitively %s stack:",
 		 edge->caller->dump_name (), edge->callee->dump_name (),
 		 edge->caller->inlined_to->dump_name ());
       dump_inline_stack (dump_file, &stack);
@@ -2691,7 +2839,9 @@ autofdo_source_profile::get_function_instance_by_inline_stack (
   function_instance *s = iter->second;
   for (unsigned i = stack.length () - 1; i > 0; i--)
     {
-      s = s->get_function_instance_by_decl (stack[i].afdo_loc,
+      uint64_t lookup_key = make_profile_lookup_key (stack[i].line_offset,
+						      stack[i].discriminator);
+      s = s->get_function_instance_by_decl (lookup_key,
 					    stack[i - 1].decl,
 					    stack[i].location);
       if (s == NULL)
@@ -2706,10 +2856,10 @@ autofdo_source_profile::get_function_instance_by_inline_stack (
 			       (stack[i].location),
 			      "auto-profile has no inlined function instance "
 			      "for inlined call of %s at relative "
-			      " locaction +%i, discriminator %i\n",
-			     raw_symbol_name (stack[i - 1].decl),
-			     stack[i].afdo_loc >> 16,
-			     stack[i].afdo_loc & 65535);
+		   " locaction +%i, discriminator %u\n",
+		   raw_symbol_name (stack[i - 1].decl),
+		   stack[i].line_offset,
+		   stack[i].discriminator);
 	  return NULL;
 	}
     }
diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
index dda2fb661d8..91c3e88d567 100644
--- a/gcc/cfgloopmanip.cc
+++ b/gcc/cfgloopmanip.cc
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "sreal.h"
 #include "tree-cfg.h"
 #include "tree-pass.h"
+#include "hierarchical_discriminator.h"
 
 static void copy_loops_to (class loop **, int,
 			   class loop *);
@@ -1422,6 +1423,47 @@ duplicate_loop_body_to_header_edge (class loop *loop, edge e,
 	    new_bbs[i]->aux = (void *)(size_t)(j + 1);
 	  }
 
+      /* Assign hierarchical discriminators to distinguish loop iterations.  */
+      if (flags & DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR)
+	{
+	  /* Only handle GIMPLE mode for now.  */
+	  if (current_ir_type () == IR_GIMPLE)
+	    {
+	      unsigned int iter = j + 1;
+
+	      for (i = 0; i < n; i++)
+		{
+		  for (gimple_stmt_iterator gsi = gsi_start_bb (new_bbs[i]);
+		       !gsi_end_p (gsi); gsi_next (&gsi))
+		    {
+		      gimple *stmt = gsi_stmt (gsi);
+		      location_t loc = gimple_location (stmt);
+
+		      if (loc != UNKNOWN_LOCATION && !is_gimple_debug (stmt))
+			{
+			  unsigned int base, pass1, old_pass2;
+			  get_discriminator_components_from_loc (loc, &base,
+								 &pass1,
+								 &old_pass2);
+
+			  /* Add iteration count to existing pass2 value,
+			     capping at 255.  */
+			  unsigned int pass2 = old_pass2 + iter;
+			  if (pass2 > 255)
+			    pass2 = 255;
+
+			  location_t new_loc
+			    = location_with_discriminator_components (loc,
+								      base,
+								      pass1,
+								      pass2);
+			  gimple_set_location (stmt, new_loc);
+			}
+		    }
+		}
+	    }
+	}
+
       /* Note whether the blocks and edges belong to an irreducible loop.  */
       if (add_irreducible_flag)
 	{
diff --git a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h
index 42def2fe40d..d3d1a73bdea 100644
--- a/gcc/cfgloopmanip.h
+++ b/gcc/cfgloopmanip.h
@@ -34,6 +34,10 @@ enum
 					   a complete peeling.  */
 #define DLTHE_FLAG_FLAT_PROFILE 8	/* Profile is flat; do not reduce
 					   count by unroll factor.  */
+#define DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR 16 /* Assign hierarchical
+						      discriminators to
+						      distinguish loop
+						      iterations.  */
 extern edge mfb_kj_edge;
 
 extern bool remove_path (edge, bool * = NULL, bitmap = NULL);
diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
index 5c9b2fb77ff..b4fb5575b67 100644
--- a/gcc/gimple-loop-versioning.cc
+++ b/gcc/gimple-loop-versioning.cc
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-into-ssa.h"
 #include "gimple-range.h"
 #include "tree-cfg.h"
+#include "hierarchical_discriminator.h"
 
 namespace {
 
@@ -1699,6 +1700,13 @@ loop_versioning::version_loop (class loop *loop)
       return false;
     }
 
+  /* Assign hierarchical discriminators to distinguish loop versions.
+     This allows AutoFDO to distinguish profile data from different
+     versions.  */
+  assign_discriminators_to_loop (li.optimized_loop,
+				  DISCRIMINATOR_LOOP_VERSION_ALIGNED);
+  assign_discriminators_to_loop (loop, DISCRIMINATOR_LOOP_VERSION_UNALIGNED);
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
 		     "versioned this loop for when certain strides are 1\n");
diff --git a/gcc/hierarchical_discriminator.cc b/gcc/hierarchical_discriminator.cc
new file mode 100644
index 00000000000..ddd19718f0f
--- /dev/null
+++ b/gcc/hierarchical_discriminator.cc
@@ -0,0 +1,97 @@
+/* Copyright The GNU Toolchain Authors
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "gimple-iterator.h"
+#include "tree-cfg.h"
+#include "cfgloop.h"
+#include "hierarchical_discriminator.h"
+#include "cfghooks.h"
+
+/* Apply hierarchical discriminator values to the statements of BB.
+   For every statement that has a location and is not a debug statement,
+   the pass1 component becomes PASS1_VALUE when UPDATE_PASS1 is set and
+   the pass2 component becomes PASS2_VALUE when UPDATE_PASS2 is set.  The
+   base component is always carried over unchanged.  */
+
+void
+assign_discriminators_to_bb (basic_block bb,
+			      unsigned int pass1_value,
+			      unsigned int pass2_value,
+			      bool update_pass1,
+			      bool update_pass2)
+{
+  for (gimple_stmt_iterator it = gsi_start_bb (bb); !gsi_end_p (it);
+       gsi_next (&it))
+    {
+      gimple *cur = gsi_stmt (it);
+      location_t old_loc = gimple_location (cur);
+
+      /* Statements without a location and debug statements are left
+	 untouched.  */
+      if (old_loc != UNKNOWN_LOCATION && !is_gimple_debug (cur))
+	{
+	  unsigned int base_part, pass1_part, pass2_part;
+	  get_discriminator_components_from_loc (old_loc, &base_part,
+						 &pass1_part, &pass2_part);
+
+	  /* Override just the components the caller selected.  */
+	  pass1_part = update_pass1 ? pass1_value : pass1_part;
+	  pass2_part = update_pass2 ? pass2_value : pass2_part;
+
+	  location_t updated
+	    = location_with_discriminator_components (old_loc, base_part,
+						      pass1_part,
+						      pass2_part);
+	  gimple_set_location (cur, updated);
+	}
+    }
+}
+
+/* Give every statement in every basic block of LOOP the pass1
+   discriminator VERSION_ID, keeping its base and pass2 components.  Loop
+   versioning passes use this to tell the copies of a loop apart, so each
+   copy should get its own VERSION_ID in [1, DISCR_PASS1_MAX].  */
+
+void
+assign_discriminators_to_loop (class loop *loop, unsigned int version_id)
+{
+  /* Zero and values too wide for the pass1 field are both rejected.  */
+  gcc_assert (version_id > 0 && version_id <= DISCR_PASS1_MAX);
+
+  /* The block array from get_loop_body is released before returning.  */
+  basic_block *body = get_loop_body (loop);
+  const unsigned int n_blocks = loop->num_nodes;
+
+  for (unsigned int idx = 0; idx != n_blocks; ++idx)
+    {
+      /* Only pass1 is updated, so the pass2 argument is unused.  */
+      assign_discriminators_to_bb (body[idx], version_id, 0, true, false);
+    }
+
+  free (body);
+}
+
+
diff --git a/gcc/hierarchical_discriminator.h b/gcc/hierarchical_discriminator.h
new file mode 100644
index 00000000000..dd3cb1b0ae7
--- /dev/null
+++ b/gcc/hierarchical_discriminator.h
@@ -0,0 +1,75 @@
+/* Copyright The GNU Toolchain Authors
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#ifndef GCC_HIERARCHICAL_DISCRIMINATOR_H
+#define GCC_HIERARCHICAL_DISCRIMINATOR_H
+
+#include "gimple.h"
+#include "tree.h"
+#include "basic-block.h"
+#include "input.h"
+
+/* Hierarchical discriminator layout (32 bits total):
+   - Base: bits 0-11 (12 bits, 0-4095)
+   - Pass1: bits 12-23 (12 bits, 0-4095)
+   - Pass2: bits 24-31 (8 bits, 0-255)
+
+   Base discriminator: Used by front-end and early passes to distinguish
+		       different statements on the same source line.
+
+   Pass1 discriminator: Used by middle-end optimizations to distinguish
+			different versions/contexts of the same code:
+			- Loop versioning (vectorized vs scalar)
+			- Inlining contexts (different callsites)
+			- Function cloning
+
+   Pass2 discriminator: Used by late optimizations to distinguish
+			iterations or variants:
+			- Loop unrolling iterations
+			- Vectorization variants
+ */
+
+/* Loop versioning discriminators.  */
+#define DISCRIMINATOR_LOOP_VERSION_VECTORIZED  1  /* Vectorized version.  */
+#define DISCRIMINATOR_LOOP_VERSION_SCALAR      2  /* Scalar version.  */
+#define DISCRIMINATOR_LOOP_VERSION_ALIGNED     3  /* Aligned version.  */
+#define DISCRIMINATOR_LOOP_VERSION_UNALIGNED   4  /* Unaligned version.  */
+
+/* Loop transformation discriminators.  */
+#define DISCRIMINATOR_LOOP_UNROLLED	       5  /* Unrolled loop.  */
+#define DISCRIMINATOR_LOOP_PEELED	       6  /* Peeled loop.  */
+
+/* Helper function to assign discriminators to all statements in a basic
+   block.  This preserves the base discriminator and only updates the
+   requested components.  */
+extern void assign_discriminators_to_bb (basic_block bb,
+					  unsigned int pass1_value,
+					  unsigned int pass2_value,
+					  bool update_pass1,
+					  bool update_pass2);
+
+/* Helper function to assign pass1 discriminators to all basic blocks in
+   a loop.  This is used by loop versioning passes to distinguish
+   different versions of the same loop.  */
+extern void assign_discriminators_to_loop (class loop *loop,
+					    unsigned int version_id);
+
+#endif /* GCC_HIERARCHICAL_DISCRIMINATOR_H.  */
diff --git a/gcc/input.cc b/gcc/input.cc
index aad98394711..7d8cd31e304 100644
--- a/gcc/input.cc
+++ b/gcc/input.cc
@@ -1074,6 +1074,40 @@ get_discriminator_from_loc (location_t locus)
   return get_discriminator_from_loc (line_table, locus);
 }
 
+/* Return LOCUS with its discriminator rebuilt from the fields BASE, PASS1
+   and PASS2.  A field larger than its DISCR_*_MAX limit asserts.  */
+
+location_t
+location_with_discriminator_components (location_t locus,
+					unsigned int base,
+					unsigned int pass1,
+					unsigned int pass2)
+{
+  gcc_assert (base <= DISCR_BASE_MAX);
+  gcc_assert (pass1 <= DISCR_PASS1_MAX);
+  gcc_assert (pass2 <= DISCR_PASS2_MAX);
+  unsigned int packed = base << DISCR_BASE_SHIFT;
+  packed |= (pass1 << DISCR_PASS1_SHIFT) | (pass2 << DISCR_PASS2_SHIFT);
+  return location_with_discriminator (locus, packed);
+}
+
+/* Decode LOCUS's discriminator; a NULL BASE, PASS1 or PASS2 is skipped.  */
+
+void
+get_discriminator_components_from_loc (location_t locus,
+				       unsigned int *base,
+				       unsigned int *pass1,
+				       unsigned int *pass2)
+{
+  const unsigned int raw = get_discriminator_from_loc (locus);
+  if (base != NULL)
+    *base = (raw >> DISCR_BASE_SHIFT) & DISCR_BASE_MASK;
+  if (pass1 != NULL)
+    *pass1 = (raw >> DISCR_PASS1_SHIFT) & DISCR_PASS1_MASK;
+  if (pass2 != NULL)
+    *pass2 = (raw >> DISCR_PASS2_SHIFT) & DISCR_PASS2_MASK;
+}
+
 #if CHECKING_P
 
 namespace selftest {
diff --git a/gcc/input.h b/gcc/input.h
index 4d2d7741592..af30f314be5 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -89,6 +89,42 @@ extern location_t location_with_discriminator (location_t, int);
 extern bool has_discriminator (location_t);
 extern int get_discriminator_from_loc (location_t);
 
+/* Hierarchical discriminator support for AutoFDO.
+   Layout, least significant bits first: Base (12) | Pass1 (12) | Pass2 (8)
+   - Base: Traditional same-line disambiguation
+   - Pass1: Optimization context (e.g., inline callsite hash)
+   - Pass2: Code duplication (e.g., loop unroll iteration)  */
+
+/* Discriminator bit layout constants.  */
+#define DISCR_BASE_BITS 12
+#define DISCR_PASS1_BITS 12
+#define DISCR_PASS2_BITS 8
+
+#define DISCR_BASE_MASK ((1u << DISCR_BASE_BITS) - 1)
+#define DISCR_PASS1_MASK ((1u << DISCR_PASS1_BITS) - 1)
+#define DISCR_PASS2_MASK ((1u << DISCR_PASS2_BITS) - 1)
+
+#define DISCR_BASE_SHIFT 0
+#define DISCR_PASS1_SHIFT DISCR_BASE_BITS
+#define DISCR_PASS2_SHIFT (DISCR_BASE_BITS + DISCR_PASS1_BITS)
+
+/* Maximum values for each discriminator field.  */
+#define DISCR_BASE_MAX DISCR_BASE_MASK
+#define DISCR_PASS1_MAX DISCR_PASS1_MASK
+#define DISCR_PASS2_MAX DISCR_PASS2_MASK
+
+/* Create location with hierarchical discriminator.  */
+extern location_t location_with_discriminator_components (location_t,
+							  unsigned int base,
+							  unsigned int pass1,
+							  unsigned int pass2);
+
+/* Get discriminator components from location.  */
+extern void get_discriminator_components_from_loc (location_t,
+						   unsigned int *base,
+						   unsigned int *pass1,
+						   unsigned int *pass2);
+
 #define LOCATION_FILE(LOC) ((expand_location (LOC)).file)
 #define LOCATION_LINE(LOC) ((expand_location (LOC)).line)
 #define LOCATION_COLUMN(LOC)((expand_location (LOC)).column)
diff --git a/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c b/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c
new file mode 100644
index 00000000000..9690d664197
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/hierarchical-discriminator-unroll.c
@@ -0,0 +1,35 @@
+/* Test that loop unrolling assigns pass2 discriminators for iterations.
+   { dg-do compile }
+   { dg-options "-S -O2 -g  -fno-tree-vectorize" } */
+
+int a[100];
+int
+test_unroll (void)
+{
+  int sum = 0;
+  int i;
+  
+  /* Trip count 4 matches the unroll pragma; expect complete unrolling.  */
+  #pragma GCC unroll 4
+  for (i = 0; i < 4; i++)
+    {
+      /* dg-final matches the asm below (line 17, column 7); its unrolled copies should get pass2=1,2,3,4.  */
+      asm ("nop");
+      sum += a[i] * 2; 
+    }
+  
+  return sum;
+}
+
+/* Expected discriminators from the assembly (hierarchical format: base:pass1:pass2):
+   - discriminator 16777216 (0x1000000) = base:0, pass1:0, pass2:1 - first unrolled iteration
+   - discriminator 33554432 (0x2000000) = base:0, pass1:0, pass2:2 - second unrolled iteration
+   - discriminator 50331648 (0x3000000) = base:0, pass1:0, pass2:3 - third unrolled iteration
+   - discriminator 67108864 (0x4000000) = base:0, pass1:0, pass2:4 - fourth unrolled iteration
+   Note: pass2 values represent the iteration number in the unrolled loop
+*/
+
+/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 16777216" } } */
+/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 33554432" } } */
+/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 50331648" } } */
+/* { dg-final { scan-assembler "\\.loc 1 17 7 is_stmt 0 discriminator 67108864" } } */
diff --git a/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c b/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c
new file mode 100644
index 00000000000..d9b5ebb584c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/hierarchical-discriminator-vect-version.c
@@ -0,0 +1,28 @@
+/* Test that loop versioning for vectorization assigns pass1 discriminators.
+   { dg-do compile }
+   { dg-options "-O3 -g -ftree-vectorize" }
+   { dg-require-effective-target vect_int }
+    */
+
+void
+test_vectorize (int *a, int *b, int *c, int n)
+//test_vectorize (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[i] = b[i] + c[i];  /* dg-final matches this line (14): expect pass1=1 (vectorized) and pass1=2 (scalar).  */
+    }
+}
+
+/* Check that .loc directives with discriminators are present.
+   Format: .loc file line column discriminator
+   Expected discriminators from the assembly (hierarchical format: base:pass1:pass2):
+   - discriminator 4096 (0x1000) = base:0, pass1:1, pass2:0 - vectorized version
+   - discriminator 8192 (0x2000) = base:0, pass1:2, pass2:0 - scalar version
+   - discriminator 16781312 (0x1001000) = base:0, pass1:1, pass2:1 - scalar remainder first iteration
+   - discriminator 33558528 (0x2001000) = base:0, pass1:1, pass2:2 - scalar remainder second iteration
+*/
+
+/* { dg-final { scan-assembler "\\.loc 1 14 15 is_stmt 0 discriminator 4096" } } */
+/* { dg-final { scan-assembler "\\.loc 1 14 19 is_stmt 0 discriminator 8192" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index ca6295c7de2..fe774454bf5 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -65,6 +65,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-sccvn.h"
 #include "tree-vectorizer.h" /* For find_loop_location */
 #include "dbgcnt.h"
+#include "hierarchical_discriminator.h"
 
 /* Specifies types of loops that may be unrolled.  */
 
@@ -980,7 +981,8 @@ try_unroll_loop_completely (class loop *loop,
       if (!gimple_duplicate_loop_body_to_header_edge (
 	    loop, loop_preheader_edge (loop), n_unroll, wont_exit, exit,
 	    &edges_to_remove,
-	    DLTHE_FLAG_UPDATE_FREQ | DLTHE_FLAG_COMPLETTE_PEEL))
+	    DLTHE_FLAG_UPDATE_FREQ | DLTHE_FLAG_COMPLETTE_PEEL
+	    | DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR))
 	{
           free_original_copy_tables ();
 	  if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1222,7 +1224,8 @@ try_peel_loop (class loop *loop,
 
   if (!gimple_duplicate_loop_body_to_header_edge (
 	loop, loop_preheader_edge (loop), npeel, wont_exit, exit,
-	&edges_to_remove, DLTHE_FLAG_UPDATE_FREQ))
+	&edges_to_remove,
+	DLTHE_FLAG_UPDATE_FREQ | DLTHE_RECORD_HIERARCHICAL_DISCRIMINATOR))
     {
       free_original_copy_tables ();
       return false;
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 20141dbc2e5..6de9fcd5746 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -53,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "langhooks.h"
 #include "tree-vector-builder.h"
 #include "optabs-tree.h"
+#include "hierarchical_discriminator.h"
 
 /*************************************************************************
   Simple Loop Peeling Utilities
@@ -4359,6 +4360,14 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
       gcc_assert (nloop);
       nloop = get_loop_copy (loop);
 
+      /* Assign hierarchical discriminators to distinguish loop versions.
+	 This allows AutoFDO to distinguish profile data from different
+	 versions.  */
+      assign_discriminators_to_loop (loop,
+				      DISCRIMINATOR_LOOP_VERSION_VECTORIZED);
+      assign_discriminators_to_loop (nloop,
+				      DISCRIMINATOR_LOOP_VERSION_SCALAR);
+
       /* For cycle vectorization with SLP we rely on the PHI arguments
 	 appearing in the same order as the SLP node operands which for the
 	 loop PHI nodes means the preheader edge dest index needs to remain
-- 
2.34.1

