Hi, all
Currently, GCC simply ignores all data prefetches within
a loop when the number of prefetches exceeds
SIMULTANEOUS_PREFETCHES, which is not advisable.
Also, the values of the data prefetching macros defined
in ia64.h are too small.
This patch modifies the data prefetch algorithm
defined in loop.c and redefines some macros in ia64.h
accordingly. Testing shows that a 2.5 percent performance
improvement is gained for the SPEC CFP2000 benchmarks on
IA-64. If the new loop unroller were implemented perfectly
(just like the old one, which was removed), much
greater performance improvements would be gained.
Canqun Yang
Creative Compiler Research Group.
National University of Defense Technology, China.
2005-03-25 Canqun Yang <[EMAIL PROTECTED]>
* config/ia64/ia64.h (SIMULTANEOUS_PREFETCHES): Redefine as 18.
(PREFETCH_BLOCK): Redefine as 64.
(PREFETCH_BLOCKS_BEFORE_LOOP_MAX): New definition.
2005-03-25 Canqun Yang <[EMAIL PROTECTED]>
* loop.c (PREFETCH_BLOCKS_BEFORE_LOOP_MAX): Define conditionally.
(scan_loop): Change extra_size from 16 to 128.
(emit_prefetch_instructions): Don't ignore all prefetches within loop.
Index: loop.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/loop.c,v
retrieving revision 1.522
diff -c -3 -p -r1.522 loop.c
*** loop.c 17 Jan 2005 08:46:15 -0000 1.522
--- loop.c 25 Mar 2005 12:03:44 -0000
*************** struct loop_info
*** 434,440 ****
--- 434,442 ----
#define MAX_PREFETCHES 100
/* The number of prefetch blocks that are beneficial to fetch at once before
a loop with a known (and low) iteration count. */
+ #ifndef PREFETCH_BLOCKS_BEFORE_LOOP_MAX
#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 6
+ #endif
/* For very tiny loops it is not worthwhile to prefetch even before the loop,
since it is likely that the data are already in the cache. */
#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 2
*************** scan_loop (struct loop *loop, int flags)
*** 1100,1106 ****
/* Allocate extra space for REGs that might be created by load_mems.
We allocate a little extra slop as well, in the hopes that we
won't have to reallocate the regs array. */
! loop_regs_scan (loop, loop_info->mems_idx + 16);
insn_count = count_insns_in_loop (loop);
if (loop_dump_stream)
--- 1102,1108 ----
/* Allocate extra space for REGs that might be created by load_mems.
We allocate a little extra slop as well, in the hopes that we
won't have to reallocate the regs array. */
! loop_regs_scan (loop, loop_info->mems_idx + 128);
insn_count = count_insns_in_loop (loop);
if (loop_dump_stream)
*************** emit_prefetch_instructions (struct loop
*** 4398,4406 ****
{
if (loop_dump_stream)
fprintf (loop_dump_stream,
! "Prefetch: ignoring prefetches within loop: ahead is zero; %d < %d\n",
SIMULTANEOUS_PREFETCHES, num_real_prefetches);
! num_real_prefetches = 0, num_real_write_prefetches = 0;
}
}
/* We'll also use AHEAD to determine how many prefetch instructions to
--- 4400,4411 ----
{
if (loop_dump_stream)
fprintf (loop_dump_stream,
! "Prefetch: ignoring some prefetches within loop: ahead is zero; %d < %d\n",
SIMULTANEOUS_PREFETCHES, num_real_prefetches);
! num_real_prefetches = MIN (num_real_prefetches,
! SIMULTANEOUS_PREFETCHES);
! num_real_write_prefetches = MIN (num_real_write_prefetches,
! SIMULTANEOUS_PREFETCHES);
}
}
/* We'll also use AHEAD to determine how many prefetch instructions to
Index: config/ia64/ia64.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.h,v
retrieving revision 1.194
diff -c -3 -p -r1.194 ia64.h
*** config/ia64/ia64.h 17 Mar 2005 17:35:16 -0000 1.194
--- config/ia64/ia64.h 25 Mar 2005 12:05:05 -0000
*************** do { \
*** 1993,2004 ****
??? This number is bogus and needs to be replaced before the value is
actually used in optimizations. */
! #define SIMULTANEOUS_PREFETCHES 6
/* If this architecture supports prefetch, define this to be the size of
the cache line that is prefetched. */
! #define PREFETCH_BLOCK 32
#define HANDLE_SYSV_PRAGMA 1
--- 1993,2008 ----
??? This number is bogus and needs to be replaced before the value is
actually used in optimizations. */
! #define SIMULTANEOUS_PREFETCHES 18
/* If this architecture supports prefetch, define this to be the size of
the cache line that is prefetched. */
! #define PREFETCH_BLOCK 64
!
! /* The number of prefetch blocks that are beneficial to fetch at once before
! a loop. */
! #define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 18
#define HANDLE_SYSV_PRAGMA 1