Hi Honza, > -----Original Message----- > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches- > ow...@gcc.gnu.org] On Behalf Of Jan Hubicka > Sent: Thursday, October 26, 2017 12:49 AM > To: gcc-patches@gcc.gnu.org > Subject: Add scatter/gather costs > > Hi, > this patch adds computation of scatter/gather to i386 cost metric. > The costs for core are set for haswell, skylake has better implementation so I > will have to split the cost tables for cores older and younger than skylake. I > will do that as a followup. > > Bootstrapped/regtested x86_64-linux, comitted. > > Honza > > * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather > cost correctly. > * i386.h (processor_costs): Add gather_static, gather_per_elt, > scatter_static, scatter_per_elt. > * x86-tune-costs.h: Add new cost entries. > Index: config/i386/i386.c > ========================================================== > ========= > --- config/i386/i386.c (revision 254073) > +++ config/i386/i386.c (working copy) > @@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve > /* We should have separate costs for unaligned loads and > gather/scatter. > Do that incrementally. 
*/ > case unaligned_load: > - case vector_gather_load: > index = sse_store_index (mode); > return ix86_vec_cost (mode, > COSTS_N_INSNS > @@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve > true); > > case unaligned_store: > - case vector_scatter_store: > index = sse_store_index (mode); > return ix86_vec_cost (mode, > COSTS_N_INSNS > (ix86_cost->sse_unaligned_store[index]) / 2, > true); > > + case vector_gather_load: > + return ix86_vec_cost (mode, > + COSTS_N_INSNS > + (ix86_cost->gather_static > + + ix86_cost->gather_per_elt > + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, > + true); > + > + case vector_scatter_store: > + return ix86_vec_cost (mode, > + COSTS_N_INSNS > + (ix86_cost->scatter_static > + + ix86_cost->scatter_per_elt > + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, > + true); > + > case cond_branch_taken: > return ix86_cost->cond_taken_branch_cost; > > Index: config/i386/i386.h > ========================================================== > ========= > --- config/i386/i386.h (revision 254073) > +++ config/i386/i386.h (working copy) > @@ -253,6 +253,10 @@ struct processor_costs { > const int mmxsse_to_integer; /* cost of moving mmxsse register to > integer. */ > const int ssemmx_to_integer; /* cost of moving integer to mmxsse > register. */ > + const int gather_static, gather_per_elt; /* Cost of gather load is computed > + as static + per_item * nelts. */ > + const int scatter_static, scatter_per_elt; /* Cost of gather store is > + computed as static + per_item * nelts. */ > const int l1_cache_size; /* size of l1 cache, in kilobytes. */ > const int l2_cache_size; /* size of l2 cache, in kilobytes. */ > const int prefetch_block; /* bytes moved to cache for prefetch. 
*/ > Index: config/i386/x86-tune-costs.h > ========================================================== > ========= > --- config/i386/x86-tune-costs.h (revision 254073) > +++ config/i386/x86-tune-costs.h (working copy) > @@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost = > {3, 3, 3, 3, 3}, /* cost of unaligned SSE store > in 128bit, 256bit and 512bit */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 5, 0, /* Gather load static, per_elt. > */ > + 5, 0, /* Gather store static, > per_elt. */ > 0, /* size of l1 cache */ > 0, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /* > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 0, /* size of l1 cache */ > 0, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /* > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 4, /* size of l1 cache. 486 has 8kB cache > shared for code and data, so 4kB is > not really precise. */ > @@ -334,6 +340,8 @@ struct processor_costs pentium_cost = { > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 8, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = { > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 8, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost = > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 256, /* size of l2 cache */ > 32, /* size of prefetch block */ > @@ -584,6 +596,8 @@ struct processor_costs geode_cost = { > in 32,64,128,256 and 512-bit */ > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves */ > + 2, 2, /* Gather load static, per_elt. > */ > + 2, 2, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 128, /* size of l2 cache. */ > 32, /* size of prefetch block */ > @@ -666,6 +680,8 @@ struct processor_costs k6_cost = { > in 32,64,128,256 and 512-bit */ > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves */ > + 2, 2, /* Gather load static, per_elt. > */ > + 2, 2, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 32, /* size of l2 cache. Some > models > have integrated l2 cache, but > @@ -754,6 +770,8 @@ struct processor_costs athlon_cost = { > in 32,64,128,256 and 512-bit */ > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > 5, 5, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -844,6 +862,8 @@ struct processor_costs k8_cost = { > in 32,64,128,256 and 512-bit */ > {4, 4, 5, 10, 20}, /* cost of unaligned stores. 
*/ > 5, 5, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = { > 1/1 1/1 > MOVD reg32, xmmreg Double > FADD 3 > 1/1 1/1 */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. 
*/ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = { > in 32,64,128,256 and 512-bit. */ > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves. */ > + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > + throughput 12. Approx 9 uops do not depend on vector size and every > load > + is 7 uops. */ > + 18, 8, /* Gather load static, per_elt. */ > + 18, 10, /* Gather store static, per_elt. */
Could you please explain how you arrived at 18 for the gather load/store static cost (was it derived from the throughput numbers)? And is the per_elt value of 8 computed as the load latency (4) multiplied by 2 for the reg-reg move? > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block. */ > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > 14, 14, /* SSE->integer and integer->SSE > moves */ > + 10, 10, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > 14, 14, /* SSE->integer and integer->SSE > moves */ > + 10, 10, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = { > in 32,64,128,256 and 512-bit */ > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > 20, 12, /* SSE->integer and integer->SSE > moves */ > + 16, 16, /* Gather load static, per_elt. */ > + 16, 16, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = { > in 32,64,128,256 and 512-bit */ > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > 20, 12, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 12, 12, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > 1024, /* size of l2 cache. 
*/ > 64, /* size of prefetch block */ > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = { > in 32,64,128,256 and 512-bit */ > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 8, 6, /* SSE->integer and integer->SSE > moves */ > + 8, 8, /* Gather load static, per_elt. > */ > + 8, 8, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = { > in 32,64,128,256 and 512-bit */ > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 8, 6, /* SSE->integer and integer->SSE > moves */ > + 8, 8, /* Gather load static, per_elt. > */ > + 8, 8, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > 4, 4, /* SSE->integer and integer->SSE > moves */ > + 6, 6, /* Gather load static, per_elt. > */ > + 6, 6, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 15, 20}, /* cost of unaligned storess. > */ > 20, 20, /* SSE->integer and integer->SSE > moves */ > + 6, 6, /* Gather load static, per_elt. > */ > + 6, 6, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = { > in 32,64,128,256 and 512-bit */ > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > 2, 2, /* SSE->integer and integer->SSE > moves */ > + /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > + rec. throughput 6. > + So 5 uops statically and one uops per load. 
*/ > + 10, 6, /* Gather load static, per_elt. */ > + 10, 6, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ Regards, Venkat.