Hi Honza, > -----Original Message----- > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches- > ow...@gcc.gnu.org] On Behalf Of Jan Hubicka > Sent: Thursday, October 26, 2017 12:49 AM > To: gcc-patches@gcc.gnu.org > Subject: Add scatter/gather costs > > Hi, > this patch adds computation of scatter/gather to i386 cost metric. > The costs for core are set for haswell, skylake has better implementation so I > will have to split the cost tables for cores older and younger than skylake. I > will do that as a followup. > > Bootstrapped/regtested x86_64-linux, comitted. > > Honza > > * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather > cost correctly. > * i386.h (processor_costs): Add gather_static, gather_per_elt, > scatter_static, scatter_per_elt. > * x86-tune-costs.h: Add new cost entries. > Index: config/i386/i386.c > ========================================================== > ========= > --- config/i386/i386.c (revision 254073) > +++ config/i386/i386.c (working copy) > @@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve > /* We should have separate costs for unaligned loads and > gather/scatter. > Do that incrementally. 
*/ > case unaligned_load: > - case vector_gather_load: > index = sse_store_index (mode); > return ix86_vec_cost (mode, > COSTS_N_INSNS > @@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve > true); > > case unaligned_store: > - case vector_scatter_store: > index = sse_store_index (mode); > return ix86_vec_cost (mode, > COSTS_N_INSNS > (ix86_cost->sse_unaligned_store[index]) / 2, > true); > > + case vector_gather_load: > + return ix86_vec_cost (mode, > + COSTS_N_INSNS > + (ix86_cost->gather_static > + + ix86_cost->gather_per_elt > + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, > + true); > + > + case vector_scatter_store: > + return ix86_vec_cost (mode, > + COSTS_N_INSNS > + (ix86_cost->scatter_static > + + ix86_cost->scatter_per_elt > + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, > + true); > + > case cond_branch_taken: > return ix86_cost->cond_taken_branch_cost; > > Index: config/i386/i386.h > ========================================================== > ========= > --- config/i386/i386.h (revision 254073) > +++ config/i386/i386.h (working copy) > @@ -253,6 +253,10 @@ struct processor_costs { > const int mmxsse_to_integer; /* cost of moving mmxsse register to > integer. */ > const int ssemmx_to_integer; /* cost of moving integer to mmxsse > register. */ > + const int gather_static, gather_per_elt; /* Cost of gather load is computed > + as static + per_item * nelts. */ > + const int scatter_static, scatter_per_elt; /* Cost of gather store is > + computed as static + per_item * nelts. */ > const int l1_cache_size; /* size of l1 cache, in kilobytes. */ > const int l2_cache_size; /* size of l2 cache, in kilobytes. */ > const int prefetch_block; /* bytes moved to cache for prefetch. 
*/ > Index: config/i386/x86-tune-costs.h > ========================================================== > ========= > --- config/i386/x86-tune-costs.h (revision 254073) > +++ config/i386/x86-tune-costs.h (working copy) > @@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost = > {3, 3, 3, 3, 3}, /* cost of unaligned SSE store > in 128bit, 256bit and 512bit */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 5, 0, /* Gather load static, per_elt. > */ > + 5, 0, /* Gather store static, > per_elt. */ > 0, /* size of l1 cache */ > 0, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /* > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 0, /* size of l1 cache */ > 0, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /* > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 4, /* size of l1 cache. 486 has 8kB cache > shared for code and data, so 4kB is > not really precise. */ > @@ -334,6 +340,8 @@ struct processor_costs pentium_cost = { > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 8, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = { > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 8, /* size of l2 cache */ > 0, /* size of prefetch block */ > @@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost = > in 32,64,128,256 and 512-bit */ > {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ > 3, 3, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 8, /* size of l1 cache. */ > 256, /* size of l2 cache */ > 32, /* size of prefetch block */ > @@ -584,6 +596,8 @@ struct processor_costs geode_cost = { > in 32,64,128,256 and 512-bit */ > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves */ > + 2, 2, /* Gather load static, per_elt. > */ > + 2, 2, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 128, /* size of l2 cache. */ > 32, /* size of prefetch block */ > @@ -666,6 +680,8 @@ struct processor_costs k6_cost = { > in 32,64,128,256 and 512-bit */ > {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves */ > + 2, 2, /* Gather load static, per_elt. > */ > + 2, 2, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 32, /* size of l2 cache. Some > models > have integrated l2 cache, but > @@ -754,6 +770,8 @@ struct processor_costs athlon_cost = { > in 32,64,128,256 and 512-bit */ > {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > 5, 5, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -844,6 +862,8 @@ struct processor_costs k8_cost = { > in 32,64,128,256 and 512-bit */ > {4, 4, 5, 10, 20}, /* cost of unaligned stores. 
*/ > 5, 5, /* SSE->integer and integer->SSE > moves */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = { > 1/1 1/1 > MOVD reg32, xmmreg Double > FADD 3 > 1/1 1/1 */ > + 4, 4, /* Gather load static, per_elt. > */ > + 4, 4, /* Gather store static, > per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 20, 30}, /* cost of unaligned stores. 
*/ > 16, 20, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 16, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = { > in 32,64,128,256 and 512-bit. */ > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > 6, 6, /* SSE->integer and integer->SSE > moves. */ > + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > + throughput 12. Approx 9 uops do not depend on vector size and every > load > + is 7 uops. */ > + 18, 8, /* Gather load static, per_elt. */ > + 18, 10, /* Gather store static, per_elt. */
Could you please explain how you arrived at 18 for the gather load/store static cost (was it derived from the throughput numbers)? And is the per_elt value of 8 computed as the load latency (4) multiplied by 2 for the reg-reg move? > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block. */ > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > 14, 14, /* SSE->integer and integer->SSE > moves */ > + 10, 10, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost > in 32,64,128,256 and 512-bit */ > {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > 14, 14, /* SSE->integer and integer->SSE > moves */ > + 10, 10, /* Gather load static, per_elt. */ > + 10, 10, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > 2048, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = { > in 32,64,128,256 and 512-bit */ > {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ > 20, 12, /* SSE->integer and integer->SSE > moves */ > + 16, 16, /* Gather load static, per_elt. */ > + 16, 16, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = { > in 32,64,128,256 and 512-bit */ > {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ > 20, 12, /* SSE->integer and integer->SSE > moves */ > + 12, 12, /* Gather load static, per_elt. */ > + 12, 12, /* Gather store static, per_elt. */ > 8, /* size of l1 cache. */ > 1024, /* size of l2 cache. 
*/ > 64, /* size of prefetch block */ > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = { > in 32,64,128,256 and 512-bit */ > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 8, 6, /* SSE->integer and integer->SSE > moves */ > + 8, 8, /* Gather load static, per_elt. > */ > + 8, 8, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = { > in 32,64,128,256 and 512-bit */ > {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ > 8, 6, /* SSE->integer and integer->SSE > moves */ > + 8, 8, /* Gather load static, per_elt. > */ > + 8, 8, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ > 4, 4, /* SSE->integer and integer->SSE > moves */ > + 6, 6, /* Gather load static, per_elt. > */ > + 6, 6, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 256, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = { > in 32,64,128,256 and 512-bit */ > {10, 10, 10, 15, 20}, /* cost of unaligned storess. > */ > 20, 20, /* SSE->integer and integer->SSE > moves */ > + 6, 6, /* Gather load static, per_elt. > */ > + 6, 6, /* Gather store static, > per_elt. */ > 32, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ > @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = { > in 32,64,128,256 and 512-bit */ > {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ > 2, 2, /* SSE->integer and integer->SSE > moves */ > + /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, > + rec. throughput 6. > + So 5 uops statically and one uops per load. 
*/ > + 10, 6, /* Gather load static, per_elt. */ > + 10, 6, /* Gather store static, per_elt. */ > 64, /* size of l1 cache. */ > 512, /* size of l2 cache. */ > 64, /* size of prefetch block */ Regards, Venkat.