Hi,

Attached is a WIP/PoC fix addressing the O(N^2) behavior in ANALYZE with
high statistic target values. It needs more work, but it's good enough to
show some measurements.

For benchmark, I've created a simple 2-column table, with MCV list on
those two columns:

 CREATE TABLE t (a int, b int);
 CREATE STATISTICS s (mcv) ON a,b FROM t;

and then loaded data sets with different numbers of random combinations,
determined by number of values in each column. For example with 10 values
in a column, you get ~100 combinations.

 INSERT INTO t
 SELECT 10*random(), 10*random() FROM generate_series(1,3e6);

The 3M rows is picked because that's the sample size with target 10000.

The results with different statistic targets look like this:

1) master

   values        100       1000        5000       10000
   ====================================================
       10        103        586        2419        3041
      100        116        789        4778        8934
     1000        116        690        3162      499748

2) patched

   values        100       1000        5000       10000
   ====================================================
       10        113        606        2460        3716
      100        143        711        3371        5231
     1000        156        994        3836        6002

3) comparison (patched / master)

   values        100       1000        5000       10000
   ====================================================
       10       110%       103%        102%        122%
      100       123%        90%         71%         59%
     1000       134%       144%        121%          1%


So clearly, the issue for large statistic targets is resolved (duration
drops from 500s to just 6s), but there is measurable regression for the
other cases. That needs more investigation & fix before commit.


regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 5fe61ea0a4..6dc4ed37d8 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int 
numattrs);
 static SortItem *build_distinct_groups(int numrows, SortItem *items,
                                                                           
MultiSortSupport mss, int *ndistinct);
 
+static SortItem **compute_column_counts(SortItem *groups, int ngroups,
+                                                                               
MultiSortSupport mss, int *ncounts);
+
 static int     count_distinct_groups(int numrows, SortItem *items,
                                                                  
MultiSortSupport mss);
 
@@ -172,6 +175,8 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset 
*attrs,
        SortItem   *groups;
        MCVList    *mcvlist = NULL;
        MultiSortSupport mss;
+       SortItem  **counts;
+       int                *ncounts;
 
        attnums = build_attnums_array(attrs, &numattrs);
 
@@ -188,6 +193,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset 
*attrs,
        /* transform the sorted rows into groups (sorted by frequency) */
        groups = build_distinct_groups(nitems, items, mss, &ngroups);
 
+       /* compute frequencies for values in each column */
+       ncounts = (int *) palloc0(sizeof(int) * numattrs);
+       counts = compute_column_counts(groups, ngroups, mss, ncounts);
+
        /*
         * Maximum number of MCV items to store, based on the attribute with the
         * largest stats target (and the number of groups we have available).
@@ -242,6 +251,16 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset 
*attrs,
        if (nitems > 0)
        {
                int                     j;
+               SortItem        key;
+               MultiSortSupport        tmp;
+
+               /* used to search values */
+               tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, 
ssup)
+                                                                               
+ sizeof(SortSupportData));
+
+               /* space for search key */
+               key.values = palloc(sizeof(Datum));
+               key.isnull = palloc(sizeof(bool));
 
                /*
                 * Allocate the MCV list structure, set the global parameters.
@@ -281,16 +300,21 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset 
*attrs,
                        item->base_frequency = 1.0;
                        for (j = 0; j < numattrs; j++)
                        {
-                               int                     count = 0;
-                               int                     k;
+                               SortItem   *result;
 
-                               for (k = 0; k < ngroups; k++)
-                               {
-                                       if (multi_sort_compare_dim(j, 
&groups[i], &groups[k], mss) == 0)
-                                               count += groups[k].count;
-                               }
+                               /* single dimension */
+                               tmp->ndims = 1;
+                               tmp->ssup[0] = mss->ssup[j];
+
+                               /* fill search key */
+                               key.values[0] = groups[i].values[j];
+                               key.isnull[0] = groups[i].isnull[j];
+
+                               result = (SortItem *) bsearch_arg(&key, 
counts[j], ncounts[j],
+                                                                               
                  sizeof(SortItem),
+                                                                               
                  multi_sort_compare, tmp);
 
-                               item->base_frequency *= (double) count / 
numrows;
+                               item->base_frequency *= ((double) 
result->count) / numrows;
                        }
                }
        }
@@ -419,6 +443,61 @@ build_distinct_groups(int numrows, SortItem *items, 
MultiSortSupport mss,
        return groups;
 }
 
+static SortItem **
+compute_column_counts(SortItem *groups, int ngroups, MultiSortSupport mss,
+                                         int *ncounts)
+{
+       int                     i,
+                               j;
+       SortItem  **result;
+       MultiSortSupport        tmp;
+
+       result = (SortItem **) palloc(sizeof(SortItem *) * mss->ndims);
+       tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
+                                                                        + 
sizeof(SortSupportData));
+
+       for (i = 0; i < mss->ndims; i++)
+       {
+               result[i] = (SortItem *) palloc0(sizeof(SortItem) * ngroups);
+
+               /* single dimension */
+               tmp->ndims = 1;
+               tmp->ssup[0] = mss->ssup[i];
+
+               /* extract data for the dimension */
+               for (j = 0; j < ngroups; j++)
+               {
+                       result[i][j].values = palloc(sizeof(Datum));
+                       result[i][j].isnull = palloc(sizeof(bool));
+
+                       result[i][j].values[0] = groups[j].values[i];
+                       result[i][j].isnull[0] = groups[j].isnull[i];
+                       result[i][j].count = groups[j].count;
+               }
+
+               /* sort the values, deduplicate */
+               qsort_arg((void *) result[i], ngroups, sizeof(SortItem),
+                                 multi_sort_compare, tmp);
+
+               ncounts[i] = 1;
+               for (j = 1; j < ngroups; j++)
+               {
+                       if (multi_sort_compare(&result[i][(ncounts[i] - 1)], 
&result[i][j], tmp) == 0)
+                       {
+                               result[i][(ncounts[i] - 1)].count += 
result[i][j].count;
+                               continue;
+                       }
+
+                       /* */
+                       if (ncounts[i] != j)
+                               result[i][ncounts[i]] = result[i][j];
+
+                       ncounts[i]++;
+               }
+       }
+
+       return result;
+}
 
 /*
  * statext_mcv_load

Attachment: test.sql
Description: application/sql

Reply via email to