Re: [HACKERS] Bogus ANALYZE results for an otherwise-unique column with many nulls

Tom Lane Thu, 04 Aug 2016 16:01:56 -0700

I wrote:
> Looking around, there are a couple of places outside commands/analyze.c
> that are making the same mistake, so this patch isn't complete, but it
> illustrates what needs to be done.


Here's a more complete patch.

                        regards, tom lane

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5fcedd7..9ac7122 100644
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** compute_distinct_stats(VacAttrStatsP sta
*** 2049,2056 ****
  
  		if (nmultiple == 0)
  		{
! 			/* If we found no repeated values, assume it's a unique column */
! 			stats->stadistinct = -1.0;
  		}
  		else if (track_cnt < track_max && toowide_cnt == 0 &&
  				 nmultiple == track_cnt)
--- 2049,2059 ----
  
  		if (nmultiple == 0)
  		{
! 			/*
! 			 * If we found no repeated non-null values, assume it's a unique
! 			 * column; but be sure to discount for any nulls we found.
! 			 */
! 			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  		}
  		else if (track_cnt < track_max && toowide_cnt == 0 &&
  				 nmultiple == track_cnt)
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2426,2433 ****
  
  		if (nmultiple == 0)
  		{
! 			/* If we found no repeated values, assume it's a unique column */
! 			stats->stadistinct = -1.0;
  		}
  		else if (toowide_cnt == 0 && nmultiple == ndistinct)
  		{
--- 2429,2439 ----
  
  		if (nmultiple == 0)
  		{
! 			/*
! 			 * If we found no repeated non-null values, assume it's a unique
! 			 * column; but be sure to discount for any nulls we found.
! 			 */
! 			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  		}
  		else if (toowide_cnt == 0 && nmultiple == ndistinct)
  		{
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2753,2759 ****
  		else
  			stats->stawidth = stats->attrtype->typlen;
  		/* Assume all too-wide values are distinct, so it's a unique column */
! 		stats->stadistinct = -1.0;
  	}
  	else if (null_cnt > 0)
  	{
--- 2759,2765 ----
  		else
  			stats->stawidth = stats->attrtype->typlen;
  		/* Assume all too-wide values are distinct, so it's a unique column */
! 		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  	}
  	else if (null_cnt > 0)
  	{
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 0f851ea..817453c 100644
*** a/src/backend/tsearch/ts_typanalyze.c
--- b/src/backend/tsearch/ts_typanalyze.c
*************** compute_tsvector_stats(VacAttrStats *sta
*** 295,301 ****
  		stats->stawidth = total_width / (double) nonnull_cnt;
  
  		/* Assume it's a unique column (see notes above) */
! 		stats->stadistinct = -1.0;
  
  		/*
  		 * Construct an array of the interesting hashtable items, that is,
--- 295,301 ----
  		stats->stawidth = total_width / (double) nonnull_cnt;
  
  		/* Assume it's a unique column (see notes above) */
! 		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  
  		/*
  		 * Construct an array of the interesting hashtable items, that is,
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index fcb71d3..56504fc 100644
*** a/src/backend/utils/adt/rangetypes_typanalyze.c
--- b/src/backend/utils/adt/rangetypes_typanalyze.c
*************** compute_range_stats(VacAttrStats *stats,
*** 203,209 ****
  		/* Do the simple null-frac and width stats */
  		stats->stanullfrac = (double) null_cnt / (double) samplerows;
  		stats->stawidth = total_width / (double) non_null_cnt;
! 		stats->stadistinct = -1.0;
  
  		/* Must copy the target values into anl_context */
  		old_cxt = MemoryContextSwitchTo(stats->anl_context);
--- 203,211 ----
  		/* Do the simple null-frac and width stats */
  		stats->stanullfrac = (double) null_cnt / (double) samplerows;
  		stats->stawidth = total_width / (double) non_null_cnt;
! 
! 		/* Estimate that non-null values are unique */
! 		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  
  		/* Must copy the target values into anl_context */
  		old_cxt = MemoryContextSwitchTo(stats->anl_context);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cc2a9a1..56943f2 100644
*** a/src/backend/utils/adt/selfuncs.c
--- b/src/backend/utils/adt/selfuncs.c
*************** double
*** 4738,4743 ****
--- 4738,4744 ----
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
  	double		stadistinct;
+ 	double		stanullfrac = 0.0;
  	double		ntuples;
  
  	*isdefault = false;
*************** get_variable_numdistinct(VariableStatDat
*** 4745,4751 ****
  	/*
  	 * Determine the stadistinct value to use.  There are cases where we can
  	 * get an estimate even without a pg_statistic entry, or can get a better
! 	 * value than is in pg_statistic.
  	 */
  	if (HeapTupleIsValid(vardata->statsTuple))
  	{
--- 4746,4753 ----
  	/*
  	 * Determine the stadistinct value to use.  There are cases where we can
  	 * get an estimate even without a pg_statistic entry, or can get a better
! 	 * value than is in pg_statistic.  Grab stanullfrac too if we can find it
! 	 * (otherwise, assume no nulls, for lack of any better idea).
  	 */
  	if (HeapTupleIsValid(vardata->statsTuple))
  	{
*************** get_variable_numdistinct(VariableStatDat
*** 4754,4759 ****
--- 4756,4762 ----
  
  		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
  		stadistinct = stats->stadistinct;
+ 		stanullfrac = stats->stanullfrac;
  	}
  	else if (vardata->vartype == BOOLOID)
  	{
*************** get_variable_numdistinct(VariableStatDat
*** 4777,4783 ****
  			{
  				case ObjectIdAttributeNumber:
  				case SelfItemPointerAttributeNumber:
! 					stadistinct = -1.0; /* unique */
  					break;
  				case TableOidAttributeNumber:
  					stadistinct = 1.0;	/* only 1 value */
--- 4780,4786 ----
  			{
  				case ObjectIdAttributeNumber:
  				case SelfItemPointerAttributeNumber:
! 					stadistinct = -1.0; /* unique (and all non null) */
  					break;
  				case TableOidAttributeNumber:
  					stadistinct = 1.0;	/* only 1 value */
*************** get_variable_numdistinct(VariableStatDat
*** 4799,4808 ****
  	 * If there is a unique index or DISTINCT clause for the variable, assume
  	 * it is unique no matter what pg_statistic says; the statistics could be
  	 * out of date, or we might have found a partial unique index that proves
! 	 * the var is unique for this query.
  	 */
  	if (vardata->isunique)
! 		stadistinct = -1.0;
  
  	/*
  	 * If we had an absolute estimate, use that.
--- 4802,4812 ----
  	 * If there is a unique index or DISTINCT clause for the variable, assume
  	 * it is unique no matter what pg_statistic says; the statistics could be
  	 * out of date, or we might have found a partial unique index that proves
! 	 * the var is unique for this query.  However, we'd better still believe
! 	 * the null-fraction statistic.
  	 */
  	if (vardata->isunique)
! 		stadistinct = -1.0 * (1.0 - stanullfrac);
  
  	/*
  	 * If we had an absolute estimate, use that.

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Bogus ANALYZE results for an otherwise-unique column with many nulls

Reply via email to