Hi Hackers,

I've recently stumbled upon a problem with table bloat estimation in case
there are columns of type JSON.

The quick bloat estimation queries use sum over pg_statistic.stawidth of
table's columns, but in case of JSON the corresponding entry is never
created by the ANALYZE command due to equality comparison operator
missing.  I understand why there is no such operator defined for this
particular type, but shouldn't we still try to produce meaningful average
width estimation?

In my case the actual bloat is around 40% as verified with pgstattuple,
while the bloat reported by quick estimate can be between 75% and 95%(!) in
three instances of this problem.  We're talking about some hundreds of GB
of miscalculation.

Attached patch against master makes the std_typanalyze still try to compute
the minimal stats even if there is no "=" operator.  Makes sense?

I could also find this report in archives that talks about similar problem,
but due to all values being over the analyze threshold:

http://www.postgresql.org/message-id/flat/12480.1389370...@sss.pgh.pa.us#12480.1389370...@sss.pgh.pa.us

I think we could try harder, otherwise any estimate relying on average
width can be way off in such cases.

--
Alex
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
new file mode 100644
index 861048f..903681e
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** std_typanalyze(VacAttrStats *stats)
*** 1723,1732 ****
  							 &ltopr, &eqopr, NULL,
  							 NULL);
  
- 	/* If column has no "=" operator, we can't do much of anything */
- 	if (!OidIsValid(eqopr))
- 		return false;
- 
  	/* Save the operator info for compute_stats routines */
  	mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData));
  	mystats->eqopr = eqopr;
--- 1723,1728 ----
*************** std_typanalyze(VacAttrStats *stats)
*** 1737,1743 ****
  	/*
  	 * Determine which standard statistics algorithm to use
  	 */
! 	if (OidIsValid(ltopr))
  	{
  		/* Seems to be a scalar datatype */
  		stats->compute_stats = compute_scalar_stats;
--- 1733,1739 ----
  	/*
  	 * Determine which standard statistics algorithm to use
  	 */
! 	if (OidIsValid(eqopr) && OidIsValid(ltopr))
  	{
  		/* Seems to be a scalar datatype */
  		stats->compute_stats = compute_scalar_stats;
*************** std_typanalyze(VacAttrStats *stats)
*** 1776,1786 ****
  /*
   *	compute_minimal_stats() -- compute minimal column statistics
   *
!  *	We use this when we can find only an "=" operator for the datatype.
   *
   *	We determine the fraction of non-null rows, the average width, the
   *	most common values, and the (estimated) number of distinct values.
   *
   *	The most common values are determined by brute force: we keep a list
   *	of previously seen values, ordered by number of times seen, as we scan
   *	the samples.  A newly seen value is inserted just after the last
--- 1772,1784 ----
  /*
   *	compute_minimal_stats() -- compute minimal column statistics
   *
!  *	We use this when we cannot find "=" and "<" operators for the datatype.
   *
   *	We determine the fraction of non-null rows, the average width, the
   *	most common values, and the (estimated) number of distinct values.
   *
+  *	Only if there is a "=" operator, we try to find the most common values.
+  *
   *	The most common values are determined by brute force: we keep a list
   *	of previously seen values, ordered by number of times seen, as we scan
   *	the samples.  A newly seen value is inserted just after the last
*************** compute_minimal_stats(VacAttrStatsP stat
*** 1809,1830 ****
  		Datum		value;
  		int			count;
  	} TrackItem;
! 	TrackItem  *track;
! 	int			track_cnt,
! 				track_max;
  	int			num_mcv = stats->attr->attstattarget;
  	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
  
! 	/*
! 	 * We track up to 2*n values for an n-element MCV list; but at least 10
! 	 */
! 	track_max = 2 * num_mcv;
! 	if (track_max < 10)
! 		track_max = 10;
! 	track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
! 	track_cnt = 0;
  
! 	fmgr_info(mystats->eqfunc, &f_cmpeq);
  
  	for (i = 0; i < samplerows; i++)
  	{
--- 1807,1830 ----
  		Datum		value;
  		int			count;
  	} TrackItem;
! 	TrackItem  *track = NULL;
! 	int			track_cnt = 0,
! 				track_max = 0;
  	int			num_mcv = stats->attr->attstattarget;
  	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
  
! 	if (OidIsValid(mystats->eqfunc))
! 	{
! 		/*
! 		 * We track up to 2*n values for an n-element MCV list; but at least 10
! 		 */
! 		track_max = 2 * num_mcv;
! 		if (track_max < 10)
! 			track_max = 10;
! 		track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
  
! 		fmgr_info(mystats->eqfunc, &f_cmpeq);
! 	}
  
  	for (i = 0; i < samplerows; i++)
  	{
*************** compute_minimal_stats(VacAttrStatsP stat
*** 1876,1926 ****
  			total_width += strlen(DatumGetCString(value)) + 1;
  		}
  
! 		/*
! 		 * See if the value matches anything we're already tracking.
! 		 */
! 		match = false;
! 		firstcount1 = track_cnt;
! 		for (j = 0; j < track_cnt; j++)
  		{
! 			/* We always use the default collation for statistics */
! 			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
! 											   DEFAULT_COLLATION_OID,
! 											   value, track[j].value)))
  			{
! 				match = true;
! 				break;
  			}
- 			if (j < firstcount1 && track[j].count == 1)
- 				firstcount1 = j;
- 		}
  
! 		if (match)
! 		{
! 			/* Found a match */
! 			track[j].count++;
! 			/* This value may now need to "bubble up" in the track list */
! 			while (j > 0 && track[j].count > track[j - 1].count)
! 			{
! 				swapDatum(track[j].value, track[j - 1].value);
! 				swapInt(track[j].count, track[j - 1].count);
! 				j--;
! 			}
! 		}
! 		else
! 		{
! 			/* No match.  Insert at head of count-1 list */
! 			if (track_cnt < track_max)
! 				track_cnt++;
! 			for (j = track_cnt - 1; j > firstcount1; j--)
  			{
! 				track[j].value = track[j - 1].value;
! 				track[j].count = track[j - 1].count;
  			}
! 			if (firstcount1 < track_cnt)
  			{
! 				track[firstcount1].value = value;
! 				track[firstcount1].count = 1;
  			}
  		}
  	}
--- 1876,1929 ----
  			total_width += strlen(DatumGetCString(value)) + 1;
  		}
  
! 		if (track)
  		{
! 			/*
! 			 * See if the value matches anything we're already tracking.
! 			 */
! 			match = false;
! 			firstcount1 = track_cnt;
! 			for (j = 0; j < track_cnt; j++)
  			{
! 				/* We always use the default collation for statistics */
! 				if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
! 												   DEFAULT_COLLATION_OID,
! 												   value, track[j].value)))
! 				{
! 					match = true;
! 					break;
! 				}
! 				if (j < firstcount1 && track[j].count == 1)
! 					firstcount1 = j;
  			}
  
! 			if (match)
  			{
! 				/* Found a match */
! 				track[j].count++;
! 				/* This value may now need to "bubble up" in the track list */
! 				while (j > 0 && track[j].count > track[j - 1].count)
! 				{
! 					swapDatum(track[j].value, track[j - 1].value);
! 					swapInt(track[j].count, track[j - 1].count);
! 					j--;
! 				}
  			}
! 			else
  			{
! 				/* No match.  Insert at head of count-1 list */
! 				if (track_cnt < track_max)
! 					track_cnt++;
! 				for (j = track_cnt - 1; j > firstcount1; j--)
! 				{
! 					track[j].value = track[j - 1].value;
! 					track[j].count = track[j - 1].count;
! 				}
! 				if (firstcount1 < track_cnt)
! 				{
! 					track[firstcount1].value = value;
! 					track[firstcount1].count = 1;
! 				}
  			}
  		}
  	}
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to