Here's another cut at this patch.  This is mainly about the
infrastructure to pass the data around in autovacuum; the proposed
formulas probably need lot of work.

We still have two terms in autovacuum priority, the first one considers
dead tuples and the second one considers wraparound limit.  I have kept
Chris' proposal for the second term, but refined the first one a bit per
Jim Nasby's suggestion of discounting dead space.  So we now have

        return
                (d->p_deadtuples * (d->p_livetuples + d->p_deadtuples) * 
d->p_relpages) /
                ((d->p_livetuples + 1) * d->p_reltuples * nblocks) +
                exp(d->p_xidage * logf(nblocks) / UINT_MAX);

Here, deadtuples and livetuples come from pgstat data, while relpages
and reltuples come from pg_class.  nblocks, on the other hand, comes
from the actual number of blocks in the table.

I haven't considered the case where pg_class.reltuples = 0 (which
results in division-by-zero), but I think to be really robust here we'd
want to have some code copied from estimate_rel_size; or maybe simply
use some hardcoded magic value.  I lean towards the latter, because I'm
not sure we want to expend a relation open at this point (incurring an
attempt to lock the table, which could be problematic); hence the new
RelFileNodeGetNumberOfBlocks() thingy, which is admittedly pretty ugly,
not to mention untested.

(I have considered livetuples=0 however, hence the +1 there).

I think we now need to have a more focused discussion on useful
formulas to use here.  One thing I noticed that fails in the above
formula is that as nblocks grows, ceteris paribus, the score falls; but
that's wrong, because if you have a table that turns out to have much
larger nblocks because it bloated and pgstat lost the message, we need
to look harder at it.  So somehow we need to consider the tuple density
as given by pg_class.reltuples/pg_class.relpages, and compare with the
one given by pgstat.(live+dead) / nblocks; and raise the score as the
ratio goes down (in normal conditions the ratio should be 1; a bloated
table that pgstat hasn't noticed will have a lower ratio).

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
*** a/src/backend/postmaster/autovacuum.c
--- b/src/backend/postmaster/autovacuum.c
***************
*** 76,81 ****
--- 76,82 ----
  #include "catalog/dependency.h"
  #include "catalog/namespace.h"
  #include "catalog/pg_database.h"
+ #include "catalog/pg_tablespace.h"
  #include "commands/dbcommands.h"
  #include "commands/vacuum.h"
  #include "lib/ilist.h"
***************
*** 98,103 ****
--- 99,105 ----
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
  #include "utils/rel.h"
+ #include "utils/relmapper.h"
  #include "utils/snapmgr.h"
  #include "utils/syscache.h"
  #include "utils/timeout.h"
***************
*** 167,173 **** typedef struct avw_dbase
  	PgStat_StatDBEntry *adw_entry;
  } avw_dbase;
  
! /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
  typedef struct av_relation
  {
  	Oid			ar_toastrelid;	/* hash key - must be first */
--- 169,175 ----
  	PgStat_StatDBEntry *adw_entry;
  } avw_dbase;
  
! /* struct to keep track of TOAST<->main relation mappings */
  typedef struct av_relation
  {
  	Oid			ar_toastrelid;	/* hash key - must be first */
***************
*** 177,182 **** typedef struct av_relation
--- 179,217 ----
  								 * reloptions, or NULL if none */
  } av_relation;
  
+ /*
+  * A tasklist is a set of tables to process, collected during a worker's first
+  * phase.  For each table we keep track of its vacuum priority, so that we can
+  * process them in priority order.
+  */
+ typedef struct avw_tltable
+ {
+ 	Oid			tt_reloid;
+ 	float4		tt_vacuum_prio;
+ } avw_tltable;
+ 
+ /* a list of the above elements */
+ typedef struct avw_tasklist
+ {
+ 	int			tl_maxelts;
+ 	int			tl_nelts;
+ 	avw_tltable	**tl_elts;
+ } avw_tasklist;
+ 
+ /* data to compute a table's priority value */
+ typedef struct avw_priodata
+ {
+ 	float4		p_deadtuples;
+ 	float4		p_livetuples;
+ 	float4		p_vacthresh;
+ 	float4		p_relpages;
+ 	float4		p_reltuples;
+ 	RelFileNode	p_filenode;
+ 	int			p_xidage;
+ 	int			p_frzmaxage;
+ } avw_priodata;
+ 
+ 
  /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
  typedef struct autovac_table
  {
***************
*** 296,305 **** static void FreeWorkerInfo(int code, Datum arg);
  
  static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  					  TupleDesc pg_class_desc);
! static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
  						  Form_pg_class classForm,
! 						  PgStat_StatTabEntry *tabentry,
! 						  bool *dovacuum, bool *doanalyze, bool *wraparound);
  
  static void autovacuum_do_vac_analyze(autovac_table *tab,
  						  BufferAccessStrategy bstrategy);
--- 331,341 ----
  
  static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  					  TupleDesc pg_class_desc);
! static avw_priodata *relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
  						  Form_pg_class classForm,
! 						  PgStat_StatTabEntry *tabentry, bool retstruct,
! 						  bool *dovacuum, bool *doanalyze,
! 						  bool *wraparound);
  
  static void autovacuum_do_vac_analyze(autovac_table *tab,
  						  BufferAccessStrategy bstrategy);
***************
*** 1893,1898 **** get_database_list(void)
--- 1929,2021 ----
  }
  
  /*
+  * Initialize an empty task list for a worker
+  */
+ static avw_tasklist *
+ tasklist_initialize(void)
+ {
+ 	avw_tasklist   *tasklist;
+ 
+ 	tasklist = palloc(sizeof(avw_tasklist));
+ 	tasklist->tl_maxelts = 32;
+ 	tasklist->tl_nelts = 0;
+ 	tasklist->tl_elts = palloc(tasklist->tl_maxelts * sizeof(avw_tltable *));
+ 
+ 	return tasklist;
+ }
+ 
+ /*
+  * Compute vacuum priority.
+  *
+  * XXX explain rationale for the chosen formula
+  */
+ static float4
+ avw_compute_priority(avw_priodata *d)
+ {
+ 	BlockNumber		nblocks = InvalidBlockNumber;
+ 
+ 	if (d->p_filenode.relNode)
+ 		nblocks = RelFileNodeGetNumberOfBlocksInFork(d->p_filenode, MAIN_FORKNUM);
+ 
+ 	if (nblocks == InvalidBlockNumber)
+ 		nblocks = 100; /* XXX ?? */
+ 
+ 	return
+ 		(d->p_deadtuples * (d->p_livetuples + d->p_deadtuples) * d->p_relpages) /
+ 		((d->p_livetuples + 1) * d->p_reltuples * nblocks) +
+ 		exp(d->p_xidage * logf(nblocks) / UINT_MAX);
+ }
+ 
+ /*
+  * Add a table to the tasklisk.
+  */
+ static void
+ tasklist_add_table(avw_tasklist *tasklist, Oid relid, bool dovacuum,
+ 				   bool doanalyze, bool wraparound, avw_priodata *tabinfo)
+ {
+ 	avw_tltable *tab;
+ 
+ 	/* enlarge the array if necessary */
+ 	if (tasklist->tl_nelts >= tasklist->tl_maxelts)
+ 	{
+ 		tasklist->tl_maxelts *= 2;
+ 		tasklist->tl_elts = repalloc(tasklist->tl_elts, tasklist->tl_maxelts *
+ 									 sizeof(avw_tltable *));
+ 	}
+ 
+ 	tab = palloc0(sizeof(avw_tltable));
+ 
+ 	tab->tt_reloid = relid;
+ 	/* XXX tables not needing vacuum sort last -- is that OK? */
+ 	tab->tt_vacuum_prio = dovacuum ? avw_compute_priority(tabinfo) : 0;
+ 
+ 	tasklist->tl_elts[tasklist->tl_nelts++] = tab;
+ }
+ 
+ /*
+  * qsort comparator: sorts avw_tltable elements by value of vacuum priority,
+  * descending
+  */
+ static int
+ avw_tt_compar(const void *a, const void *b)
+ {
+ 	const avw_tltable *taba = *(avw_tltable *const *) a;
+ 	const avw_tltable *tabb = *(avw_tltable *const *) b;
+ 
+ 	return tabb->tt_vacuum_prio - taba->tt_vacuum_prio;
+ }
+ 
+ /*
+  * sort a tasklist using the vacuum priority
+  */
+ static void
+ tasklist_sort(avw_tasklist *tasklist)
+ {
+ 	qsort(tasklist->tl_elts, tasklist->tl_nelts,
+ 		  sizeof(avw_tltable *), avw_tt_compar);
+ }
+ 
+ /*
   * Process a database table-by-table
   *
   * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
***************
*** 1905,1919 **** do_autovacuum(void)
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
- 	List	   *table_oids = NIL;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
- 	ListCell   *volatile cell;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
--- 2028,2042 ----
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
+ 	avw_tasklist *tasklist;
+ 	int			i;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
***************
*** 1988,1993 **** do_autovacuum(void)
--- 2111,2119 ----
  								  &ctl,
  								  HASH_ELEM | HASH_FUNCTION);
  
+ 	/* initialize our tasklist */
+ 	tasklist = tasklist_initialize();
+ 
  	/*
  	 * Scan pg_class to determine which tables to vacuum.
  	 *
***************
*** 2017,2022 **** do_autovacuum(void)
--- 2143,2149 ----
  		bool		dovacuum;
  		bool		doanalyze;
  		bool		wraparound;
+ 		avw_priodata *tabinfo;
  
  		if (classForm->relkind != RELKIND_RELATION &&
  			classForm->relkind != RELKIND_MATVIEW)
***************
*** 2030,2037 **** do_autovacuum(void)
  											 shared, dbentry);
  
  		/* Check if it needs vacuum or analyze */
! 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
! 								  &dovacuum, &doanalyze, &wraparound);
  
  		/*
  		 * Check if it is a temp table (presumably, of some other backend's).
--- 2157,2166 ----
  											 shared, dbentry);
  
  		/* Check if it needs vacuum or analyze */
! 		tabinfo = relation_needs_vacanalyze(relid, relopts, classForm,
! 											tabentry, true,
! 											&dovacuum, &doanalyze,
! 											&wraparound);
  
  		/*
  		 * Check if it is a temp table (presumably, of some other backend's).
***************
*** 2078,2086 **** do_autovacuum(void)
  		}
  		else
  		{
! 			/* relations that need work are added to table_oids */
  			if (dovacuum || doanalyze)
! 				table_oids = lappend_oid(table_oids, relid);
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
--- 2207,2216 ----
  		}
  		else
  		{
! 			/* relations that need work are added to our tasklist */
  			if (dovacuum || doanalyze)
! 				tasklist_add_table(tasklist, relid, dovacuum, doanalyze,
! 								   wraparound, tabinfo);
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
***************
*** 2130,2135 **** do_autovacuum(void)
--- 2260,2266 ----
  		bool		dovacuum;
  		bool		doanalyze;
  		bool		wraparound;
+ 		avw_priodata *tabinfo;
  
  		/*
  		 * We cannot safely process other backends' temp tables, so skip 'em.
***************
*** 2158,2169 **** do_autovacuum(void)
  		tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
  											 shared, dbentry);
  
! 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
! 								  &dovacuum, &doanalyze, &wraparound);
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 			table_oids = lappend_oid(table_oids, relid);
  	}
  
  	heap_endscan(relScan);
--- 2289,2303 ----
  		tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
  											 shared, dbentry);
  
! 		tabinfo = relation_needs_vacanalyze(relid, relopts, classForm,
! 											tabentry, true,
! 											&dovacuum, &doanalyze,
! 											&wraparound);
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 			tasklist_add_table(tasklist, relid, dovacuum, doanalyze,
! 							   wraparound, tabinfo);
  	}
  
  	heap_endscan(relScan);
***************
*** 2186,2203 **** do_autovacuum(void)
  										  ALLOCSET_DEFAULT_MINSIZE,
  										  ALLOCSET_DEFAULT_MAXSIZE);
  
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	foreach(cell, table_oids)
  	{
- 		Oid			relid = lfirst_oid(cell);
  		autovac_table *tab;
  		bool		skipit;
  		int			stdVacuumCostDelay;
  		int			stdVacuumCostLimit;
  		dlist_iter	iter;
  
  		CHECK_FOR_INTERRUPTS();
  
  		/*
--- 2320,2342 ----
  										  ALLOCSET_DEFAULT_MINSIZE,
  										  ALLOCSET_DEFAULT_MAXSIZE);
  
+ 	/* sort our task list */
+ 	tasklist_sort(tasklist);
+ 
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	for (i = 0; i < tasklist->tl_nelts; i++)
  	{
  		autovac_table *tab;
+ 		Oid			relid;
  		bool		skipit;
  		int			stdVacuumCostDelay;
  		int			stdVacuumCostLimit;
  		dlist_iter	iter;
  
+ 		relid = tasklist->tl_elts[i]->tt_reloid;
+ 
  		CHECK_FOR_INTERRUPTS();
  
  		/*
***************
*** 2501,2507 **** table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  										 shared, dbentry);
  
  	relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
! 							  &dovacuum, &doanalyze, &wraparound);
  
  	/* ignore ANALYZE for toast tables */
  	if (classForm->relkind == RELKIND_TOASTVALUE)
--- 2640,2646 ----
  										 shared, dbentry);
  
  	relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
! 							  false, &dovacuum, &doanalyze, &wraparound);
  
  	/* ignore ANALYZE for toast tables */
  	if (classForm->relkind == RELKIND_TOASTVALUE)
***************
*** 2567,2575 **** table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  /*
   * relation_needs_vacanalyze
   *
!  * Check whether a relation needs to be vacuumed or analyzed; return each into
!  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
!  * being forced because of Xid wraparound.
   *
   * relopts is a pointer to the AutoVacOpts options (either for itself in the
   * case of a plain table, or for either itself or its parent table in the case
--- 2706,2718 ----
  /*
   * relation_needs_vacanalyze
   *
!  * Check whether a relation needs to be vacuumed or analyzed, and return each
!  * into "dovacuum" and "doanalyze", respectively.  If "retstruct" is true,
!  * and the table needs vacuum, an avw_priodata is allocated, filled with
!  * various data about the state of the table, and returned.
!  *
!  * Output parameters:
!  *	wraparound: whether the vacuum is being forced because of Xid wraparound
   *
   * relopts is a pointer to the AutoVacOpts options (either for itself in the
   * case of a plain table, or for either itself or its parent table in the case
***************
*** 2600,2610 **** table_recheck_autovac(Oid relid, HTAB *table_toast_map,
   * value < 0 is substituted with the value of
   * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
   */
! static void
  relation_needs_vacanalyze(Oid relid,
  						  AutoVacOpts *relopts,
  						  Form_pg_class classForm,
  						  PgStat_StatTabEntry *tabentry,
   /* output params below */
  						  bool *dovacuum,
  						  bool *doanalyze,
--- 2743,2754 ----
   * value < 0 is substituted with the value of
   * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
   */
! static avw_priodata *
  relation_needs_vacanalyze(Oid relid,
  						  AutoVacOpts *relopts,
  						  Form_pg_class classForm,
  						  PgStat_StatTabEntry *tabentry,
+ 						  bool retstruct,
   /* output params below */
  						  bool *dovacuum,
  						  bool *doanalyze,
***************
*** 2612,2617 **** relation_needs_vacanalyze(Oid relid,
--- 2756,2762 ----
  {
  	bool		force_vacuum;
  	bool		av_enabled;
+ 	avw_priodata *tabinfo = NULL;
  	float4		reltuples;		/* pg_class.reltuples */
  
  	/* constants from reloptions or GUC variables */
***************
*** 2687,2693 **** relation_needs_vacanalyze(Oid relid,
  	{
  		*doanalyze = false;
  		*dovacuum = false;
! 		return;
  	}
  
  	if (PointerIsValid(tabentry))
--- 2832,2838 ----
  	{
  		*doanalyze = false;
  		*dovacuum = false;
! 		return NULL;
  	}
  
  	if (PointerIsValid(tabentry))
***************
*** 2717,2731 **** relation_needs_vacanalyze(Oid relid,
  		/*
  		 * Skip a table not found in stat hash, unless we have to force vacuum
  		 * for anti-wrap purposes.	If it's not acted upon, there's no need to
! 		 * vacuum it.
  		 */
  		*dovacuum = force_vacuum;
  		*doanalyze = false;
  	}
  
  	/* ANALYZE refuses to work with pg_statistics */
  	if (relid == StatisticRelationId)
  		*doanalyze = false;
  }
  
  /*
--- 2862,2923 ----
  		/*
  		 * Skip a table not found in stat hash, unless we have to force vacuum
  		 * for anti-wrap purposes.	If it's not acted upon, there's no need to
! 		 * vacuum it.  It seems okay to say that there are no tuples to remove,
! 		 * either.
  		 */
  		*dovacuum = force_vacuum;
  		*doanalyze = false;
+ 		vactuples = 0;
+ 
+ 		vacthresh = (float4) -1;	/* silence compiler below */
+ 	}
+ 
+ 	if (*dovacuum)
+ 	{
+ 		if (retstruct)
+ 		{
+ 			tabinfo = palloc(sizeof(avw_priodata));
+ 
+ 			tabinfo->p_deadtuples = tabentry ? tabentry->n_dead_tuples : -1;
+ 			tabinfo->p_livetuples = tabentry ? tabentry->n_live_tuples : -1;
+ 			tabinfo->p_relpages = classForm->relpages;
+ 			tabinfo->p_reltuples = classForm->reltuples;
+ 
+ 			if (classForm->reltablespace)
+ 				tabinfo->p_filenode.spcNode = classForm->reltablespace;
+ 			else
+ 				tabinfo->p_filenode.spcNode = MyDatabaseTableSpace;
+ 			if (tabinfo->p_filenode.spcNode == GLOBALTABLESPACE_OID)
+ 				tabinfo->p_filenode.dbNode = InvalidOid;
+ 			else
+ 				tabinfo->p_filenode.dbNode = MyDatabaseId;
+ 			if (classForm->relfilenode)
+ 				tabinfo->p_filenode.relNode = classForm->relfilenode;
+ 			else
+ 			{
+ 				tabinfo->p_filenode.relNode =
+ 					RelationMapOidToFilenode(relid,
+ 											 classForm->relisshared);
+ 				if (!OidIsValid(tabinfo->p_filenode.relNode))
+ 				{
+ 					elog(WARNING, "could not find relation mapping for relation with OID %u",
+ 						 relid);
+ 					tabinfo->p_filenode.spcNode = InvalidOid;
+ 					tabinfo->p_filenode.dbNode = InvalidOid;
+ 				}
+ 			}
+ 
+ 			tabinfo->p_vacthresh = tabentry ? vacthresh : -1;
+ 			tabinfo->p_xidage = recentXid - classForm->relfrozenxid;
+ 			tabinfo->p_frzmaxage = freeze_max_age;
+ 		}
  	}
  
  	/* ANALYZE refuses to work with pg_statistics */
  	if (relid == StatisticRelationId)
  		*doanalyze = false;
+ 
+ 	return tabinfo;
  }
  
  /*
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 2013,2018 **** RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
--- 2013,2031 ----
  	return smgrnblocks(relation->rd_smgr, forkNum);
  }
  
+ BlockNumber
+ RelFileNodeGetNumberOfBlocksInFork(RelFileNode rnode, ForkNumber forkNum)
+ {
+ 	BlockNumber		retval;
+ 	SMgrRelation	srel;
+ 
+ 	srel = smgropen(rnode, InvalidBackendId);
+ 	retval = smgrnblocks(srel, forkNum);
+ 	smgrclose(srel);
+ 
+ 	return retval;
+ }
+ 
  /*
   * BufferIsPermanent
   *		Determines whether a buffer will potentially still be around after
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 184,189 **** extern void CheckPointBuffers(int flags);
--- 184,191 ----
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
  								ForkNumber forkNum);
+ extern BlockNumber RelFileNodeGetNumberOfBlocksInFork(RelFileNode rnode,
+ 								   ForkNumber forkNum);
  extern void FlushRelationBuffers(Relation rel);
  extern void FlushDatabaseBuffers(Oid dbid);
  extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to