On Mon, 12 Mar 2018 10:43:36 -0400 Tom Lane <t...@sss.pgh.pa.us> wrote:
> Re-reading that thread, it seems like we should have applied Jeff's
> initial trivial patch[1] (to not hold across
> table_recheck_autovac) rather than waiting around for a super duper
> improvement to get agreed on.  I'm a bit tempted to go do that;
> if nothing else, it seems simple enough to back-patch, unlike most
> of the rest of what was discussed.

This will help. In my testing it reduced the lock contention considerably.
I think a lot of users with lots of tables will benefit from it. However,
it does nothing about the bigger issue, which is that autovacuum
constantly rereads and rewrites the stats temp files.

I have attached the patch we are currently using. It applies to 9.6.8, and
I have versions of it for the 9.4, 9.5, and 9.6 branches. It fails to
apply to 10, and presumably to head, but I can update it if there is any
interest.

The patch has three main features:

- Impose an ordering on the autovacuum workers' worklist, to avoid the
  need to recheck statistics just to skip already-vacuumed tables.
- Reduce the frequency of statistics refreshes.
- Remove the AutovacuumScheduleLock.

The patch is aware of the shared relations fix. It is subject to the
problem Alvaro noted in the original discussion: if the last table to be
autovacuumed is large, new workers will exit instead of choosing an
earlier table. I'm not sure this is really a big problem in practice, but
I agree it is a corner case.

My patch does not do what I believe really needs doing: schedule
autovacuum more intelligently. Currently, autovacuum collects all the
tables in the physical order of pg_class and visits them one by one. With
many tables it can take too long to respond to urgent vacuum needs, e.g.
heavily updated tables or wraparound. There is a patch in the current
commit fest that allows prioritizing tables manually. I don't like that
idea much, but it does recognize that the current scheme is not adequate
for databases with large numbers of tables.

-dg

--
David Gould                                    da...@sonic.net
If simplicity worked, the world would be overrun with insects.
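For illustration, before the patch itself, here is a minimal standalone
sketch of the ordering idea. It is not part of the patch and its names and
oids are made up; in the patch the sorted list is the aw_worklist and the
highest claimed oid is found by scanning the shared worker array while
holding AutovacuumLock.

/*
 * Standalone sketch (illustrative only): all workers sort the same
 * candidate list by oid, and each claims the first oid greater than the
 * highest oid any worker has already claimed.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int Oid;
#define InvalidOid  ((Oid) 0)

static int
oid_cmp(const void *a, const void *b)
{
    Oid     v1 = *(const Oid *) a;
    Oid     v2 = *(const Oid *) b;

    if (v1 < v2)
        return -1;
    if (v1 > v2)
        return 1;
    return 0;
}

/* Return the next oid to claim, or InvalidOid when the list is exhausted. */
static Oid
claim_next(const Oid *worklist, int nitems, Oid highest_claimed)
{
    int     i;

    for (i = 0; i < nitems; i++)
        if (worklist[i] > highest_claimed)
            return worklist[i];
    return InvalidOid;
}

int
main(void)
{
    Oid     worklist[] = {16391, 16384, 16402, 16388};  /* made-up oids */
    int     nitems = sizeof(worklist) / sizeof(worklist[0]);
    Oid     highest_claimed = InvalidOid;
    Oid     next;

    /* every worker sorts the list the same way, so they agree on the order */
    qsort(worklist, nitems, sizeof(Oid), oid_cmp);

    /*
     * Simulate successive claims; in the patch, highest_claimed comes from
     * the shared worker array.
     */
    while ((next = claim_next(worklist, nitems, highest_claimed)) != InvalidOid)
    {
        printf("claim table oid %u\n", next);
        highest_claimed = next;
    }
    return 0;
}

Because every worker derives the same order and only claims past the
highest existing claim, no worker has to recheck whether another worker
already took a given table.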
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 6720248675..3bb06a2337 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -196,6 +196,26 @@ typedef struct autovac_table
 	char	   *at_datname;
 } autovac_table;
 
+/*
+ * This is used to collect the oids of tables that may need vacuuming.
+ * Once they are all collected it is sorted by oid and then stepped through.
+ * By establishing an order we reduce contention between workers for tables.
+ */
+typedef struct aw_workitem
+{
+	Oid			relid;
+	bool		sharedrel;
+} aw_workitem;
+
+/* Extendable array of items. */
+typedef struct aw_worklist
+{
+	aw_workitem *items;
+	int			maxitems,
+				numitems;
+} aw_worklist;
+
+
 /*-------------
  * This struct holds information about a single worker's whereabouts. We keep
  * an array of these in shared memory, sized according to
@@ -1069,6 +1089,76 @@ db_comparator(const void *a, const void *b)
 		return (((const avl_dbase *) a)->adl_score <
 				((const avl_dbase *) b)->adl_score) ? 1 : -1;
 }
 
+
+/*
+ * Utilities to manage worklist of tables to vacuum.
+ */
+
+static void aw_worklist_init(aw_worklist *w)
+{
+	w->maxitems = 32;	/* not really a magic number, we have to start somewhere */
+	w->numitems = 0;
+	w->items = palloc(w->maxitems * sizeof(aw_workitem));
+}
+
+static void aw_worklist_clear(aw_worklist *w)
+{
+	w->maxitems = 0;
+	w->numitems = 0;
+	pfree(w->items);
+	w->items = NULL;
+}
+
+/* Append an item to the worklist */
+static aw_workitem *aw_worklist_add(aw_worklist *w, Oid oid, bool isshared)
+{
+	aw_workitem *newitem;
+
+	if (w->numitems >= w->maxitems)
+	{
+		/* grow the array */
+		w->maxitems = w->maxitems * 1.5;
+		w->items = (aw_workitem *) repalloc(w->items,
+											w->maxitems * sizeof(aw_workitem));
+	}
+	newitem = w->items + w->numitems++;
+	newitem->relid = oid;
+	newitem->sharedrel = isshared;
+	return newitem;
+}
+
+/* Release extra memory that might have been allocated during growth */
+static void aw_worklist_trim(aw_worklist *w)
+{
+	if (w->maxitems > w->numitems)
+	{
+		w->maxitems = w->numitems;
+		w->items = (aw_workitem *) repalloc(w->items,
+											w->numitems * sizeof(aw_workitem));
+	}
+}
+
+/* qsort comparator for aw_workitem items */
+static int
+aw_workitem_cmp(const void *p1, const void *p2)
+{
+	const aw_workitem *v1 = ((const aw_workitem *) p1);
+	const aw_workitem *v2 = ((const aw_workitem *) p2);
+
+	if (v1->relid < v2->relid)
+		return -1;
+	if (v1->relid > v2->relid)
+		return 1;
+	return 0;
+}
+
+static void aw_worklist_sort(aw_worklist *w)
+{
+	if (w->numitems > 1)
+		qsort(w->items, w->numitems, sizeof(aw_workitem), aw_workitem_cmp);
+}
+
+
 /*
  * do_start_worker
  *
@@ -1899,10 +1989,10 @@ do_autovacuum(void)
 	HeapTuple	tuple;
 	HeapScanDesc relScan;
 	Form_pg_database dbForm;
-	List	   *table_oids = NIL;
 	HASHCTL		ctl;
 	HTAB	   *table_toast_map;
-	ListCell   *volatile cell;
+	aw_worklist table_oids;
+	int			oid_idx;
 	PgStat_StatDBEntry *shared;
 	PgStat_StatDBEntry *dbentry;
 	BufferAccessStrategy bstrategy;
@@ -1912,6 +2002,7 @@ do_autovacuum(void)
 	bool		did_vacuum = false;
 	bool		found_concurrent_worker = false;
 
+
 	/*
 	 * StartTransactionCommand and CommitTransactionCommand will automatically
 	 * switch to other contexts.  We need this one to keep the list of
@@ -1993,6 +2084,9 @@ do_autovacuum(void)
 								  &ctl,
 								  HASH_ELEM | HASH_BLOBS);
 
+	/* create array/list to track oids of all the tables that might need action */
+	aw_worklist_init(&table_oids);
+
 	/*
 	 * Scan pg_class to determine which tables to vacuum.
 	 *
@@ -2084,9 +2178,9 @@ do_autovacuum(void)
 		}
 		else
 		{
-			/* relations that need work are added to table_oids */
+			/* relations that may need work are added to table_oids */
 			if (dovacuum || doanalyze)
-				table_oids = lappend_oid(table_oids, relid);
+				(void) aw_worklist_add(&table_oids, relid, classForm->relisshared);
 
 			/*
 			 * Remember the association for the second pass.  Note: we must do
@@ -2170,7 +2264,8 @@ do_autovacuum(void)
 
 		/* ignore analyze for toast tables */
 		if (dovacuum)
-			table_oids = lappend_oid(table_oids, relid);
+			aw_worklist_add(&table_oids, relid, classForm->relisshared);
+
 	}
 
 	heap_endscan(relScan);
@@ -2193,12 +2288,30 @@ do_autovacuum(void)
 
 	/*
 	 * Perform operations on collected tables.
+	 *
+	 * 1. Sort the collected oids so that workers process oids in
+	 *    the same order.
+	 * 2. While there are items not yet visited do:
+	 * 3.   Find the last (highest) item worked on by any worker.
+	 * 4.   Claim the next item after the last one worked on.
+	 *
+	 * The ordering prevents workers from racing or competing with other
+	 * workers for an item to work on. Without it there is no way to
+	 * prevent them from trying to vacuum the same table repeatedly.
 	 */
-	foreach(cell, table_oids)
+
+	/* 1. Sort the oids to establish a consistent order for work items. */
+	aw_worklist_trim(&table_oids);	/* release any extra allocated memory */
+	aw_worklist_sort(&table_oids);
+
+	/*
+	 * 2. While there are items that have not been visited...
+	 */
+	for (oid_idx = 0; oid_idx < table_oids.numitems; )
 	{
-		Oid			relid = lfirst_oid(cell);
 		autovac_table *tab;
-		bool		skipit;
+		Oid			relid;
+		Oid			highest_oid_claimed;
 		int			stdVacuumCostDelay;
 		int			stdVacuumCostLimit;
 		dlist_iter	iter;
@@ -2222,71 +2335,61 @@ do_autovacuum(void)
 		}
 
 		/*
-		 * hold schedule lock from here until we're sure that this table still
-		 * needs vacuuming.  We also need the AutovacuumLock to walk the
-		 * worker array, but we'll let go of that one quickly.
+		 * We hold the Autovacuum lock to walk the worker array and to
+		 * update our WorkerInfo with the chosen candidate table.
 		 */
-		LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
-		LWLockAcquire(AutovacuumLock, LW_SHARED);
+		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
 
 		/*
-		 * Check whether the table is being vacuumed concurrently by another
-		 * worker.
+		 * 3. Find the last (highest) oid claimed by any worker.
 		 */
-		skipit = false;
+		highest_oid_claimed = InvalidOid;
 		dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
 		{
 			WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
 
-			/* ignore myself */
-			if (worker == MyWorkerInfo)
-				continue;
+			if ((worker->wi_sharedrel || worker->wi_dboid == MyDatabaseId)
+				&& worker->wi_tableoid >= highest_oid_claimed)
+			{
+				highest_oid_claimed = worker->wi_tableoid;
+				found_concurrent_worker = true;
+			}
+		}
 
-			/* ignore workers in other databases (unless table is shared) */
-			if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
-				continue;
+		/*
+		 * 4a. Skip past the highest_oid_claimed to find the next oid.
+		 */
+		for (relid = InvalidOid; oid_idx < table_oids.numitems; oid_idx++)
+		{
+			aw_workitem *item = &table_oids.items[oid_idx];
 
-			if (worker->wi_tableoid == relid)
+			/* 4b. Claim the table by storing its oid in shared memory. */
+			if (item->relid > highest_oid_claimed)
 			{
-				skipit = true;
-				found_concurrent_worker = true;
+				relid = item->relid;
+				MyWorkerInfo->wi_tableoid = relid;
+				MyWorkerInfo->wi_sharedrel = item->sharedrel;
 				break;
 			}
 		}
+		LWLockRelease(AutovacuumLock);
 
-		if (skipit)
-		{
-			LWLockRelease(AutovacuumScheduleLock);
+
+		/* If we reached the end of the work list there is nothing to do. */
+		if (relid == InvalidOid)
 			continue;
-		}
 
 		/*
-		 * Check whether pgstat data still says we need to vacuum this table.
-		 * It could have changed if something else processed the table while
-		 * we weren't looking.
-		 *
-		 * Note: we have a special case in pgstat code to ensure that the
-		 * stats we read are as up-to-date as possible, to avoid the problem
-		 * that somebody just finished vacuuming this table.  The window to
-		 * the race condition is not closed but it is very small.
+		 * Re-check whether we still need to vacuum this table. We are working
+		 * from a list of tables that could be quite stale by now. Possibly
+		 * this table was dropped or manually vacuumed while we weren't looking.
 		 */
 		MemoryContextSwitchTo(AutovacMemCxt);
 		tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
 									effective_multixact_freeze_max_age);
 		if (tab == NULL)
-		{
 			/* someone else vacuumed the table, or it went away */
-			LWLockRelease(AutovacuumScheduleLock);
 			continue;
-		}
-
-		/*
-		 * Ok, good to go.  Store the table in shared memory before releasing
-		 * the lock so that other workers don't vacuum it concurrently.
-		 */
-		MyWorkerInfo->wi_tableoid = relid;
-		MyWorkerInfo->wi_sharedrel = tab->at_sharedrel;
-		LWLockRelease(AutovacuumScheduleLock);
 
 		/*
 		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
@@ -2407,6 +2510,8 @@ deleted:
 			VacuumCostLimit = stdVacuumCostLimit;
 		}
 
+	aw_worklist_clear(&table_oids);		/* free work list memory */
+
 	/*
 	 * We leak table_toast_map here (among other things), but since we're
 	 * going away soon, it's not a problem.
@@ -2514,9 +2619,7 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map,
 	bool		wraparound;
 	AutoVacOpts *avopts;
 
-	/* use fresh stats */
-	autovac_refresh_stats();
-
+	/* Fetch pgstats for re-checking. */
 	shared = pgstat_fetch_stat_dbentry(InvalidOid);
 	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
 
@@ -2995,28 +3098,19 @@ AutoVacuumShmemInit(void)
  *
  * Cause the next pgstats read operation to obtain fresh data, but throttle
  * such refreshing in the autovacuum launcher.  This is mostly to avoid
- * rereading the pgstats files too many times in quick succession when there
- * are many databases.
- *
- * Note: we avoid throttling in the autovac worker, as it would be
- * counterproductive in the recheck logic.
+ * rereading the pgstats files too many times in quick succession.
  */
 static void
 autovac_refresh_stats(void)
 {
-	if (IsAutoVacuumLauncherProcess())
-	{
-		static TimestampTz last_read = 0;
-		TimestampTz current_time;
-
-		current_time = GetCurrentTimestamp();
-
-		if (!TimestampDifferenceExceeds(last_read, current_time,
-										STATS_READ_DELAY))
-			return;
+	static TimestampTz last_read = 0;
+	TimestampTz current_time;
 
-		last_read = current_time;
-	}
+	current_time = GetCurrentTimestamp();
+	if (!TimestampDifferenceExceeds(last_read, current_time,
+									STATS_READ_DELAY))
+		return;
+	last_read = current_time;
 
 	pgstat_clear_snapshot();
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 480d3b1876..54c4ca97a3 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -76,6 +76,9 @@
 #define PGSTAT_STAT_INTERVAL	500 /* Minimum time between stats file
 									 * updates; in milliseconds. */
 
+#define PGSTAT_AVAC_INTERVAL	5000	/* Minimum time between stats file
+										 * updates for autovacuum workers */
+
 #define PGSTAT_RETRY_DELAY		10	/* How long to wait between checks for
 									 * a new file; in milliseconds. */
 
@@ -4759,10 +4762,9 @@ backend_read_statsfile(void)
 		/*
 		 * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
 		 * msec before now.  This indirectly ensures that the collector
-		 * needn't write the file more often than PGSTAT_STAT_INTERVAL.  In
-		 * an autovacuum worker, however, we want a lower delay to avoid
-		 * using stale data, so we use PGSTAT_RETRY_DELAY (since the
-		 * number of workers is low, this shouldn't be a problem).
+		 * needn't write the file more often than PGSTAT_STAT_INTERVAL.
+		 * Autovacuum workers use a longer interval PGSTAT_AVAC_INTERVAL to
+		 * avoid requesting excessively frequent rewrites of the stats.
 		 *
 		 * We don't recompute min_ts after sleeping, except in the
 		 * unlikely case that cur_ts went backwards.  So we might end up
@@ -4773,12 +4775,10 @@ backend_read_statsfile(void)
 		 * actually accept.
 		 */
 		ref_ts = cur_ts;
-		if (IsAutoVacuumWorkerProcess())
-			min_ts = TimestampTzPlusMilliseconds(ref_ts,
-												 -PGSTAT_RETRY_DELAY);
-		else
-			min_ts = TimestampTzPlusMilliseconds(ref_ts,
-												 -PGSTAT_STAT_INTERVAL);
+		min_ts = TimestampTzPlusMilliseconds(ref_ts,
+											 IsAutoVacuumWorkerProcess()
+											 ? -PGSTAT_AVAC_INTERVAL
+											 : -PGSTAT_STAT_INTERVAL);
 	}
 
 	/*
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index f8996cd21a..63fb170f19 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -27,7 +27,7 @@ TablespaceCreateLock			19
 BtreeVacuumLock					20
 AddinShmemInitLock				21
 AutovacuumLock					22
-AutovacuumScheduleLock			23
+# 23 is available; was formerly AutovacuumScheduleLock
 SyncScanLock					24
 RelationMappingLock				25
 AsyncCtlLock					26