Here's an updated version of this patch that takes care of the issues I reported previously: no more repalloc() of the requests array; it's now an slist, which makes the code much more natural IMV. And no more messing around with doing sprintf to create a separate sprintf pattern for the per-db stats file; instead have a function to return the name that uses just the pgstat dir as stored by GUC. I think this can be further simplified still.
I haven't reviewed the rest yet; please do give this a try to confirm that the speedups previously reported are still there (i.e. I didn't completely blow it). Thanks -- Álvaro Herrera http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
*** a/src/backend/postmaster/pgstat.c --- b/src/backend/postmaster/pgstat.c *************** *** 38,43 **** --- 38,44 ---- #include "access/xact.h" #include "catalog/pg_database.h" #include "catalog/pg_proc.h" + #include "lib/ilist.h" #include "libpq/ip.h" #include "libpq/libpq.h" #include "libpq/pqsignal.h" *************** *** 66,73 **** * Paths for the statistics files (relative to installation's $PGDATA). * ---------- */ ! #define PGSTAT_STAT_PERMANENT_FILENAME "global/pgstat.stat" ! #define PGSTAT_STAT_PERMANENT_TMPFILE "global/pgstat.tmp" /* ---------- * Timer definitions. --- 67,75 ---- * Paths for the statistics files (relative to installation's $PGDATA). * ---------- */ ! #define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" ! #define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" ! #define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" /* ---------- * Timer definitions. *************** *** 115,120 **** int pgstat_track_activity_query_size = 1024; --- 117,123 ---- * Built from GUC parameter * ---------- */ + char *pgstat_stat_directory = NULL; char *pgstat_stat_filename = NULL; char *pgstat_stat_tmpname = NULL; *************** *** 219,229 **** static int localNumBackends = 0; */ static PgStat_GlobalStats globalStats; ! /* Last time the collector successfully wrote the stats file */ ! static TimestampTz last_statwrite; ! /* Latest statistics request time from backends */ ! static TimestampTz last_statrequest; static volatile bool need_exit = false; static volatile bool got_SIGHUP = false; --- 222,237 ---- */ static PgStat_GlobalStats globalStats; ! /* Write request info for each database */ ! typedef struct DBWriteRequest ! { ! Oid databaseid; /* OID of the database to write */ ! TimestampTz request_time; /* timestamp of the last write request */ ! slist_node next; ! } DBWriteRequest; ! /* Latest statistics request time from backends for each DB */ ! 
static slist_head last_statrequests = SLIST_STATIC_INIT(last_statrequests); static volatile bool need_exit = false; static volatile bool got_SIGHUP = false; *************** *** 252,262 **** static void pgstat_sighup_handler(SIGNAL_ARGS); static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create); ! static void pgstat_write_statsfile(bool permanent); ! static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent); static void backend_read_statsfile(void); static void pgstat_read_current_status(void); static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg); static void pgstat_send_funcstats(void); static HTAB *pgstat_collect_oids(Oid catalogid); --- 260,276 ---- static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create); ! static void pgstat_write_statsfile(bool permanent, bool force); ! static void pgstat_write_db_statsfile(PgStat_StatDBEntry * dbentry, bool permanent); ! static void pgstat_write_db_dummyfile(Oid databaseid); ! static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent, bool onlydbs); ! 
static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent); static void backend_read_statsfile(void); static void pgstat_read_current_status(void); + static bool pgstat_write_statsfile_needed(void); + static bool pgstat_db_requested(Oid databaseid); + static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg); static void pgstat_send_funcstats(void); static HTAB *pgstat_collect_oids(Oid catalogid); *************** *** 285,291 **** static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int le static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); - /* ------------------------------------------------------------ * Public functions called from postmaster follow * ------------------------------------------------------------ --- 299,304 ---- *************** *** 549,556 **** startup_failed: void pgstat_reset_all(void) { ! unlink(pgstat_stat_filename); ! unlink(PGSTAT_STAT_PERMANENT_FILENAME); } #ifdef EXEC_BACKEND --- 562,605 ---- void pgstat_reset_all(void) { ! DIR * dir; ! struct dirent * entry; ! ! dir = AllocateDir(pgstat_stat_directory); ! while ((entry = ReadDir(dir, pgstat_stat_directory)) != NULL) ! { ! char *fname; ! int totlen; ! ! if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) ! continue; ! ! totlen = strlen(pgstat_stat_directory) + strlen(entry->d_name) + 2; ! fname = palloc(totlen); ! ! snprintf(fname, totlen, "%s/%s", pgstat_stat_directory, entry->d_name); ! unlink(fname); ! pfree(fname); ! } ! FreeDir(dir); ! ! dir = AllocateDir(PGSTAT_STAT_PERMANENT_DIRECTORY); ! while ((entry = ReadDir(dir, PGSTAT_STAT_PERMANENT_DIRECTORY)) != NULL) ! { ! char *fname; ! int totlen; ! ! if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) ! continue; ! ! totlen = strlen(pgstat_stat_directory) + strlen(entry->d_name) + 2; ! fname = palloc(totlen); ! ! 
snprintf(fname, totlen, "%s/%s", PGSTAT_STAT_PERMANENT_FILENAME, entry->d_name); ! unlink(fname); ! pfree(fname); ! } ! FreeDir(dir); } #ifdef EXEC_BACKEND *************** *** 1408,1420 **** pgstat_ping(void) * ---------- */ static void ! pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time) { PgStat_MsgInquiry msg; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); msg.clock_time = clock_time; msg.cutoff_time = cutoff_time; pgstat_send(&msg, sizeof(msg)); } --- 1457,1470 ---- * ---------- */ static void ! pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid) { PgStat_MsgInquiry msg; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); msg.clock_time = clock_time; msg.cutoff_time = cutoff_time; + msg.databaseid = databaseid; pgstat_send(&msg, sizeof(msg)); } *************** *** 3004,3009 **** PgstatCollectorMain(int argc, char *argv[]) --- 3054,3060 ---- int len; PgStat_Msg msg; int wr; + bool first_write = true; IsUnderPostmaster = true; /* we are a postmaster subprocess now */ *************** *** 3053,3069 **** PgstatCollectorMain(int argc, char *argv[]) init_ps_display("stats collector process", "", "", ""); /* - * Arrange to write the initial status file right away - */ - last_statrequest = GetCurrentTimestamp(); - last_statwrite = last_statrequest - 1; - - /* * Read in an existing statistics stats file or initialize the stats to ! * zero. */ pgStatRunningInCollector = true; ! pgStatDBHash = pgstat_read_statsfile(InvalidOid, true); /* * Loop to process messages until we get SIGQUIT or detect ungraceful --- 3104,3114 ---- init_ps_display("stats collector process", "", "", ""); /* * Read in an existing statistics stats file or initialize the stats to ! * zero (read data for all databases, including table/func stats). */ pgStatRunningInCollector = true; ! 
pgStatDBHash = pgstat_read_statsfile(InvalidOid, true, false); /* * Loop to process messages until we get SIGQUIT or detect ungraceful *************** *** 3107,3116 **** PgstatCollectorMain(int argc, char *argv[]) /* * Write the stats file if a new request has arrived that is not ! * satisfied by existing file. */ ! if (last_statwrite < last_statrequest) ! pgstat_write_statsfile(false); /* * Try to receive and process a message. This will not block, --- 3152,3165 ---- /* * Write the stats file if a new request has arrived that is not ! * satisfied by existing file (force writing all files if it's ! * the first write after startup). */ ! if (first_write || pgstat_write_statsfile_needed()) ! { ! pgstat_write_statsfile(false, first_write); ! first_write = false; ! } /* * Try to receive and process a message. This will not block, *************** *** 3269,3275 **** PgstatCollectorMain(int argc, char *argv[]) /* * Save the final stats to reuse at next startup. */ ! pgstat_write_statsfile(true); exit(0); } --- 3318,3324 ---- /* * Save the final stats to reuse at next startup. */ ! pgstat_write_statsfile(true, true); exit(0); } *************** *** 3349,3354 **** pgstat_get_db_entry(Oid databaseid, bool create) --- 3398,3404 ---- result->n_block_write_time = 0; result->stat_reset_timestamp = GetCurrentTimestamp(); + result->stats_timestamp = 0; memset(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(Oid); *************** *** 3429,3451 **** pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) * shutting down only), remove the temporary file so that backends * starting up under a new postmaster can't read the old data before * the new collector is ready. * ---------- */ static void ! 
pgstat_write_statsfile(bool permanent) { HASH_SEQ_STATUS hstat; - HASH_SEQ_STATUS tstat; - HASH_SEQ_STATUS fstat; PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; - PgStat_StatFuncEntry *funcentry; FILE *fpout; int32 format_id; const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; int rc; /* * Open the statistics temp file to write out the current values. */ --- 3479,3503 ---- * shutting down only), remove the temporary file so that backends * starting up under a new postmaster can't read the old data before * the new collector is ready. + * + * When 'allDbs' is false, only the requested databases (listed in + * last_statrequests) will be written. If 'allDbs' is true, all databases + * will be written. * ---------- */ static void ! pgstat_write_statsfile(bool permanent, bool allDbs) { HASH_SEQ_STATUS hstat; PgStat_StatDBEntry *dbentry; FILE *fpout; int32 format_id; const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; int rc; + elog(DEBUG1, "writing statsfile '%s'", statfile); + /* * Open the statistics temp file to write out the current values. */ *************** *** 3484,3489 **** pgstat_write_statsfile(bool permanent) --- 3536,3555 ---- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) { /* + * Write our the tables and functions into a separate file, but only + * if the database is in the requests or if all DBs are to be written. + * + * We need to do this before the dbentry write to write the proper + * timestamp to the global file. 
+ */ + if (allDbs || pgstat_db_requested(dbentry->databaseid)) + { + elog(DEBUG1, "writing statsfile for DB %d", dbentry->databaseid); + dbentry->stats_timestamp = globalStats.stats_timestamp; + pgstat_write_db_statsfile(dbentry, permanent); + } + + /* * Write out the DB entry including the number of live backends. We * don't write the tables or functions pointers, since they're of no * use to any other process. *************** *** 3493,3521 **** pgstat_write_statsfile(bool permanent) (void) rc; /* we'll check for error with ferror */ /* - * Walk through the database's access stats per table. - */ - hash_seq_init(&tstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) - { - fputc('T', fpout); - rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * Walk through the database's function stats table. - */ - hash_seq_init(&fstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) - { - fputc('F', fpout); - rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* * Mark the end of this DB */ fputc('d', fpout); } --- 3559,3568 ---- (void) rc; /* we'll check for error with ferror */ /* * Mark the end of this DB + * + * TODO Does using these chars still make sense, when the tables/func + * stats are moved to a separate file? */ fputc('d', fpout); } *************** *** 3527,3532 **** pgstat_write_statsfile(bool permanent) --- 3574,3607 ---- */ fputc('E', fpout); + /* In any case, we can just throw away all the db requests, but we need to + * write dummy files for databases without a stat entry (it would cause + * issues in pgstat_read_db_statsfile_timestamp and pgstat wait timeouts). + * This may happen e.g. for shared DB (oid = 0) right after initdb. 
+ */ + if (!slist_is_empty(&last_statrequests)) + { + slist_mutable_iter iter; + + slist_foreach_modify(iter, &last_statrequests) + { + DBWriteRequest *req = slist_container(DBWriteRequest, next, + iter.cur); + + /* + * Create dummy files for requested databases without a proper + * dbentry. It's much easier this way than dealing with multiple + * timestamps, possibly existing but not yet written DBs etc. + * */ + if (!pgstat_get_db_entry(req->databaseid, false)) + pgstat_write_db_dummyfile(req->databaseid); + + pfree(req); + } + + slist_init(&last_statrequests); + } + if (ferror(fpout)) { ereport(LOG, *************** *** 3552,3608 **** pgstat_write_statsfile(bool permanent) tmpfile, statfile))); unlink(tmpfile); } - else - { - /* - * Successful write, so update last_statwrite. - */ - last_statwrite = globalStats.stats_timestamp; - - /* - * If there is clock skew between backends and the collector, we could - * receive a stats request time that's in the future. If so, complain - * and reset last_statrequest. Resetting ensures that no inquiry - * message can cause more than one stats file write to occur. - */ - if (last_statrequest > last_statwrite) - { - char *reqtime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - reqtime = pstrdup(timestamptz_to_str(last_statrequest)); - mytime = pstrdup(timestamptz_to_str(last_statwrite)); - elog(LOG, "last_statrequest %s is later than collector's time %s", - reqtime, mytime); - pfree(reqtime); - pfree(mytime); - - last_statrequest = last_statwrite; - } - } if (permanent) unlink(pgstat_stat_filename); } /* ---------- * pgstat_read_statsfile() - * * Reads in an existing statistics collector file and initializes the * databases' hash table (whose entries point to the tables' hash tables). * ---------- */ static HTAB * ! 
pgstat_read_statsfile(Oid onlydb, bool permanent) { PgStat_StatDBEntry *dbentry; PgStat_StatDBEntry dbbuf; - PgStat_StatTabEntry *tabentry; - PgStat_StatTabEntry tabbuf; - PgStat_StatFuncEntry funcbuf; - PgStat_StatFuncEntry *funcentry; HASHCTL hash_ctl; HTAB *dbhash; HTAB *tabhash = NULL; --- 3627,3905 ---- tmpfile, statfile))); unlink(tmpfile); } if (permanent) unlink(pgstat_stat_filename); } + /* + * return the length that a DB stat file would have (including terminating \0) + * + * XXX We could avoid this overhead by caching a maximum length in + * assign_pgstat_temp_directory; also the distinctions on "permanent" and + * "tempname" seem pointless (what do you mean to save one byte of stack + * space!?) + */ + static int + get_dbstat_file_len(bool permanent, bool tempname, Oid databaseid) + { + char tmp[1]; + int len; + + /* don't actually print, but return how many chars would be used */ + len = snprintf(tmp, 1, "%s/db_%u.%s", + permanent ? "pg_stat" : pgstat_stat_directory, + databaseid, + tempname ? "tmp" : "stat"); + /* XXX pointless? */ + if (len >= MAXPGPATH) + elog(PANIC, "pgstat path too long"); + + /* count terminating \0 */ + return len + 1; + } + + /* + * return the filename for a DB stat file; filename is the output buffer, + * and len is its length. + */ + static void + get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, + char *filename, int len) + { + #ifdef USE_ASSERT_CHECKING + int printed; + + printed = + #endif + snprintf(filename, len, "%s/db_%u.%s", + permanent ? "pg_stat" : pgstat_stat_directory, + databaseid, + tempname ? "tmp" : "stat"); + Assert(printed <= len); + } + + /* ---------- + * pgstat_write_db_statsfile() - + * + * Tell the news. This writes stats file for a single database. + * + * If writing to the permanent file (happens when the collector is + * shutting down only), remove the temporary file so that backends + * starting up under a new postmaster can't read the old data before + * the new collector is ready. 
+ * ---------- + */ + static void + pgstat_write_db_statsfile(PgStat_StatDBEntry * dbentry, bool permanent) + { + HASH_SEQ_STATUS tstat; + HASH_SEQ_STATUS fstat; + PgStat_StatTabEntry *tabentry; + PgStat_StatFuncEntry *funcentry; + FILE *fpout; + int32 format_id; + Oid dbid = dbentry->databaseid; + int rc; + int tmpfilelen = get_dbstat_file_len(permanent, true, dbid); + char tmpfile[tmpfilelen]; + int statfilelen = get_dbstat_file_len(permanent, false, dbid); + char statfile[statfilelen]; + + get_dbstat_filename(permanent, true, dbid, tmpfile, tmpfilelen); + get_dbstat_filename(permanent, false, dbid, statfile, statfilelen); + + elog(DEBUG1, "writing statsfile '%s'", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Write the file header --- currently just a format ID. + */ + format_id = PGSTAT_FILE_FORMAT_ID; + rc = fwrite(&format_id, sizeof(format_id), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write the timestamp. + */ + rc = fwrite(&(globalStats.stats_timestamp), sizeof(globalStats.stats_timestamp), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Walk through the database's access stats per table. + */ + hash_seq_init(&tstat, dbentry->tables); + while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) + { + fputc('T', fpout); + rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + + /* + * Walk through the database's function stats table. 
+ */ + hash_seq_init(&fstat, dbentry->functions); + while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) + { + fputc('F', fpout); + rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } + + if (permanent) + { + elog(DEBUG1, "removing temporary stat file '%s'", tmpfile); + unlink(tmpfile); + } + } + + + /* ---------- + * pgstat_write_db_dummyfile() - + * + * All this does is writing a dummy stat file for databases without dbentry + * yet. It basically writes just a file header - format ID and a timestamp. 
+ * ---------- + */ + static void + pgstat_write_db_dummyfile(Oid databaseid) + { + FILE *fpout; + int32 format_id; + int rc; + int tmpfilelen = get_dbstat_file_len(false, true, databaseid); + char tmpfile[tmpfilelen]; + int statfilelen = get_dbstat_file_len(false, false, databaseid); + char statfile[statfilelen]; + + get_dbstat_filename(false, true, databaseid, tmpfile, tmpfilelen); + get_dbstat_filename(false, false, databaseid, statfile, statfilelen); + + elog(DEBUG1, "writing statsfile '%s'", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Write the file header --- currently just a format ID. + */ + format_id = PGSTAT_FILE_FORMAT_ID; + rc = fwrite(&format_id, sizeof(format_id), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write the timestamp. + */ + rc = fwrite(&(globalStats.stats_timestamp), sizeof(globalStats.stats_timestamp), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite above. 
+ */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary dummy statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary dummy statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary dummy statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } + + } /* ---------- * pgstat_read_statsfile() - * * Reads in an existing statistics collector file and initializes the * databases' hash table (whose entries point to the tables' hash tables). + * + * Allows reading only the global stats (at database level), which is just + * enough for many purposes (e.g. autovacuum launcher etc.). If this is + * sufficient for you, use onlydbs=true. * ---------- */ static HTAB * ! pgstat_read_statsfile(Oid onlydb, bool permanent, bool onlydbs) { PgStat_StatDBEntry *dbentry; PgStat_StatDBEntry dbbuf; HASHCTL hash_ctl; HTAB *dbhash; HTAB *tabhash = NULL; *************** *** 3613,3618 **** pgstat_read_statsfile(Oid onlydb, bool permanent) --- 3910,3920 ---- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; /* + * If we want a db-level stats only, we don't want a particular db. + */ + Assert(!((onlydb != InvalidOid) && onlydbs)); + + /* * The tables will live in pgStatLocalContext. */ pgstat_setup_memcxt(); *************** *** 3758,3763 **** pgstat_read_statsfile(Oid onlydb, bool permanent) --- 4060,4075 ---- */ tabhash = dbentry->tables; funchash = dbentry->functions; + + /* + * Read the data from the file for this database. If there was + * onlydb specified (!= InvalidOid), we would not get here because + * of a break above. So we don't need to recheck. 
+ */ + if (!onlydbs) + pgstat_read_db_statsfile(dbentry->databaseid, tabhash, funchash, + permanent); + break; /* *************** *** 3768,3773 **** pgstat_read_statsfile(Oid onlydb, bool permanent) --- 4080,4177 ---- funchash = NULL; break; + case 'E': + goto done; + + default: + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + } + + done: + FreeFile(fpin); + + if (permanent) + unlink(PGSTAT_STAT_PERMANENT_FILENAME); + + return dbhash; + } + + + /* ---------- + * pgstat_read_db_statsfile() - + * + * Reads in an existing statistics collector db file and initializes the + * tables and functions hash tables (for the database identified by Oid). + * ---------- + */ + static void + pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent) + { + PgStat_StatTabEntry *tabentry; + PgStat_StatTabEntry tabbuf; + PgStat_StatFuncEntry funcbuf; + PgStat_StatFuncEntry *funcentry; + FILE *fpin; + int32 format_id; + TimestampTz timestamp; + bool found; + int statfilelen = get_dbstat_file_len(permanent, false, databaseid); + char statfile[statfilelen]; + + get_dbstat_filename(permanent, false, databaseid, statfile, statfilelen); + + /* + * Try to open the status file. If it doesn't exist, the backends simply + * return zero for anything and the collector simply starts from scratch + * with empty counters. + * + * ENOENT is a possibility if the stats collector is not running or has + * not yet written the stats file the first time. Any other failure + * condition is suspicious. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + return; + } + + /* + * Verify it's of the expected format. 
+ */ + if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) + || format_id != PGSTAT_FILE_FORMAT_ID) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + /* + * Read global stats struct + */ + if (fread(&timestamp, 1, sizeof(timestamp), fpin) != sizeof(timestamp)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + /* + * We found an existing collector stats file. Read it and put all the + * hashtable entries into place. + */ + for (;;) + { + switch (fgetc(fpin)) + { /* * 'T' A PgStat_StatTabEntry follows. */ *************** *** 3854,3878 **** done: FreeFile(fpin); if (permanent) ! unlink(PGSTAT_STAT_PERMANENT_FILENAME); ! return dbhash; } /* ---------- ! * pgstat_read_statsfile_timestamp() - * ! * Attempt to fetch the timestamp of an existing stats file. * Returns TRUE if successful (timestamp is stored at *ts). * ---------- */ static bool ! pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) { ! PgStat_GlobalStats myGlobalStats; FILE *fpin; int32 format_id; ! const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; /* * Try to open the status file. As above, anything but ENOENT is worthy --- 4258,4294 ---- FreeFile(fpin); if (permanent) ! { ! int statfilelen = get_dbstat_file_len(permanent, false, databaseid); ! char statfile[statfilelen]; ! get_dbstat_filename(permanent, false, databaseid, statfile, statfilelen); ! ! elog(DEBUG1, "removing permanent stats file '%s'", statfile); ! unlink(statfile); ! } ! ! return; } + /* ---------- ! * pgstat_read_db_statsfile_timestamp() - * ! * Attempt to fetch the timestamp of an existing stats file (for a DB). * Returns TRUE if successful (timestamp is stored at *ts). * ---------- */ static bool ! pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, TimestampTz *ts) { ! 
TimestampTz timestamp; FILE *fpin; int32 format_id; ! int filenamelen = get_dbstat_file_len(permanent, false, databaseid); ! char statfile[filenamelen]; ! ! get_dbstat_filename(permanent, false, databaseid, statfile, filenamelen); /* * Try to open the status file. As above, anything but ENOENT is worthy *************** *** 3903,3909 **** pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) /* * Read global stats struct */ ! if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), fpin) != sizeof(myGlobalStats)) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); --- 4319,4325 ---- /* * Read global stats struct */ ! if (fread(&timestamp, 1, sizeof(TimestampTz), fpin) != sizeof(TimestampTz)) { ereport(pgStatRunningInCollector ? LOG : WARNING, (errmsg("corrupted statistics file \"%s\"", statfile))); *************** *** 3911,3917 **** pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) return false; } ! *ts = myGlobalStats.stats_timestamp; FreeFile(fpin); return true; --- 4327,4333 ---- return false; } ! *ts = timestamp; FreeFile(fpin); return true; *************** *** 3947,3953 **** backend_read_statsfile(void) CHECK_FOR_INTERRUPTS(); ! ok = pgstat_read_statsfile_timestamp(false, &file_ts); cur_ts = GetCurrentTimestamp(); /* Calculate min acceptable timestamp, if we didn't already */ --- 4363,4369 ---- CHECK_FOR_INTERRUPTS(); ! ok = pgstat_read_db_statsfile_timestamp(MyDatabaseId, false, &file_ts); cur_ts = GetCurrentTimestamp(); /* Calculate min acceptable timestamp, if we didn't already */ *************** *** 4006,4012 **** backend_read_statsfile(void) pfree(mytime); } ! pgstat_send_inquiry(cur_ts, min_ts); break; } --- 4422,4428 ---- pfree(mytime); } ! pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId); break; } *************** *** 4016,4022 **** backend_read_statsfile(void) /* Not there or too old, so kick the collector and wait a bit */ if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) ! 
pgstat_send_inquiry(cur_ts, min_ts); pg_usleep(PGSTAT_RETRY_DELAY * 1000L); } --- 4432,4438 ---- /* Not there or too old, so kick the collector and wait a bit */ if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) ! pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId); pg_usleep(PGSTAT_RETRY_DELAY * 1000L); } *************** *** 4026,4034 **** backend_read_statsfile(void) /* Autovacuum launcher wants stats about all databases */ if (IsAutoVacuumLauncherProcess()) ! pgStatDBHash = pgstat_read_statsfile(InvalidOid, false); else ! pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false); } --- 4442,4457 ---- /* Autovacuum launcher wants stats about all databases */ if (IsAutoVacuumLauncherProcess()) ! /* ! * FIXME Does it really need info including tables/functions? Or is it enough to read ! * database-level stats? It seems to me the launcher needs PgStat_StatDBEntry only ! * (at least that's how I understand the rebuild_database_list() in autovacuum.c), ! * because pgstat_stattabentries are used in do_autovacuum() only, that that's what's ! * executed in workers ... So maybe we'd be just fine by reading in the dbentries? ! */ ! pgStatDBHash = pgstat_read_statsfile(InvalidOid, false, true); else ! pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false, false); } *************** *** 4084,4109 **** pgstat_clear_snapshot(void) static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) { /* ! * Advance last_statrequest if this requestor has a newer cutoff time ! * than any previous request. */ ! if (msg->cutoff_time > last_statrequest) ! last_statrequest = msg->cutoff_time; /* ! * If the requestor's local clock time is older than last_statwrite, we * should suspect a clock glitch, ie system time going backwards; though * the more likely explanation is just delayed message receipt. It is * worth expending a GetCurrentTimestamp call to be sure, since a large * retreat in the system clock reading could otherwise cause us to neglect * to update the stats file for a long time. 
*/ ! if (msg->clock_time < last_statwrite) { TimestampTz cur_ts = GetCurrentTimestamp(); ! if (cur_ts < last_statwrite) { /* * Sure enough, time went backwards. Force a new stats file write --- 4507,4559 ---- static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) { + slist_iter iter; + bool found = false; + DBWriteRequest *newreq; + PgStat_StatDBEntry *dbentry; + + elog(DEBUG1, "received inquiry for %d", msg->databaseid); + + /* + * Find the last write request for this DB (found=true in that case). Plain + * linear search, not really worth doing any magic here (probably). + */ + slist_foreach(iter, &last_statrequests) + { + DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur); + + if (req->databaseid != msg->databaseid) + continue; + + if (msg->cutoff_time > req->request_time) + req->request_time = msg->cutoff_time; + found = true; + return; + } + /* ! * There's no request for this DB yet, so create one. */ ! newreq = palloc(sizeof(DBWriteRequest)); ! ! newreq->databaseid = msg->databaseid; ! newreq->request_time = msg->clock_time; ! slist_push_head(&last_statrequests, &newreq->next); /* ! * If the requestor's local clock time is older than stats_timestamp, we * should suspect a clock glitch, ie system time going backwards; though * the more likely explanation is just delayed message receipt. It is * worth expending a GetCurrentTimestamp call to be sure, since a large * retreat in the system clock reading could otherwise cause us to neglect * to update the stats file for a long time. */ ! dbentry = pgstat_get_db_entry(msg->databaseid, false); ! if ((dbentry != NULL) && (msg->clock_time < dbentry->stats_timestamp)) { TimestampTz cur_ts = GetCurrentTimestamp(); ! if (cur_ts < dbentry->stats_timestamp) { /* * Sure enough, time went backwards. Force a new stats file write *************** *** 4113,4127 **** pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) char *mytime; /* Copy because timestamptz_to_str returns a static buffer */ ! 
writetime = pstrdup(timestamptz_to_str(last_statwrite)); mytime = pstrdup(timestamptz_to_str(cur_ts)); ! elog(LOG, "last_statwrite %s is later than collector's time %s", ! writetime, mytime); pfree(writetime); pfree(mytime); ! last_statrequest = cur_ts; ! last_statwrite = last_statrequest - 1; } } } --- 4563,4578 ---- char *mytime; /* Copy because timestamptz_to_str returns a static buffer */ ! writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp)); mytime = pstrdup(timestamptz_to_str(cur_ts)); ! elog(LOG, ! "stats_timestamp %s is later than collector's time %s for db %d", ! writetime, mytime, dbentry->databaseid); pfree(writetime); pfree(mytime); ! newreq->request_time = cur_ts; ! dbentry->stats_timestamp = cur_ts - 1; } } } *************** *** 4270,4298 **** pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len) static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) { PgStat_StatDBEntry *dbentry; /* * Lookup the database in the hashtable. */ ! dbentry = pgstat_get_db_entry(msg->m_databaseid, false); /* ! * If found, remove it. */ if (dbentry) { if (dbentry->tables != NULL) hash_destroy(dbentry->tables); if (dbentry->functions != NULL) hash_destroy(dbentry->functions); if (hash_search(pgStatDBHash, ! (void *) &(dbentry->databaseid), HASH_REMOVE, NULL) == NULL) ereport(ERROR, ! (errmsg("database hash table corrupted " ! "during cleanup --- abort"))); } } --- 4721,4757 ---- static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) { + Oid dbid = msg->m_databaseid; PgStat_StatDBEntry *dbentry; /* * Lookup the database in the hashtable. */ ! dbentry = pgstat_get_db_entry(dbid, false); /* ! * If found, remove it (along with the db statfile). 
*/ if (dbentry) { + int statfilelen = get_dbstat_file_len(true, false, dbid); + char statfile[statfilelen]; + + get_dbstat_filename(true, false, dbid, statfile, statfilelen); + + elog(DEBUG1, "removing %s", statfile); + unlink(statfile); + if (dbentry->tables != NULL) hash_destroy(dbentry->tables); if (dbentry->functions != NULL) hash_destroy(dbentry->functions); if (hash_search(pgStatDBHash, ! (void *) &dbid, HASH_REMOVE, NULL) == NULL) ereport(ERROR, ! (errmsg("database hash table corrupted during cleanup --- abort"))); } } *************** *** 4687,4689 **** pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) --- 5146,5206 ---- HASH_REMOVE, NULL); } } + + /* ---------- + * pgstat_write_statsfile_needed() - + * + * Checks whether there's a db stats request, requiring a file write. + * + * TODO Seems that, thanks to the way we handle last_statrequests (erase after + * a write), this is unnecessary. Just check that there's at least one + * request and you're done. Although there might be delayed requests ... + * ---------- + */ + static bool + pgstat_write_statsfile_needed(void) + { + PgStat_StatDBEntry *dbentry; + slist_iter iter; + + /* Check the databases if they need to refresh the stats. */ + slist_foreach(iter, &last_statrequests) + { + DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur); + + dbentry = pgstat_get_db_entry(req->databaseid, false); + + /* No dbentry yet or too old. */ + if (!dbentry || (dbentry->stats_timestamp < req->request_time)) + { + return true; + } + } + + /* Well, everything was written recently ... */ + return false; + } + + /* ---------- + * pgstat_db_requested() - + * + * Checks whether stats for a particular DB need to be written to a file. + * ---------- + */ + + static bool + pgstat_db_requested(Oid databaseid) + { + slist_iter iter; + + /* Check the databases if they need to refresh the stats. 
*/ + slist_foreach(iter, &last_statrequests) + { + DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur); + + if (req->databaseid == databaseid) + return true; + } + + return false; + } *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 8704,8717 **** static void assign_pgstat_temp_directory(const char *newval, void *extra) { /* check_canonical_path already canonicalized newval for us */ char *tname; char *fname; ! tname = guc_malloc(ERROR, strlen(newval) + 12); /* /pgstat.tmp */ ! sprintf(tname, "%s/pgstat.tmp", newval); ! fname = guc_malloc(ERROR, strlen(newval) + 13); /* /pgstat.stat */ ! sprintf(fname, "%s/pgstat.stat", newval); if (pgstat_stat_tmpname) free(pgstat_stat_tmpname); pgstat_stat_tmpname = tname; --- 8704,8726 ---- assign_pgstat_temp_directory(const char *newval, void *extra) { /* check_canonical_path already canonicalized newval for us */ + char *dname; char *tname; char *fname; ! /* directory */ ! dname = guc_malloc(ERROR, strlen(newval) + 1); /* runtime dir */ ! 
sprintf(dname, "%s", newval); + /* global stats */ + tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */ + sprintf(tname, "%s/global.tmp", newval); + fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */ + sprintf(fname, "%s/global.stat", newval); + + if (pgstat_stat_directory) + free(pgstat_stat_directory); + pgstat_stat_directory = dname; if (pgstat_stat_tmpname) free(pgstat_stat_tmpname); pgstat_stat_tmpname = tname; *** a/src/bin/initdb/initdb.c --- b/src/bin/initdb/initdb.c *************** *** 192,197 **** const char *subdirs[] = { --- 192,198 ---- "base", "base/1", "pg_tblspc", + "pg_stat", "pg_stat_tmp" }; *** a/src/include/pgstat.h --- b/src/include/pgstat.h *************** *** 205,210 **** typedef struct PgStat_MsgInquiry --- 205,211 ---- PgStat_MsgHdr m_hdr; TimestampTz clock_time; /* observed local clock time */ TimestampTz cutoff_time; /* minimum acceptable file timestamp */ + Oid databaseid; /* requested DB (InvalidOid => all DBs) */ } PgStat_MsgInquiry; *************** *** 514,520 **** typedef union PgStat_Msg * ------------------------------------------------------------ */ ! #define PGSTAT_FILE_FORMAT_ID 0x01A5BC9A /* ---------- * PgStat_StatDBEntry The collector's data per database --- 515,521 ---- * ------------------------------------------------------------ */ ! 
#define PGSTAT_FILE_FORMAT_ID 0xA240CA47 /* ---------- * PgStat_StatDBEntry The collector's data per database *************** *** 545,550 **** typedef struct PgStat_StatDBEntry --- 546,552 ---- PgStat_Counter n_block_write_time; TimestampTz stat_reset_timestamp; + TimestampTz stats_timestamp; /* time of db stats file update */ /* * tables and functions must be last in the struct, because we don't write *************** *** 722,727 **** extern bool pgstat_track_activities; --- 724,730 ---- extern bool pgstat_track_counts; extern int pgstat_track_functions; extern PGDLLIMPORT int pgstat_track_activity_query_size; + extern char *pgstat_stat_directory; extern char *pgstat_stat_tmpname; extern char *pgstat_stat_filename;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers