shared-memory based stats collector

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
86 messages Options
12345
Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Alvaro Herrera-9
On 2019-Jul-11, Kyotaro Horiguchi wrote:

> Hello. This is v21 of the patch.
>
> > CF-bot warned that it doesn't work on Windows. I'm experiencing
> > a very painful time waiting for TortoiseGit, which walks slowly
> > as its name suggests. It would be fixed in the next version.
>
> Found a bug in initialization. StatsShememInit() was placed at a
> wrong place and stats code on child processes accessed
> uninitialized pointer. It is a leftover from the previous shape
> where dsm was activated on postmaster.

This doesn't apply anymore.  Can you please rebase?

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Kyotaro Horiguchi-4
At Tue, 3 Sep 2019 18:28:05 -0400, Alvaro Herrera <[hidden email]> wrote in <[hidden email]>
> > Found a bug in initialization. StatsShememInit() was placed at a
> > wrong place and stats code on child processes accessed
> > uninitialized pointer. It is a leftover from the previous shape
> > where dsm was activated on postmaster.
>
> This doesn't apply anymore.  Can you please rebase?

Thanks! I forgot to post rebased version after doing. Here it is.

- (Re)Rebased to the current master.
- Passed all tests for me.

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

From 99b86de7e647c74a01fb694c2a868fa24fdf6424 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Fri, 29 Jun 2018 16:41:04 +0900
Subject: [PATCH v22 1/5] sequential scan for dshash

Add sequential scan feature to dshash.
---
 src/backend/lib/dshash.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/include/lib/dshash.h |  23 +++++-
 2 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 350f8c0a66..4f0c7ec840 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -112,6 +112,7 @@ struct dshash_table
  size_t size_log2; /* log2(number of buckets) */
  bool find_locked; /* Is any partition lock held by 'find'? */
  bool find_exclusively_locked; /* ... exclusively? */
+ bool seqscan_running;/* now under sequential scan */
 };
 
 /* Given a pointer to an item, find the entry (user data) it holds. */
@@ -127,6 +128,10 @@ struct dshash_table
 #define NUM_SPLITS(size_log2) \
  (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
 
+/* How many buckets are there in a given size? */
+#define NUM_BUCKETS(size_log2) \
+ (((size_t) 1) << (size_log2))
+
 /* How many buckets are there in each partition at a given size? */
 #define BUCKETS_PER_PARTITION(size_log2) \
  (((size_t) 1) << NUM_SPLITS(size_log2))
@@ -153,6 +158,10 @@ struct dshash_table
 #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
  ((partition) << NUM_SPLITS(size_log2))
 
+/* Choose partition based on bucket index. */
+#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
+ ((bucket_idx) >> NUM_SPLITS(size_log2))
+
 /* The head of the active bucket for a given hash value (lvalue). */
 #define BUCKET_FOR_HASH(hash_table, hash) \
  (hash_table->buckets[ \
@@ -228,6 +237,7 @@ dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
 
  /*
  * Set up the initial array of buckets.  Our initial size is the same as
@@ -279,6 +289,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params,
  hash_table->control = dsa_get_address(area, control);
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
  Assert(hash_table->control->magic == DSHASH_MAGIC);
 
  /*
@@ -324,7 +335,7 @@ dshash_destroy(dshash_table *hash_table)
  ensure_valid_bucket_pointers(hash_table);
 
  /* Free all the entries. */
- size = ((size_t) 1) << hash_table->size_log2;
+ size = NUM_BUCKETS(hash_table->size_log2);
  for (i = 0; i < size; ++i)
  {
  dsa_pointer item_pointer = hash_table->buckets[i];
@@ -549,9 +560,14 @@ dshash_delete_entry(dshash_table *hash_table, void *entry)
  LW_EXCLUSIVE));
 
  delete_item(hash_table, item);
- hash_table->find_locked = false;
- hash_table->find_exclusively_locked = false;
- LWLockRelease(PARTITION_LOCK(hash_table, partition));
+
+ /* We need to keep partition lock while sequential scan */
+ if (!hash_table->seqscan_running)
+ {
+ hash_table->find_locked = false;
+ hash_table->find_exclusively_locked = false;
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+ }
 }
 
 /*
@@ -568,6 +584,8 @@ dshash_release_lock(dshash_table *hash_table, void *entry)
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index),
  hash_table->find_exclusively_locked
  ? LW_EXCLUSIVE : LW_SHARED));
+ /* lock is under control of sequential scan */
+ Assert(!hash_table->seqscan_running);
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
@@ -592,6 +610,168 @@ dshash_memhash(const void *v, size_t size, void *arg)
  return tag_hash(v, size);
 }
 
+/*
+ * dshash_seq_init/_next/_term
+ *           Sequentially scan through the dshash table and return all the
+ *           elements one by one, return NULL when no more.
+ *
+ * dshash_seq_term should be called if and only if the scan is abandoned
+ * before completion; if dshash_seq_next returns NULL then it has already done
+ * the end-of-scan cleanup.
+ *
+ * On returning an element, it is locked as is the case with dshash_find.
+ * However, the caller must not release the lock. The lock is released as
+ * necessary in continued scan.
+ *
+ * As opposed to the equivalent for dynahash, the caller is not supposed to
+ * delete the returned element before continuing the scan.
+ *
+ * If consistent is set for dshash_seq_init, the whole hash table is
+ * non-exclusively locked. Otherwise a part of the hash table is locked in the
+ * same mode (partition lock).
+ */
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive)
+{
+ /* allowed at most one scan at once */
+ Assert(!hash_table->seqscan_running);
+
+ status->hash_table = hash_table;
+ status->curbucket = 0;
+ status->nbuckets = 0;
+ status->curitem = NULL;
+ status->pnextitem = InvalidDsaPointer;
+ status->curpartition = -1;
+ status->consistent = consistent;
+ status->exclusive = exclusive;
+ hash_table->seqscan_running = true;
+
+ /*
+ * Protect all partitions from modification if the caller wants a
+ * consistent result.
+ */
+ if (consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, i),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ }
+ ensure_valid_bucket_pointers(hash_table);
+ }
+}
+
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+ dsa_pointer next_item_pointer;
+
+ Assert(status->hash_table->seqscan_running);
+ if (status->curitem == NULL)
+ {
+ int partition;
+
+ Assert (status->curbucket == 0);
+ Assert(!status->hash_table->find_locked);
+
+ /* first shot. grab the first item. */
+ if (!status->consistent)
+ {
+ partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+ LWLockAcquire(PARTITION_LOCK(status->hash_table, partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ status->curpartition = partition;
+
+ /* resize doesn't happen from now until seq scan ends */
+ status->nbuckets =
+ NUM_BUCKETS(status->hash_table->control->size_log2);
+ ensure_valid_bucket_pointers(status->hash_table);
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+ else
+ next_item_pointer = status->pnextitem;
+
+ /* Move to the next bucket if we finished the current bucket */
+ while (!DsaPointerIsValid(next_item_pointer))
+ {
+ if (++status->curbucket >= status->nbuckets)
+ {
+ /* all buckets have been scanned. finish. */
+ dshash_seq_term(status);
+ return NULL;
+ }
+
+ /* Also move partition lock if needed */
+ if (!status->consistent)
+ {
+ int next_partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+
+ /* Move lock along with partition for the bucket */
+ if (status->curpartition != next_partition)
+ {
+ /*
+ * Take lock on the next partition then release the current,
+ * not in the reverse order. This is required to avoid
+ * resizing from happening during a sequential scan. Locks are
+ * taken in partition order so no deadlock happens with other
+ * seq scans or resizing.
+ */
+ LWLockAcquire(PARTITION_LOCK(status->hash_table,
+ next_partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ LWLockRelease(PARTITION_LOCK(status->hash_table,
+ status->curpartition));
+ status->curpartition = next_partition;
+ }
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+
+ status->curitem =
+ dsa_get_address(status->hash_table->area, next_item_pointer);
+ status->hash_table->find_locked = true;
+ status->hash_table->find_exclusively_locked = status->exclusive;
+
+ /*
+ * This item can be deleted by the caller. Store the next item for the
+ * next iteration for the occasion.
+ */
+ status->pnextitem = status->curitem->next;
+
+ return ENTRY_FROM_ITEM(status->curitem);
+}
+
+void
+dshash_seq_term(dshash_seq_status *status)
+{
+ Assert(status->hash_table->seqscan_running);
+ status->hash_table->find_locked = false;
+ status->hash_table->find_exclusively_locked = false;
+ status->hash_table->seqscan_running = false;
+
+ if (status->consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, i));
+ }
+ else if (status->curpartition >= 0)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
+}
+
 /*
  * Print debugging information about the internal state of the hash table to
  * stderr.  The caller must hold no partition locks.
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index fa2e28ff3e..79698a6ad6 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -59,6 +59,23 @@ typedef struct dshash_parameters
 struct dshash_table_item;
 typedef struct dshash_table_item dshash_table_item;
 
+/*
+ * Sequential scan state of dshash. The detail is exposed since the storage
+ * size should be known to users but it should be considered as an opaque
+ * type by callers.
+ */
+typedef struct dshash_seq_status
+{
+ dshash_table   *hash_table;
+ int curbucket;
+ int nbuckets;
+ dshash_table_item  *curitem;
+ dsa_pointer pnextitem;
+ int curpartition;
+ bool consistent;
+ bool exclusive;
+} dshash_seq_status;
+
 /* Creating, sharing and destroying from hash tables. */
 extern dshash_table *dshash_create(dsa_area *area,
    const dshash_parameters *params,
@@ -70,7 +87,6 @@ extern dshash_table *dshash_attach(dsa_area *area,
 extern void dshash_detach(dshash_table *hash_table);
 extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table);
 extern void dshash_destroy(dshash_table *hash_table);
-
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
@@ -80,6 +96,11 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 
+/* seq scan support */
+extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive);
+extern void *dshash_seq_next(dshash_seq_status *status);
+extern void dshash_seq_term(dshash_seq_status *status);
 /* Convenience hash and compare functions wrapping memcmp and tag_hash. */
 extern int dshash_memcmp(const void *a, const void *b, size_t size, void *arg);
 extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg);
--
2.16.3


From 261e9ed8d118e7b3bce0c5a69a58eacff5b3c7d3 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 27 Sep 2018 11:15:19 +0900
Subject: [PATCH v22 2/5] Add conditional lock feature to dshash

Dshash currently waits for lock unconditionally. This commit adds new
interfaces for dshash_find and dshash_find_or_insert. The new
interfaces have an extra parameter "nowait" that commands not to wait
for lock.
---
 src/backend/lib/dshash.c | 69 +++++++++++++++++++++++++++++++++++++++++++-----
 src/include/lib/dshash.h |  6 +++++
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 4f0c7ec840..60a6e3c0bc 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -394,19 +394,48 @@ dshash_get_hash_table_handle(dshash_table *hash_table)
  */
 void *
 dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
+{
+ return dshash_find_extended(hash_table, key, exclusive, false, NULL);
+}
+
+/*
+ * Addition to dshash_find, returns immediately when nowait is true and the
+ * lock was not acquired. Lock status is set to *lock_acquired if given.
+ */
+void *
+dshash_find_extended(dshash_table *hash_table, const void *key,
+ bool exclusive, bool nowait, bool *lock_acquired)
 {
  dshash_hash hash;
  size_t partition;
  dshash_table_item *item;
 
+ /* allowing !nowait returning the result is just not sensible */
+ Assert(nowait || !lock_acquired);
+
  hash = hash_key(hash_table, key);
  partition = PARTITION_FOR_HASH(hash);
 
  Assert(hash_table->control->magic == DSHASH_MAGIC);
  Assert(!hash_table->find_locked);
 
- LWLockAcquire(PARTITION_LOCK(hash_table, partition),
-  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED))
+ {
+ if (lock_acquired)
+ *lock_acquired = false;
+ return NULL;
+ }
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+
+ if (lock_acquired)
+ *lock_acquired = true;
+
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -441,6 +470,22 @@ void *
 dshash_find_or_insert(dshash_table *hash_table,
   const void *key,
   bool *found)
+{
+ return dshash_find_or_insert_extended(hash_table, key, found, false);
+}
+
+/*
+ * Addition to dshash_find_or_insert, returns NULL if nowait is true and lock
+ * was not acquired.
+ *
+ * Notes above dshash_find_extended() regarding locking and error handling
+ * equally apply here.
+ */
+void *
+dshash_find_or_insert_extended(dshash_table *hash_table,
+   const void *key,
+   bool *found,
+   bool nowait)
 {
  dshash_hash hash;
  size_t partition_index;
@@ -455,8 +500,16 @@ dshash_find_or_insert(dshash_table *hash_table,
  Assert(!hash_table->find_locked);
 
 restart:
- LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
-  LW_EXCLUSIVE);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(
+ PARTITION_LOCK(hash_table, partition_index),
+ LW_EXCLUSIVE))
+ return NULL;
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
+  LW_EXCLUSIVE);
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -626,9 +679,11 @@ dshash_memhash(const void *v, size_t size, void *arg)
 * As opposed to the equivalent for dynahash, the caller is not supposed to
  * delete the returned element before continuing the scan.
  *
- * If consistent is set for dshash_seq_init, the whole hash table is
- * non-exclusively locked. Otherwise a part of the hash table is locked in the
- * same mode (partition lock).
+ * If consistent is set for dshash_seq_init, all the hash table
+ * partitions are locked in the requested mode (as determined by the
+ * exclusive flag), and the locks are held until the end of the scan.
+ * Otherwise the partition locks are acquired and released as needed
+ * during the scan (up to two partitions may be locked at the same time).
  */
 void
 dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 79698a6ad6..67f7d77f71 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -90,8 +90,14 @@ extern void dshash_destroy(dshash_table *hash_table);
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
+extern void *dshash_find_extended(dshash_table *hash_table, const void *key,
+  bool exclusive, bool nowait,
+  bool *lock_acquired);
 extern void *dshash_find_or_insert(dshash_table *hash_table,
    const void *key, bool *found);
+extern void *dshash_find_or_insert_extended(dshash_table *hash_table,
+ const void *key, bool *found,
+ bool nowait);
 extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
--
2.16.3


From 4647fcbf1ef032bec3090f6e2702e8cc9997ea6b Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Wed, 7 Nov 2018 16:53:49 +0900
Subject: [PATCH v22 3/5] Make archiver process an auxiliary process

This is a preliminary patch for shared-memory based stats collector.
Archiver process must be an auxiliary process since it uses shared
memory after stats data was moved onto shared-memory. Make the process
an auxiliary process in order to make it work.
---
 src/backend/bootstrap/bootstrap.c   |  8 +++
 src/backend/postmaster/pgarch.c     | 98 +++++++++----------------------------
 src/backend/postmaster/pgstat.c     |  6 +++
 src/backend/postmaster/postmaster.c | 35 +++++++++----
 src/include/miscadmin.h             |  2 +
 src/include/pgstat.h                |  1 +
 src/include/postmaster/pgarch.h     |  4 +-
 7 files changed, 67 insertions(+), 87 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 9238fbe98d..dde2485b14 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -329,6 +329,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
  case BgWriterProcess:
  statmsg = pgstat_get_backend_desc(B_BG_WRITER);
  break;
+ case ArchiverProcess:
+ statmsg = pgstat_get_backend_desc(B_ARCHIVER);
+ break;
  case CheckpointerProcess:
  statmsg = pgstat_get_backend_desc(B_CHECKPOINTER);
  break;
@@ -456,6 +459,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
  BackgroundWriterMain();
  proc_exit(1); /* should never return */
 
+ case ArchiverProcess:
+ /* don't set signals, archiver has its own agenda */
+ PgArchiverMain();
+ proc_exit(1); /* should never return */
+
  case CheckpointerProcess:
  /* don't set signals, checkpointer has its own agenda */
  CheckpointerMain();
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index f84f882c4c..4342ebdab4 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -77,7 +77,6 @@
  * Local data
  * ----------
  */
-static time_t last_pgarch_start_time;
 static time_t last_sigterm_time = 0;
 
 /*
@@ -96,7 +95,6 @@ static volatile sig_atomic_t ready_to_stop = false;
 static pid_t pgarch_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
 static void pgarch_exit(SIGNAL_ARGS);
 static void ArchSigHupHandler(SIGNAL_ARGS);
 static void ArchSigTermHandler(SIGNAL_ARGS);
@@ -114,75 +112,6 @@ static void pgarch_archiveDone(char *xlog);
  * ------------------------------------------------------------
  */
 
-/*
- * pgarch_start
- *
- * Called from postmaster at startup or after an existing archiver
- * died.  Attempt to fire up a fresh archiver process.
- *
- * Returns PID of child process, or 0 if fail.
- *
- * Note: if fail, we will be called again from the postmaster main loop.
- */
-int
-pgarch_start(void)
-{
- time_t curtime;
- pid_t pgArchPid;
-
- /*
- * Do nothing if no archiver needed
- */
- if (!XLogArchivingActive())
- return 0;
-
- /*
- * Do nothing if too soon since last archiver start.  This is a safety
- * valve to protect against continuous respawn attempts if the archiver is
- * dying immediately at launch. Note that since we will be re-called from
- * the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgarch_start_time) <
- (unsigned int) PGARCH_RESTART_INTERVAL)
- return 0;
- last_pgarch_start_time = curtime;
-
-#ifdef EXEC_BACKEND
- switch ((pgArchPid = pgarch_forkexec()))
-#else
- switch ((pgArchPid = fork_process()))
-#endif
- {
- case -1:
- ereport(LOG,
- (errmsg("could not fork archiver: %m")));
- return 0;
-
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
-
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
-
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
-
- PgArchiverMain(0, NULL);
- break;
-#endif
-
- default:
- return (int) pgArchPid;
- }
-
- /* shouldn't get here */
- return 0;
-}
-
 /* ------------------------------------------------------------
  * Local functions called by archiver follow
  * ------------------------------------------------------------
@@ -222,8 +151,8 @@ pgarch_forkexec(void)
  * The argc/argv parameters are valid only in EXEC_BACKEND case.  However,
  * since we don't use 'em, it hardly matters...
  */
-NON_EXEC_STATIC void
-PgArchiverMain(int argc, char *argv[])
+void
+PgArchiverMain(void)
 {
  /*
  * Ignore all signals usually bound to some action in the postmaster,
@@ -255,8 +184,27 @@ PgArchiverMain(int argc, char *argv[])
 static void
 pgarch_exit(SIGNAL_ARGS)
 {
- /* SIGQUIT means curl up and die ... */
- exit(1);
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * We DO NOT want to run proc_exit() callbacks -- we're here because
+ * shared memory may be corrupted, so we don't want to try to clean up our
+ * transaction.  Just nail the windows shut and get out of town.  Now that
+ * there's an atexit callback to prevent third-party code from breaking
+ * things by calling exit() directly, we have to reset the callbacks
+ * explicitly to make this work as intended.
+ */
+ on_exit_reset();
+
+ /*
+ * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+ * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+ * backend.  This is necessary precisely because we don't clean up our
+ * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+ * should ensure the postmaster sees this as a crash, too, but no harm in
+ * being doubly sure.)
+ */
+ exit(2);
 }
 
 /* SIGHUP signal handler for archiver process */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 011076c3e3..043e3ff9d2 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -2934,6 +2934,9 @@ pgstat_bestart(void)
  case StartupProcess:
  lbeentry.st_backendType = B_STARTUP;
  break;
+ case ArchiverProcess:
+ beentry->st_backendType = B_ARCHIVER;
+ break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
  break;
@@ -4277,6 +4280,9 @@ pgstat_get_backend_desc(BackendType backendType)
 
  switch (backendType)
  {
+ case B_ARCHIVER:
+ backendDesc = "archiver";
+ break;
  case B_AUTOVAC_LAUNCHER:
  backendDesc = "autovacuum launcher";
  break;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index a5446d54bb..582434252f 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -146,7 +146,8 @@
 #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */
 #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */
 #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */
-#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */
+#define BACKEND_TYPE_ARCHIVER 0x0010 /* archiver process */
+#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */
 
 #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER)
 
@@ -539,6 +540,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #endif /* EXEC_BACKEND */
 
 #define StartupDataBase() StartChildProcess(StartupProcess)
+#define StartArchiver() StartChildProcess(ArchiverProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
 #define StartCheckpointer() StartChildProcess(CheckpointerProcess)
 #define StartWalWriter() StartChildProcess(WalWriterProcess)
@@ -1762,7 +1764,7 @@ ServerLoop(void)
 
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /* If we need to signal the autovacuum launcher, do so now */
  if (avlauncher_needs_signal)
@@ -2991,7 +2993,7 @@ reaper(SIGNAL_ARGS)
  if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
  if (PgStatPID == 0)
  PgStatPID = pgstat_start();
 
@@ -3136,10 +3138,8 @@ reaper(SIGNAL_ARGS)
  {
  PgArchPID = 0;
  if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("archiver process"),
- pid, exitstatus);
- if (PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ HandleChildCrash(pid, exitstatus,
+ _("archiver process"));
  continue;
  }
 
@@ -3385,7 +3385,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, archiver or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -3590,6 +3590,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
  }
 
+ /* Take care of the archiver too */
+ if (pid == PgArchPID)
+ PgArchPID = 0;
+ else if (PgArchPID != 0 && take_action)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) PgArchPID)));
+ signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
  /*
  * Force a power-cycle of the pgarch process too.  (This isn't absolutely
  * necessary, but it seems like a good idea for robustness, and it
@@ -3862,6 +3874,7 @@ PostmasterStateMachine(void)
  Assert(CheckpointerPID == 0);
  Assert(WalWriterPID == 0);
  Assert(AutoVacPID == 0);
+ Assert(PgArchPID == 0);
  /* syslogger is not considered here */
  pmState = PM_NO_CHILDREN;
  }
@@ -5131,7 +5144,7 @@ sigusr1_handler(SIGNAL_ARGS)
  */
  Assert(PgArchPID == 0);
  if (XLogArchivingAlways())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /*
  * If we aren't planning to enter hot standby mode later, treat
@@ -5414,6 +5427,10 @@ StartChildProcess(AuxProcType type)
  ereport(LOG,
  (errmsg("could not fork startup process: %m")));
  break;
+ case ArchiverProcess:
+ ereport(LOG,
+ (errmsg("could not fork archiver process: %m")));
+ break;
  case BgWriterProcess:
  ereport(LOG,
  (errmsg("could not fork background writer process: %m")));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index bc6e03fbc7..1f4db67f3f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -399,6 +399,7 @@ typedef enum
  BootstrapProcess,
  StartupProcess,
  BgWriterProcess,
+ ArchiverProcess,
  CheckpointerProcess,
  WalWriterProcess,
  WalReceiverProcess,
@@ -411,6 +412,7 @@ extern AuxProcType MyAuxProcType;
 #define AmBootstrapProcess() (MyAuxProcType == BootstrapProcess)
 #define AmStartupProcess() (MyAuxProcType == StartupProcess)
 #define AmBackgroundWriterProcess() (MyAuxProcType == BgWriterProcess)
+#define AmArchiverProcess() (MyAuxProcType == ArchiverProcess)
 #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess)
 #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d823d..65713abc2b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -718,6 +718,7 @@ typedef struct PgStat_GlobalStats
  */
 typedef enum BackendType
 {
+ B_ARCHIVER,
  B_AUTOVAC_LAUNCHER,
  B_AUTOVAC_WORKER,
  B_BACKEND,
diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h
index 2474eac26a..88f16863d4 100644
--- a/src/include/postmaster/pgarch.h
+++ b/src/include/postmaster/pgarch.h
@@ -32,8 +32,6 @@
  */
 extern int pgarch_start(void);
 
-#ifdef EXEC_BACKEND
-extern void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
+extern void PgArchiverMain(void) pg_attribute_noreturn();
 
 #endif /* _PGARCH_H */
--
2.16.3


From 1a5a0e2bd49d2ec1c4a73c8be7c7d7d390c61a37 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 21 Feb 2019 12:44:56 +0900
Subject: [PATCH v22 4/5] Shared-memory based stats collector

Previously, activity statistics were shared via files on disk. Every
backend sends the numbers to the stats collector process via a socket.
It makes snapshots as a set of files on disk at a certain interval,
then every backend reads them as necessary. It worked fine for a
comparatively small set of statistics, but the set is under pressure
to grow and the file size has reached the order of megabytes. To deal
with the larger statistics set, this patch lets backends directly
share the statistics via shared memory.
---
 doc/src/sgml/monitoring.sgml                 |    6 +-
 src/backend/postmaster/autovacuum.c          |   12 +-
 src/backend/postmaster/pgstat.c              | 5661 ++++++++++++--------------
 src/backend/postmaster/postmaster.c          |  139 +-
 src/backend/storage/ipc/ipci.c               |    2 +
 src/backend/storage/lmgr/lwlock.c            |    1 +
 src/backend/tcop/postgres.c                  |   27 +-
 src/backend/utils/init/globals.c             |    1 +
 src/backend/utils/init/postinit.c            |   11 +
 src/bin/pg_basebackup/t/010_pg_basebackup.pl |    4 +-
 src/include/miscadmin.h                      |    1 +
 src/include/pgstat.h                         |  441 +-
 src/include/storage/lwlock.h                 |    1 +
 src/include/utils/timeout.h                  |    1 +
 14 files changed, 2637 insertions(+), 3671 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 828e9084dd..ea6aad4d1e 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -53,7 +53,6 @@ postgres  15554  0.0  0.0  57536  1184 ?        Ss   18:02   0:00 postgres: back
 postgres  15555  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: checkpointer
 postgres  15556  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: walwriter
 postgres  15557  0.0  0.0  58504  2244 ?        Ss   18:02   0:00 postgres: autovacuum launcher
-postgres  15558  0.0  0.0  17512  1068 ?        Ss   18:02   0:00 postgres: stats collector
 postgres  15582  0.0  0.0  58772  3080 ?        Ss   18:04   0:00 postgres: joe runbug 127.0.0.1 idle
 postgres  15606  0.0  0.0  58772  3052 ?        Ss   18:07   0:00 postgres: tgl regression [local] SELECT waiting
 postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl regression [local] idle in transaction
@@ -65,9 +64,8 @@ postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl
    master server process.  The command arguments
    shown for it are the same ones used when it was launched.  The next five
    processes are background worker processes automatically launched by the
-   master process.  (The <quote>stats collector</quote> process will not be present
-   if you have set the system not to start the statistics collector; likewise
-   the <quote>autovacuum launcher</quote> process can be disabled.)
+   master process.  (The <quote>autovacuum launcher</quote> process will not
+   be present if you have set the system not to start it.)
    Each of the remaining
    processes is a server process handling one client connection.  Each such
    process sets its command line display in the form
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 073f313337..a222817f55 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -1958,15 +1958,15 @@ do_autovacuum(void)
   ALLOCSET_DEFAULT_SIZES);
  MemoryContextSwitchTo(AutovacMemCxt);
 
+ /* Start a transaction so our commands have one to play into. */
+ StartTransactionCommand();
+
  /*
  * may be NULL if we couldn't find an entry (only happens if we are
  * forcing a vacuum for anti-wrap purposes).
  */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
 
- /* Start a transaction so our commands have one to play into. */
- StartTransactionCommand();
-
  /*
  * Clean up any dead statistics collector entries for this DB. We always
  * want to do this exactly once per DB-processing cycle, even if we find
@@ -2749,12 +2749,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
  if (isshared)
  {
  if (PointerIsValid(shared))
- tabentry = hash_search(shared->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(shared, relid);
  }
  else if (PointerIsValid(dbentry))
- tabentry = hash_search(dbentry->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
 
  return tabentry;
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 043e3ff9d2..c0b20763b0 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1,15 +1,23 @@
 /* ----------
  * pgstat.c
  *
- * All the statistics collector stuff hacked up in one big, ugly file.
+ * Statistics collector facility.
  *
- * TODO: - Separate collector, postmaster and backend stuff
- *  into different files.
+ *  Collects per-table and per-function usage statistics of all backends in
+ *  shared memory.  The pg_count_*() and friends interfaces store the activity
+ *  of every backend during a transaction; pgstat_flush_stat() is then called
+ *  at the end of a transaction to flush the local numbers out to shared memory.
  *
- * - Add some automatic call for pgstat vacuuming.
+ *  To avoid congestion on the shared memory, we update shared stats no more
+ *  often than at intervals of PGSTAT_STAT_MIN_INTERVAL (500ms).  If a backend
+ *  cannot flush all or part of its local numbers immediately, we postpone the
+ *  updates and retry after an interval of PGSTAT_STAT_RETRY_INTERVAL (100ms),
+ *  but the updates are not postponed longer than PGSTAT_STAT_MAX_INTERVAL
+ *  (1000ms).
  *
- * - Add a pgstat config column to pg_database, so this
- *  entire thing can be enabled/disabled on a per db basis.
+ *  The first process that uses the stats collector creates the area and then
+ *  loads the stored stats file, if any.  The last process at shutdown writes
+ *  the shared stats to the file and then destroys the area before exiting.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -19,18 +27,6 @@
 #include "postgres.h"
 
 #include <unistd.h>
-#include <fcntl.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/socket.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <signal.h>
-#include <time.h>
-#ifdef HAVE_SYS_SELECT_H
-#include <sys/select.h>
-#endif
 
 #include "pgstat.h"
 
@@ -42,66 +38,38 @@
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_proc.h"
-#include "common/ip.h"
 #include "libpq/libpq.h"
-#include "libpq/pqsignal.h"
-#include "mb/pg_wchar.h"
 #include "miscadmin.h"
-#include "pg_trace.h"
 #include "postmaster/autovacuum.h"
-#include "postmaster/fork_process.h"
-#include "postmaster/postmaster.h"
 #include "replication/walsender.h"
-#include "storage/backendid.h"
-#include "storage/dsm.h"
-#include "storage/fd.h"
 #include "storage/ipc.h"
-#include "storage/latch.h"
 #include "storage/lmgr.h"
-#include "storage/pg_shmem.h"
+#include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/sinvaladt.h"
 #include "utils/ascii.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
-#include "utils/ps_status.h"
-#include "utils/rel.h"
+#include "utils/probes.h"
 #include "utils/snapmgr.h"
-#include "utils/timestamp.h"
-
 
 /* ----------
  * Timer definitions.
  * ----------
  */
-#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
- * updates; in milliseconds. */
+#define PGSTAT_STAT_MIN_INTERVAL 500 /* Minimum time between stats data
+ * updates; in milliseconds. */
 
-#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a
- * new file; in milliseconds. */
-
-#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats
- * file update; in milliseconds. */
-
-#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a
- * new file; in milliseconds. */
-
-#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a
- * failed statistics collector; in
- * seconds. */
-
-#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
-#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
-
-/* Minimum receive buffer size for the collector's socket. */
-#define PGSTAT_MIN_RCVBUF (100 * 1024)
+#define PGSTAT_STAT_RETRY_INTERVAL 100 /* Retry interval once
+ * PGSTAT_STAT_MIN_INTERVAL has elapsed; in milliseconds */
 
+#define PGSTAT_STAT_MAX_INTERVAL   1000 /* Maximum time between stats data
+ * updates; in milliseconds. */
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
-#define PGSTAT_DB_HASH_SIZE 16
 #define PGSTAT_TAB_HASH_SIZE 512
 #define PGSTAT_FUNCTION_HASH_SIZE 512
 
@@ -117,6 +85,19 @@
  */
 #define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
 
+/*
+ * Operation mode and return code of pgstat_get_db_entry.
+ */
+#define PGSTAT_SHARED 0
+#define PGSTAT_EXCLUSIVE 1
+#define PGSTAT_NOWAIT 2
+
+typedef enum PgStat_TableLookupResult
+{
+ NOT_FOUND,
+ FOUND,
+ LOCK_FAILED
+} PgStat_TableLookupResult;
 
 /* ----------
  * GUC parameters
@@ -132,31 +113,63 @@ int pgstat_track_activity_query_size = 1024;
  * ----------
  */
 char   *pgstat_stat_directory = NULL;
+
+/* No longer used, but will be removed with GUC */
 char   *pgstat_stat_filename = NULL;
 char   *pgstat_stat_tmpname = NULL;
 
+#define StatsLock (&StatsShmem->StatsMainLock)
+
+/* Shared stats bootstrap information */
+typedef struct StatsShmemStruct
+{
+ LWLock StatsMainLock; /* lock protecting this struct */
+ dsa_handle stats_dsa_handle; /* DSA handle for stats collector */
+ dshash_table_handle db_hash_handle;
+ dsa_pointer global_stats;
+ dsa_pointer archiver_stats;
+ int refcount;
+} StatsShmemStruct;
+
 /*
- * BgWriter global statistics counters (unused in other processes).
- * Stored directly in a stats message structure so it can be sent
- * without needing to copy things around.  We assume this inits to zeroes.
+ * BgWriter global statistics counters.  The name is a remnant from the time
+ * when the stats collector was a dedicated process, which used sockets to
+ * send the stats.
  */
-PgStat_MsgBgWriter BgWriterStats;
+PgStat_MsgBgWriter BgWriterStats = {0};
 
-/* ----------
- * Local data
- * ----------
- */
-NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET;
+/* Variables lives for the backend lifetime */
+static StatsShmemStruct * StatsShmem = NULL;
+static dsa_area *area = NULL;
+static dshash_table *pgStatDBHash = NULL;
 
-static struct sockaddr_storage pgStatAddr;
 
-static time_t last_pgstat_start_time;
-
-static bool pgStatRunningInCollector = false;
+/* parameter for each type of shared hash */
+static const dshash_parameters dsh_dbparams = {
+ sizeof(Oid),
+ SHARED_DBENT_SIZE,
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_tblparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatTabEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_funcparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatFuncEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
 
 /*
  * Structures in which backends store per-table info that's waiting to be
- * sent to the collector.
+ * written to shared memory.
  *
  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
  * for the life of the backend.  Also, we zero out the t_id fields of the
@@ -191,8 +204,8 @@ typedef struct TabStatHashEntry
 static HTAB *pgStatTabHash = NULL;
 
 /*
- * Backends store per-function info that's waiting to be sent to the collector
- * in this hash table (indexed by function OID).
+ * Backends store per-function info that's waiting to be flushed out to shared
+ * memory in this hash table (indexed by function OID).
  */
 static HTAB *pgStatFunctions = NULL;
 
@@ -202,6 +215,68 @@ static HTAB *pgStatFunctions = NULL;
  */
 static bool have_function_stats = false;
 
+/* common header of snapshot entry in backend snapshot hash */
+typedef struct PgStat_snapshot
+{
+ Oid key;
+ bool negative;
+ void   *body; /* end of header part: to keep alignment */
+} PgStat_snapshot;
+
+/* context struct for snapshot_statentry */
+typedef struct pgstat_snapshot_param
+{
+ char   *hash_name; /* name of the snapshot hash */
+ int hash_entsize; /* element size of hash entry */
+ dshash_table_handle dsh_handle; /* dsh handle to attach */
+ const dshash_parameters *dsh_params;/* dshash params */
+ HTAB  **hash; /* points to variable to hold hash */
+ dshash_table  **dshash; /* ditto for dshash */
+} pgstat_snapshot_param;
+
+/*
+ * Backends store various database-wide info that's waiting to be flushed out
+ * to shared memory in these variables.
+ *
+ * checksum_failures is the exception in that it is cluster-wide.
+ */
+typedef struct BackendDBStats
+{
+ int n_conflict_tablespace;
+ int n_conflict_lock;
+ int n_conflict_snapshot;
+ int n_conflict_bufferpin;
+ int n_conflict_startup_deadlock;
+ int n_deadlocks;
+ size_t n_tmpfiles;
+ size_t tmpfilesize;
+ HTAB *checksum_failures;
+} BackendDBStats;
+
+/* Hash entry struct for checksum_failures above */
+typedef struct ChecksumFailureEnt
+{
+ Oid dboid;
+ int count;
+} ChecksumFailureEnt;
+
+static BackendDBStats BeDBStats = {0};
+
+/* macros to check BeDBStats at once */
+#define HAVE_PENDING_CONFLICTS() \
+ (BeDBStats.n_conflict_tablespace > 0 || \
+ BeDBStats.n_conflict_lock > 0 || \
+ BeDBStats.n_conflict_bufferpin > 0 || \
+ BeDBStats.n_conflict_startup_deadlock > 0)
+
+#define HAVE_PENDING_DBSTATS() \
+ (HAVE_PENDING_CONFLICTS() || \
+ BeDBStats.n_deadlocks > 0 || \
+ BeDBStats.n_tmpfiles > 0 || \
+ /* no need to check tmpfilesize */ \
+ BeDBStats.checksum_failures != NULL)
+
+
 /*
  * Tuple insertion/deletion counts for an open transaction can't be propagated
  * into PgStat_TableStatus counters until we know if it is going to commit
@@ -237,11 +312,11 @@ typedef struct TwoPhasePgStatRecord
  bool t_truncated; /* was the relation truncated? */
 } TwoPhasePgStatRecord;
 
-/*
- * Info about current "snapshot" of stats file
- */
+/* Variables for backend status snapshot */
 static MemoryContext pgStatLocalContext = NULL;
-static HTAB *pgStatDBHash = NULL;
+static MemoryContext pgStatSnapshotContext = NULL;
+static HTAB *pgStatLocalHash = NULL;
+static bool clear_snapshot = false;
 
 /* Status for backends including auxiliary */
 static LocalPgBackendStatus *localBackendStatusTable = NULL;
@@ -250,23 +325,35 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL;
 static int localNumBackends = 0;
 
 /*
- * Cluster wide statistics, kept in the stats collector.
- * Contains statistics that are not collected per database
- * or per table.
+ * Struct for context for pgstat_flush_* functions
+ *
+ * To avoid repeated attach/detach of the same dshash, a dshash once attached
+ * is stored in this structure and passed around across multiple calls and
+ * functions.  "generation" here means the value returned by pin_hashes().
  */
-static PgStat_ArchiverStats archiverStats;
-static PgStat_GlobalStats globalStats;
+typedef struct pgstat_flush_stat_context
+{
+ int shgeneration; /* "generation" of shdb_tabhash below */
+ PgStat_StatDBEntry *shdbentry; /* dbentry for shared tables (oid = 0) */
+ dshash_table *shdb_tabhash; /* tabentry dshash of shared tables */
+
+ int mygeneration; /* "generation" of mydb_tabhash below */
+ PgStat_StatDBEntry *mydbentry; /* dbentry for my database */
+ dshash_table *mydb_tabhash; /* tabentry dshash of my database */
+} pgstat_flush_stat_context;
 
 /*
- * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
- * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
- * will write both that DB's data and the shared stats.
+ * Cluster wide statistics.
+ *
+ * Contains statistics that are collected neither on a per-database nor on a
+ * per-table basis.  shared_* points to shared memory and snapshot_* are
+ * backend snapshots.  Their validity is indicated by global_snapshot_is_valid.
  */
-static List *pending_write_requests = NIL;
-
-/* Signal handler flags */
-static volatile bool need_exit = false;
-static volatile bool got_SIGHUP = false;
+static bool global_snapshot_is_valid = false;
+static PgStat_ArchiverStats *shared_archiverStats;
+static PgStat_ArchiverStats snapshot_archiverStats;
+static PgStat_GlobalStats *shared_globalStats;
+static PgStat_GlobalStats snapshot_globalStats;
 
 /*
  * Total time charged to functions so far in the current backend.
@@ -280,35 +367,41 @@ static instr_time total_func_time;
  * Local function forward declarations
  * ----------
  */
-#ifdef EXEC_BACKEND
-static pid_t pgstat_forkexec(void);
-#endif
 
-NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-static void pgstat_exit(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
-static void pgstat_sighup_handler(SIGNAL_ARGS);
-
-static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
+static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, int op,
+ PgStat_TableLookupResult *status);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table,
  Oid tableoid, bool create);
-static void pgstat_write_statsfiles(bool permanent, bool allDbs);
-static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
-static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
-static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
-static void backend_read_statsfile(void);
+static void pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
+static void pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
 static void pgstat_read_current_status(void);
-
-static bool pgstat_write_statsfile_needed(void);
-static bool pgstat_db_requested(Oid databaseid);
-
-static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
-static void pgstat_send_funcstats(void);
+static bool pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry);
+static bool pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_update_tabentry(dshash_table *tabhash,
+   PgStat_TableStatus *stat, bool nowait);
+static void pgstat_update_dbentry(PgStat_StatDBEntry *dbentry,
+  PgStat_TableStatus *stat);
 static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid);
 
+static void pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab);
 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
 
 static void pgstat_setup_memcxt(void);
+static void pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry);
+static HTAB *create_tabstat_hash(void);
+static PgStat_SubXactStatus *get_tabstat_stack_level(int nest_level);
+static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level);
+static PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid);
+static void pgstat_snapshot_global_stats(void);
 
 static const char *pgstat_get_wait_activity(WaitEventActivity w);
 static const char *pgstat_get_wait_client(WaitEventClient w);
@@ -316,481 +409,197 @@ static const char *pgstat_get_wait_ipc(WaitEventIPC w);
 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
 static const char *pgstat_get_wait_io(WaitEventIO w);
 
-static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
-static void pgstat_send(void *msg, int len);
-
-static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
-static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
-static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
-static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
-static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
-static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len);
-static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len);
-static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
-static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
-static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
-static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
-static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
-static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
-static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
-static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
-static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
-static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
-static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+/* ------------------------------------------------------------
+ * Local support functions follow
+ * ------------------------------------------------------------
+ */
+static int pin_hashes(PgStat_StatDBEntry *dbentry);
+static void unpin_hashes(PgStat_StatDBEntry *dbentry, int generation);
+static dshash_table *attach_table_hash(PgStat_StatDBEntry *dbent, int gen);
+static dshash_table *attach_function_hash(PgStat_StatDBEntry *dbent, int gen);
+static void reset_dbentry_counters(PgStat_StatDBEntry *dbentry);
 
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
  */
 
-/* ----------
- * pgstat_init() -
- *
- * Called from postmaster at startup. Create the resources required
- * by the statistics collector process.  If unable to do so, do not
- * fail --- better to let the postmaster start with stats collection
- * disabled.
- * ----------
+/*
+ * StatsShmemSize
+ * Compute space needed for stats collector's shared memory
  */
-void
-pgstat_init(void)
+Size
+StatsShmemSize(void)
 {
- ACCEPT_TYPE_ARG3 alen;
- struct addrinfo *addrs = NULL,
-   *addr,
- hints;
- int ret;
- fd_set rset;
- struct timeval tv;
- char test_byte;
- int sel_res;
- int tries = 0;
-
-#define TESTBYTEVAL ((char) 199)
-
- /*
- * This static assertion verifies that we didn't mess up the calculations
- * involved in selecting maximum payload sizes for our UDP messages.
- * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would
- * be silent performance loss from fragmentation, it seems worth having a
- * compile-time cross-check that we didn't.
- */
- StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE,
- "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE");
-
- /*
- * Create the UDP socket for sending and receiving statistic messages
- */
- hints.ai_flags = AI_PASSIVE;
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_DGRAM;
- hints.ai_protocol = 0;
- hints.ai_addrlen = 0;
- hints.ai_addr = NULL;
- hints.ai_canonname = NULL;
- hints.ai_next = NULL;
- ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
- if (ret || !addrs)
- {
- ereport(LOG,
- (errmsg("could not resolve \"localhost\": %s",
- gai_strerror(ret))));
- goto startup_failed;
- }
-
- /*
- * On some platforms, pg_getaddrinfo_all() may return multiple addresses
- * only one of which will actually work (eg, both IPv6 and IPv4 addresses
- * when kernel will reject IPv6).  Worse, the failure may occur at the
- * bind() or perhaps even connect() stage.  So we must loop through the
- * results till we find a working combination. We will generate LOG
- * messages, but no error, for bogus combinations.
- */
- for (addr = addrs; addr; addr = addr->ai_next)
- {
-#ifdef HAVE_UNIX_SOCKETS
- /* Ignore AF_UNIX sockets, if any are returned. */
- if (addr->ai_family == AF_UNIX)
- continue;
-#endif
-
- if (++tries > 1)
- ereport(LOG,
- (errmsg("trying another address for the statistics collector")));
-
- /*
- * Create the socket.
- */
- if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not create socket for statistics collector: %m")));
- continue;
- }
-
- /*
- * Bind it to a kernel assigned port on localhost and get the assigned
- * port via getsockname().
- */
- if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not bind socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- alen = sizeof(pgStatAddr);
- if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not get address of socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * Connect the socket to its own address.  This saves a few cycles by
- * not having to respecify the target address on every send. This also
- * provides a kernel-level check that only packets from this same
- * address will be received.
- */
- if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not connect socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * Try to send and receive a one-byte test message on the socket. This
- * is to catch situations where the socket can be created but will not
- * actually pass data (for instance, because kernel packet filtering
- * rules prevent it).
- */
- test_byte = TESTBYTEVAL;
-
-retry1:
- if (send(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry1; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not send test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * There could possibly be a little delay before the message can be
- * received.  We arbitrarily allow up to half a second before deciding
- * it's broken.
- */
- for (;;) /* need a loop to handle EINTR */
- {
- FD_ZERO(&rset);
- FD_SET(pgStatSock, &rset);
-
- tv.tv_sec = 0;
- tv.tv_usec = 500000;
- sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
- if (sel_res >= 0 || errno != EINTR)
- break;
- }
- if (sel_res < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("select() failed in statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
- if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
- {
- /*
- * This is the case we actually think is likely, so take pains to
- * give a specific message for it.
- *
- * errno will not be set meaningfully here, so don't use it.
- */
- ereport(LOG,
- (errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("test message did not get through on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- test_byte++; /* just make sure variable is changed */
-
-retry2:
- if (recv(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry2; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not receive test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
- {
- ereport(LOG,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("incorrect test message transmission on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /* If we get here, we have a working socket */
- break;
- }
-
- /* Did we find a working address? */
- if (!addr || pgStatSock == PGINVALID_SOCKET)
- goto startup_failed;
-
- /*
- * Set the socket to non-blocking IO.  This ensures that if the collector
- * falls behind, statistics messages will be discarded; backends won't
- * block waiting to send messages to the collector.
- */
- if (!pg_set_noblock(pgStatSock))
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not set statistics collector socket to nonblocking mode: %m")));
- goto startup_failed;
- }
-
- /*
- * Try to ensure that the socket's receive buffer is at least
- * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose
- * data.  Use of UDP protocol means that we are willing to lose data under
- * heavy load, but we don't want it to happen just because of ridiculously
- * small default buffer sizes (such as 8KB on older Windows versions).
- */
- {
- int old_rcvbuf;
- int new_rcvbuf;
- ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf);
-
- if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &old_rcvbuf, &rcvbufsize) < 0)
- {
- elog(LOG, "getsockopt(SO_RCVBUF) failed: %m");
- /* if we can't get existing size, always try to set it */
- old_rcvbuf = 0;
- }
-
- new_rcvbuf = PGSTAT_MIN_RCVBUF;
- if (old_rcvbuf < new_rcvbuf)
- {
- if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0)
- elog(LOG, "setsockopt(SO_RCVBUF) failed: %m");
- }
- }
-
- pg_freeaddrinfo_all(hints.ai_family, addrs);
-
- return;
-
-startup_failed:
- ereport(LOG,
- (errmsg("disabling statistics collector for lack of working socket")));
-
- if (addrs)
- pg_freeaddrinfo_all(hints.ai_family, addrs);
-
- if (pgStatSock != PGINVALID_SOCKET)
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
-
- /*
- * Adjust GUC variables to suppress useless activity, and for debugging
- * purposes (seeing track_counts off is a clue that we failed here). We
- * use PGC_S_OVERRIDE because there is no point in trying to turn it back
- * on from postgresql.conf without a restart.
- */
- SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
+ return sizeof(StatsShmemStruct);
 }
 
 /*
- * subroutine for pgstat_reset_all
+ * StatsShmemInit - initialize during shared-memory creation
+ */
+void
+StatsShmemInit(void)
+{
+ bool found;
+
+ StatsShmem = (StatsShmemStruct *)
+ ShmemInitStruct("Stats area", StatsShmemSize(),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ Assert(!found);
+
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+ }
+
+ LWLockInitialize(StatsLock, LWTRANCHE_STATS);
+}
+
+/* ----------
+ * pgstat_attach_shared_stats() -
+ *
+ * Attach shared or create stats memory.
+ * ---------
  */
 static void
-pgstat_reset_remove_files(const char *directory)
+pgstat_attach_shared_stats(void)
 {
- DIR   *dir;
- struct dirent *entry;
- char fname[MAXPGPATH * 2];
+ MemoryContext oldcontext;
 
- dir = AllocateDir(directory);
- while ((entry = ReadDir(dir, directory)) != NULL)
+ /*
+ * Don't use dsm when not under postmaster, or when not tracking counts.
+ */
+ if (!pgstat_track_counts || !IsUnderPostmaster)
+ return;
+
+ pgstat_setup_memcxt();
+
+ if (area)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+
+ if (StatsShmem->refcount > 0)
+ StatsShmem->refcount++;
+ else
  {
- int nchars;
- Oid tmp_oid;
+ /* Need to create shared memory area and load saved stats if any. */
+ Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID);
 
- /*
- * Skip directory entries that don't match the file names we write.
- * See get_dbstat_filename for the database-specific pattern.
- */
- if (strncmp(entry->d_name, "global.", 7) == 0)
- nchars = 7;
- else
- {
- nchars = 0;
- (void) sscanf(entry->d_name, "db_%u.%n",
-  &tmp_oid, &nchars);
- if (nchars <= 0)
- continue;
- /* %u allows leading whitespace, so reject that */
- if (strchr("0123456789", entry->d_name[3]) == NULL)
- continue;
- }
+ /* Initialize shared memory area */
+ area = dsa_create(LWTRANCHE_STATS);
+ pgStatDBHash = dshash_create(area, &dsh_dbparams, 0);
 
- if (strcmp(entry->d_name + nchars, "tmp") != 0 &&
- strcmp(entry->d_name + nchars, "stat") != 0)
- continue;
+ StatsShmem->stats_dsa_handle = dsa_get_handle(area);
+ StatsShmem->global_stats =
+ dsa_allocate0(area, sizeof(PgStat_GlobalStats));
+ StatsShmem->archiver_stats =
+ dsa_allocate0(area, sizeof(PgStat_ArchiverStats));
+ StatsShmem->db_hash_handle = dshash_get_hash_table_handle(pgStatDBHash);
 
- snprintf(fname, sizeof(fname), "%s/%s", directory,
- entry->d_name);
- unlink(fname);
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
+
+ /* Load saved data if any. */
+ pgstat_read_statsfiles();
+
+ StatsShmem->refcount = 1;
  }
- FreeDir(dir);
+
+ LWLockRelease(StatsLock);
+
+ /*
+ * If we're not the first process, attach existing shared stats area
+ * outside StatsLock.
+ */
+ if (!area)
+ {
+ /* Shared area already exists. Just attach it. */
+ area = dsa_attach(StatsShmem->stats_dsa_handle);
+ pgStatDBHash = dshash_attach(area, &dsh_dbparams,
+ StatsShmem->db_hash_handle, 0);
+
+ /* Setup local variables */
+ pgStatLocalHash = NULL;
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ dsa_pin_mapping(area);
+ global_snapshot_is_valid = false;
+}
+
+/* ----------
+ * pgstat_detach_shared_stats() -
+ *
+ * Detach shared stats. Write out to file if we're the last process and
+ * instructed to write file.
+ * ----------
+ */
+static void
+pgstat_detach_shared_stats(bool write_stats)
+{
+ if (!area || !IsUnderPostmaster)
+ return;
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+
+ /* write out the shared stats to file if needed */
+ if (--StatsShmem->refcount < 1)
+ {
+ if (write_stats)
+ pgstat_write_statsfiles();
+
+ /* We're the last process. Invalidate the dsa area handle. */
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+ }
+
+ LWLockRelease(StatsLock);
+
+ /*
+ * Detach the area.  It is automatically destroyed when the last process
+ * detaches from it.
+ */
+ dsa_detach(area);
+
+ area = NULL;
+ pgStatDBHash = NULL;
+ shared_globalStats = NULL;
+ shared_archiverStats = NULL;
+ pgStatLocalHash = NULL;
+ global_snapshot_is_valid = false;
 }
 
 /*
  * pgstat_reset_all() -
  *
- * Remove the stats files.  This is currently used only if WAL
- * recovery is needed after a crash.
+ * Remove the stats file.  This is currently used only if WAL recovery is
+ * needed after a crash.
  */
 void
 pgstat_reset_all(void)
 {
- pgstat_reset_remove_files(pgstat_stat_directory);
- pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
-}
+ /* we must have shared stats attached */
+ Assert (StatsShmem->stats_dsa_handle != DSM_HANDLE_INVALID);
 
-#ifdef EXEC_BACKEND
-
-/*
- * pgstat_forkexec() -
- *
- * Format up the arglist for, then fork and exec, statistics collector process
- */
-static pid_t
-pgstat_forkexec(void)
-{
- char   *av[10];
- int ac = 0;
-
- av[ac++] = "postgres";
- av[ac++] = "--forkcol";
- av[ac++] = NULL; /* filled in by postmaster_forkexec */
-
- av[ac] = NULL;
- Assert(ac < lengthof(av));
-
- return postmaster_forkexec(ac, av);
-}
-#endif /* EXEC_BACKEND */
-
-
-/*
- * pgstat_start() -
- *
- * Called from postmaster at startup or after an existing collector
- * died.  Attempt to fire up a fresh statistics collector.
- *
- * Returns PID of child process, or 0 if fail.
- *
- * Note: if fail, we will be called again from the postmaster main loop.
- */
-int
-pgstat_start(void)
-{
- time_t curtime;
- pid_t pgStatPid;
+ /* Startup must be the only user of shared stats */
+ Assert (StatsShmem->refcount == 1);
 
  /*
- * Check that the socket is there, else pgstat_init failed and we can do
- * nothing useful.
+ * We could directly remove files and recreate the shared memory area. But
+ * detach then attach for simplicity.
  */
- if (pgStatSock == PGINVALID_SOCKET)
- return 0;
-
- /*
- * Do nothing if too soon since last collector start.  This is a safety
- * valve to protect against continuous respawn attempts if the collector
- * is dying immediately at launch.  Note that since we will be re-called
- * from the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgstat_start_time) <
- (unsigned int) PGSTAT_RESTART_INTERVAL)
- return 0;
- last_pgstat_start_time = curtime;
-
- /*
- * Okay, fork off the collector.
- */
-#ifdef EXEC_BACKEND
- switch ((pgStatPid = pgstat_forkexec()))
-#else
- switch ((pgStatPid = fork_process()))
-#endif
- {
- case -1:
- ereport(LOG,
- (errmsg("could not fork statistics collector: %m")));
- return 0;
-
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
-
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
-
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
-
- PgstatCollectorMain(0, NULL);
- break;
-#endif
-
- default:
- return (int) pgStatPid;
- }
-
- /* shouldn't get here */
- return 0;
-}
-
-void
-allow_immediate_pgstat_restart(void)
-{
- last_pgstat_start_time = 0;
+ pgstat_detach_shared_stats(false); /* Don't write */
+ pgstat_attach_shared_stats();
 }
 
 /* ------------------------------------------------------------
@@ -798,75 +607,293 @@ allow_immediate_pgstat_restart(void)
  *------------------------------------------------------------
  */
 
-
 /* ----------
  * pgstat_report_stat() -
  *
  * Must be called by processes that performs DML: tcop/postgres.c, logical
- * receiver processes, SPI worker, etc. to send the so far collected
- * per-table and function usage statistics to the collector.  Note that this
- * is called only when not within a transaction, so it is fair to use
+ * receiver processes, SPI worker, etc. to apply the so far collected
+ * per-table and function usage statistics to the shared statistics hashes.
+ *
+ *  Updates are applied no more frequently than the interval of
+ *  PGSTAT_STAT_MIN_INTERVAL milliseconds. They are also postponed on lock
+ *  failure if force is false and no pending updates are older than
+ *  PGSTAT_STAT_MAX_INTERVAL milliseconds. Postponed updates are retried in
+ *  succeeding calls of this function.
+ *
+ * Returns the time in milliseconds until the next time updates can be
+ * applied, if there are no updates held for more than
+ * PGSTAT_STAT_MIN_INTERVAL milliseconds.
+ *
+ * Note that this is called only out of a transaction, so it is fine to use
  * transaction stop time as an approximation of current time.
- * ----------
+ * ----------
  */
-void
+long
 pgstat_report_stat(bool force)
 {
- /* we assume this inits to all zeroes: */
- static const PgStat_TableCounts all_zeroes;
- static TimestampTz last_report = 0;
-
+ static TimestampTz next_flush = 0;
+ static TimestampTz pending_since = 0;
  TimestampTz now;
- PgStat_MsgTabstat regular_msg;
- PgStat_MsgTabstat shared_msg;
- TabStatusArray *tsa;
- int i;
+ pgstat_flush_stat_context cxt = {0};
+ bool pending_stats = false;
+ long elapsed;
+ long secs;
+ int usecs;
 
  /* Don't expend a clock check if nothing to do */
- if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
- pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
- !have_function_stats)
- return;
+ if (area == NULL ||
+ ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
+ pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
+ !HAVE_PENDING_DBSTATS()  && !have_function_stats))
+ return 0;
+
+ now = GetCurrentTransactionStopTimestamp();
+
+ if (!force)
+ {
+ /*
+ * Don't flush stats unless it's the time.  Returns time to wait in
+ * milliseconds.
+ */
+ if (now < next_flush)
+ {
+ /* Record the oldest pending update if not yet. */
+ if (pending_since == 0)
+ pending_since = now;
+
+ /* now < next_flush here */
+ return (next_flush - now) / 1000;
+ }
+
+ /*
+ * Don't keep pending updates longer than PGSTAT_STAT_MAX_INTERVAL.
+ */
+ if (pending_since > 0)
+ {
+ TimestampDifference(pending_since, now, &secs, &usecs);
+ elapsed = secs * 1000 + usecs /1000;
+
+ if(elapsed > PGSTAT_STAT_MAX_INTERVAL)
+ force = true;
+ }
+ }
+
+ /* Flush out table stats */
+ if (pgStatTabList != NULL && !pgstat_flush_stat(&cxt, !force))
+ pending_stats = true;
+
+ /* Flush out function stats */
+ if (pgStatFunctions != NULL && !pgstat_flush_funcstats(&cxt, !force))
+ pending_stats = true;
+
+ /* Flush out database-wide stats */
+ if (HAVE_PENDING_DBSTATS())
+ {
+ if (!pgstat_flush_dbstats(&cxt, !force))
+ pending_stats = true;
+ }
+
+ /* Unpin dbentry if pinned */
+ if (cxt.mydb_tabhash)
+ {
+ dshash_detach(cxt.mydb_tabhash);
+ unpin_hashes(cxt.mydbentry, cxt.mygeneration);
+ cxt.mydb_tabhash = NULL;
+ cxt.mydbentry = NULL;
+ }
+
+ /* Publish the last flush time */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ if (shared_globalStats->stats_timestamp < now)
+ shared_globalStats->stats_timestamp = now;
+ LWLockRelease(StatsLock);
+
+ /* Record how long we have been keeping pending updates. */
+ if (pending_stats)
+ {
+ /* Preserve the first value */
+ if (pending_since == 0)
+ pending_since = now;
+
+ /*
+ * It's possible that the retry interval is longer than the limit by
+ * PGSTAT_STAT_MAX_INTERVAL. We don't bother that since it's not so
+ * much.
+ */
+ return PGSTAT_STAT_RETRY_INTERVAL;
+ }
+
+ /* Set the next time to update stats */
+ next_flush = now + PGSTAT_STAT_MIN_INTERVAL * 1000;
+ pending_since = 0;
+
+ return 0;
+}
+
+/*
+ * snapshot_statentry() - Common routine for functions
+ * pgstat_fetch_stat_*entry()
+ *
+ *  Returns the pointer to a snapshot of a shared entry for the key or NULL if
+ *  not found. Returned snapshots are stable during the current transaction or
+ *  until pgstat_clear_snapshot() is called.
+ *
+ *  The snapshots are stored in a hash, pointer to which is stored in the
+ *  *HTAB variable pointed by cxt->hash. If not created yet, it is created
+ *  using hash_name, hash_entsize in cxt.
+ *
+ *  cxt->dshash points to dshash_table for dbstat entries. If not yet
+ *  attached, it is attached using cxt->dsh_handle.
+ */
+static void *
+snapshot_statentry(pgstat_snapshot_param *cxt, Oid key)
+{
+ PgStat_snapshot *lentry = NULL;
+ size_t keysize = cxt->dsh_params->key_size;
+ size_t dsh_entrysize = cxt->dsh_params->entry_size;
+ bool found;
 
  /*
- * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
- * msec since we last sent one, or the caller wants to force stats out.
+ * We don't want overly frequent updates of the stats snapshot. Keep it
+ * for at least PGSTAT_STAT_MIN_INTERVAL ms; requests arriving within that
+ * interval are simply ignored, not postponed.
  */
- now = GetCurrentTransactionStopTimestamp();
- if (!force &&
- !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
- return;
- last_report = now;
+ if (clear_snapshot)
+ {
+ clear_snapshot = false;
+
+ if (pgStatSnapshotContext &&
+ snapshot_globalStats.stats_timestamp <
+ GetCurrentStatementStartTimestamp() -
+ PGSTAT_STAT_MIN_INTERVAL * 1000)
+ {
+ MemoryContextReset(pgStatSnapshotContext);
+
+ /* Reset variables */
+ global_snapshot_is_valid = false;
+ pgStatSnapshotContext = NULL;
+ pgStatLocalHash = NULL;
+
+ pgstat_setup_memcxt();
+ }
+ }
+
+ /*
+ * Create new hash, with rather arbitrary initial number of entries since
+ * we don't know how this hash will grow.
+ */
+ if (!*cxt->hash)
+ {
+ HASHCTL ctl;
+
+ /*
+ * Create the hash in the stats context
+ *
+ * The entry is prepended by common header part represented by
+ * PgStat_snapshot.
+ */
+
+ ctl.keysize = keysize;
+ ctl.entrysize = offsetof(PgStat_snapshot, body) + cxt->hash_entsize;
+ ctl.hcxt = pgStatSnapshotContext;
+ *cxt->hash = hash_create(cxt->hash_name, 32, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ }
+
+ lentry = hash_search(*cxt->hash, &key, HASH_ENTER, &found);
+
+ /*
+ * Refer to the shared hash if not found in the local hash. We return
+ * up-to-date entries outside a transaction, so do the same even if the
+ * snapshot is found.
+ */
+ if (!found || !IsTransactionState())
+ {
+ void *sentry;
+
+ /* attach shared hash if not given, leave it alone for later use */
+ if (!*cxt->dshash)
+ {
+ MemoryContext oldcxt;
+
+ Assert (cxt->dsh_handle != DSM_HANDLE_INVALID);
+ oldcxt = MemoryContextSwitchTo(pgStatSnapshotContext);
+ *cxt->dshash =
+ dshash_attach(area, cxt->dsh_params, cxt->dsh_handle, NULL);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ sentry = dshash_find(*cxt->dshash, &key, false);
+
+ if (sentry)
+ {
+ /*
+ * In transaction state, it is obvious that we should create local
+ * cache entries for consistency. If we are not, we return an
+ * up-to-date entry. Having said that, we need a local copy since
+ * dshash entry must be released immediately. We share the same
+ * local hash entry for the purpose.
+ */
+ memcpy(&lentry->body, sentry, dsh_entrysize);
+ dshash_release_lock(*cxt->dshash, sentry);
+
+ /* then zero out the local additional space if any */
+ if (dsh_entrysize < cxt->hash_entsize)
+ MemSet((char *)&lentry->body + dsh_entrysize, 0,
+   cxt->hash_entsize - dsh_entrysize);
+ }
+
+ lentry->negative = !sentry;
+ }
+
+ if (lentry->negative)
+ return NULL;
+
+ return &lentry->body;
+}
+
+/*
+ * pgstat_flush_stat: Flushes table stats out to shared statistics.
+ *
+ *  If nowait is true, returns false if required lock was not acquired
+ *  immediately. In that case, unapplied table stats updates are left alone in
+ *  TabStatusArray to wait for the next chance. cxt holds some dshash related
+ *  values that we want to carry around while updating shared stats.
+ *
+ *  Returns true if all stats info are flushed. Caller must detach dshashes
+ *  stored in cxt after use.
+ */
+static bool
+pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait)
+{
+ static const PgStat_TableCounts all_zeroes;
+ TabStatusArray *tsa;
+ HTAB   *new_tsa_hash = NULL;
+ TabStatusArray *dest_tsa = pgStatTabList;
+ int dest_elem = 0;
+ int i;
+
+ /* nothing to do, just return  */
+ if (pgStatTabHash == NULL)
+ return true;
 
  /*
  * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry
- * entries it points to.  (Should we fail partway through the loop below,
- * it's okay to have removed the hashtable already --- the only
- * consequence is we'd get multiple entries for the same table in the
- * pgStatTabList, and that's safe.)
+ * entries it points to.
  */
- if (pgStatTabHash)
- hash_destroy(pgStatTabHash);
+ hash_destroy(pgStatTabHash);
  pgStatTabHash = NULL;
 
  /*
  * Scan through the TabStatusArray struct(s) to find tables that actually
- * have counts, and build messages to send.  We have to separate shared
- * relations from regular ones because the databaseid field in the message
- * header has to depend on that.
+ * have counts, and try flushing them out to shared stats. We may fail on
+ * some entries in the array; such entries are packed at the beginning of
+ * the array.
  */
- regular_msg.m_databaseid = MyDatabaseId;
- shared_msg.m_databaseid = InvalidOid;
- regular_msg.m_nentries = 0;
- shared_msg.m_nentries = 0;
-
  for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
  {
  for (i = 0; i < tsa->tsa_used; i++)
  {
  PgStat_TableStatus *entry = &tsa->tsa_entries[i];
- PgStat_MsgTabstat *this_msg;
- PgStat_TableEntry *this_ent;
 
  /* Shouldn't have any pending transaction-dependent counts */
  Assert(entry->trans == NULL);
@@ -879,178 +906,352 @@ pgstat_report_stat(bool force)
    sizeof(PgStat_TableCounts)) == 0)
  continue;
 
- /*
- * OK, insert data into the appropriate message, and send if full.
- */
- this_msg = entry->t_shared ? &shared_msg : &regular_msg;
- this_ent = &this_msg->m_entry[this_msg->m_nentries];
- this_ent->t_id = entry->t_id;
- memcpy(&this_ent->t_counts, &entry->t_counts,
-   sizeof(PgStat_TableCounts));
- if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+ /* try to apply the tab stats */
+ if (!pgstat_flush_tabstat(cxt, nowait, entry))
  {
- pgstat_send_tabstat(this_msg);
- this_msg->m_nentries = 0;
+ /*
+ * Failed. Move it to the beginning in TabStatusArray and
+ * leave it.
+ */
+ TabStatHashEntry *hash_entry;
+ bool found;
+
+ if (new_tsa_hash == NULL)
+ new_tsa_hash = create_tabstat_hash();
+
+ /* Create hash entry for this entry */
+ hash_entry = hash_search(new_tsa_hash, &entry->t_id,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /*
+ * Move insertion pointer to the next segment if the segment
+ * is filled up.
+ */
+ if (dest_elem >= TABSTAT_QUANTUM)
+ {
+ Assert(dest_tsa->tsa_next != NULL);
+ dest_tsa = dest_tsa->tsa_next;
+ dest_elem = 0;
+ }
+
+ /*
+ * Pack the entry at the beginning of the array. Do nothing if
+ * no need to be moved.
+ */
+ if (tsa != dest_tsa || i != dest_elem)
+ {
+ PgStat_TableStatus *new_entry;
+ new_entry = &dest_tsa->tsa_entries[dest_elem];
+ *new_entry = *entry;
+
+ /* use new_entry as entry hereafter */
+ entry = new_entry;
+ }
+
+ hash_entry->tsa_entry = entry;
+ dest_elem++;
  }
  }
- /* zero out PgStat_TableStatus structs after use */
- MemSet(tsa->tsa_entries, 0,
-   tsa->tsa_used * sizeof(PgStat_TableStatus));
- tsa->tsa_used = 0;
  }
 
- /*
- * Send partial messages.  Make sure that any pending xact commit/abort
- * gets counted, even if there are no table stats to send.
- */
- if (regular_msg.m_nentries > 0 ||
- pgStatXactCommit > 0 || pgStatXactRollback > 0)
- pgstat_send_tabstat(&regular_msg);
- if (shared_msg.m_nentries > 0)
- pgstat_send_tabstat(&shared_msg);
+ /* zero out unused area of TableStatus */
+ dest_tsa->tsa_used = dest_elem;
+ MemSet(&dest_tsa->tsa_entries[dest_elem], 0,
+   (TABSTAT_QUANTUM - dest_elem) * sizeof(PgStat_TableStatus));
+ while (dest_tsa->tsa_next)
+ {
+ dest_tsa = dest_tsa->tsa_next;
+ MemSet(dest_tsa->tsa_entries, 0,
+   dest_tsa->tsa_used * sizeof(PgStat_TableStatus));
+ dest_tsa->tsa_used = 0;
+ }
 
- /* Now, send function statistics */
- pgstat_send_funcstats();
+ /* and set the new TabStatusArray hash if any */
+ pgStatTabHash = new_tsa_hash;
+
+ /*
+ * We no longer need shared database and table entries, but that for my
+ * database may be used later.
+ */
+ if (cxt->shdb_tabhash)
+ {
+ dshash_detach(cxt->shdb_tabhash);
+ unpin_hashes(cxt->shdbentry, cxt->shgeneration);
+ cxt->shdb_tabhash = NULL;
+ cxt->shdbentry = NULL;
+ }
+
+ return pgStatTabHash == NULL;
 }
 
-/*
- * Subroutine for pgstat_report_stat: finish and send a tabstat message
+/* -------
+ * Subroutines for pgstat_flush_stat.
+ * -------
  */
-static void
-pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+/*
+ * pgstat_flush_tabstat: Flushes a table stats entry.
+ *
+ *  If nowait is true, returns false on lock failure.  Dshashes for table and
+ *  function stats are kept attached in ctx. The caller must detach them after
+ *  use.
+ *
+ *  Returns true if the entry is flushed out.
+ */
+bool
+pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry)
 {
- int n;
- int len;
+ Oid dboid = entry->t_shared ? InvalidOid : MyDatabaseId;
+ int table_mode = PGSTAT_EXCLUSIVE;
+ bool updated = false;
+ dshash_table *tabhash;
+ PgStat_StatDBEntry *dbent;
+ int generation;
 
- /* It's unlikely we'd get here with no socket, but maybe not impossible */
- if (pgStatSock == PGINVALID_SOCKET)
- return;
+ if (nowait)
+ table_mode |= PGSTAT_NOWAIT;
 
- /*
- * Report and reset accumulated xact commit/rollback and I/O timings
- * whenever we send a normal tabstat message
- */
- if (OidIsValid(tsmsg->m_databaseid))
+ /* Attach required table hash if not yet. */
+ if ((entry->t_shared ? cxt->shdb_tabhash : cxt->mydb_tabhash) == NULL)
  {
- tsmsg->m_xact_commit = pgStatXactCommit;
- tsmsg->m_xact_rollback = pgStatXactRollback;
- tsmsg->m_block_read_time = pgStatBlockReadTime;
- tsmsg->m_block_write_time = pgStatBlockWriteTime;
- pgStatXactCommit = 0;
- pgStatXactRollback = 0;
- pgStatBlockReadTime = 0;
- pgStatBlockWriteTime = 0;
+ /*
+ *  Return if we don't have a corresponding dbentry; it may have been
+ *  removed.
+ */
+ dbent = pgstat_get_db_entry(dboid, table_mode, NULL);
+ if (!dbent)
+ return false;
+
+ /*
+ * We don't hold lock on the dbentry since it cannot be dropped while
+ * we are working on it.
+ */
+ generation = pin_hashes(dbent);
+ tabhash = attach_table_hash(dbent, generation);
+
+ if (entry->t_shared)
+ {
+ cxt->shgeneration = generation;
+ cxt->shdbentry = dbent;
+ cxt->shdb_tabhash = tabhash;
+ }
+ else
+ {
+ cxt->mygeneration = generation;
+ cxt->mydbentry = dbent;
+ cxt->mydb_tabhash = tabhash;
+
+ /*
+ * We come here once per database. Take the chance to update
+ * database-wide stats
+ */
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
+ dbent->n_xact_commit += pgStatXactCommit;
+ dbent->n_xact_rollback += pgStatXactRollback;
+ dbent->n_block_read_time += pgStatBlockReadTime;
+ dbent->n_block_write_time += pgStatBlockWriteTime;
+ LWLockRelease(&dbent->lock);
+ pgStatXactCommit = 0;
+ pgStatXactRollback = 0;
+ pgStatBlockReadTime = 0;
+ pgStatBlockWriteTime = 0;
+ }
+ }
+ else if (entry->t_shared)
+ {
+ dbent = cxt->shdbentry;
+ tabhash = cxt->shdb_tabhash;
  }
  else
  {
- tsmsg->m_xact_commit = 0;
- tsmsg->m_xact_rollback = 0;
- tsmsg->m_block_read_time = 0;
- tsmsg->m_block_write_time = 0;
+ dbent = cxt->mydbentry;
+ tabhash = cxt->mydb_tabhash;
  }
 
- n = tsmsg->m_nentries;
- len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
- n * sizeof(PgStat_TableEntry);
 
- pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
- pgstat_send(tsmsg, len);
+ /*
+ * Local table stats should be applied to both dbentry and tabentry at
+ * once. Update dbentry only if we could update tabentry.
+ */
+ if (pgstat_update_tabentry(tabhash, entry, nowait))
+ {
+ pgstat_update_dbentry(dbent, entry);
+ updated = true;
+ }
+
+ return updated;
 }
 
 /*
- * Subroutine for pgstat_report_stat: populate and send a function stat message
+ * pgstat_flush_funcstats: Flushes function stats.
+ *
+ *  If nowait is true, returns false on lock failure. Unapplied local hash
+ *  entries are left alone.
+ *
+ *  Returns true if all entries are flushed out.
  */
-static void
-pgstat_send_funcstats(void)
+static bool
+pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait)
 {
  /* we assume this inits to all zeroes: */
  static const PgStat_FunctionCounts all_zeroes;
-
- PgStat_MsgFuncstat msg;
- PgStat_BackendFunctionEntry *entry;
+ dshash_table   *funchash;
  HASH_SEQ_STATUS fstat;
+ PgStat_BackendFunctionEntry *bestat;
 
+ /* nothing to do, just return  */
  if (pgStatFunctions == NULL)
- return;
+ return true;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_nentries = 0;
-
- hash_seq_init(&fstat, pgStatFunctions);
- while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
+ /* get dbentry into cxt if not yet.  */
+ if (cxt->mydbentry == NULL)
  {
- PgStat_FunctionEntry *m_ent;
+ int op = PGSTAT_EXCLUSIVE;
 
- /* Skip it if no counts accumulated since last time */
- if (memcmp(&entry->f_counts, &all_zeroes,
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
+
+ if (cxt->mydbentry == NULL)
+ return false;
+
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ funchash = attach_function_hash(cxt->mydbentry, cxt->mygeneration);
+ if (funchash == NULL)
+ return false;
+
+ have_function_stats = false;
+
+ /*
+ * Scan through the pgStatFunctions to find functions that actually have
+ * counts, and try flushing it out to shared stats.
+ */
+ hash_seq_init(&fstat, pgStatFunctions);
+ while ((bestat = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
+ {
+ bool found;
+ PgStat_StatFuncEntry *funcent = NULL;
+
+ /* Skip it if no counts accumulated for it so far */
+ if (memcmp(&bestat->f_counts, &all_zeroes,
    sizeof(PgStat_FunctionCounts)) == 0)
  continue;
 
- /* need to convert format of time accumulators */
- m_ent = &msg.m_entry[msg.m_nentries];
- m_ent->f_id = entry->f_id;
- m_ent->f_numcalls = entry->f_counts.f_numcalls;
- m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time);
- m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time);
+ funcent = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert_extended(funchash, (void *) &(bestat->f_id),
+   &found, nowait);
 
- if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
+ /*
+ * We couldn't acquire lock on the required entry. Leave the local
+ * entry alone.
+ */
+ if (!funcent)
  {
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
- msg.m_nentries = 0;
+ have_function_stats = true;
+ continue;
  }
 
- /* reset the entry's counts */
- MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
+ /* Initialize if it's new, or add to it. */
+ if (!found)
+ {
+ funcent->functionid = bestat->f_id;
+ funcent->f_numcalls = bestat->f_counts.f_numcalls;
+ funcent->f_total_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ else
+ {
+ funcent->f_numcalls += bestat->f_counts.f_numcalls;
+ funcent->f_total_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ dshash_release_lock(funchash, funcent);
+
+ /* reset used counts */
+ MemSet(&bestat->f_counts, 0, sizeof(PgStat_FunctionCounts));
  }
 
- if (msg.m_nentries > 0)
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
-
- have_function_stats = false;
+ return !have_function_stats;
 }
 
+/*
+ * pgstat_flush_dbstats: Flushes out miscellaneous database stats.
+ *
+ *  If nowait is true, returns with false on lock failure on dbentry.
+ *
+ *  Returns true if all stats are flushed out.
+ */
+static bool
+pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait)
+{
+ /* get dbentry if not yet.  */
+ if (cxt->mydbentry == NULL)
+ {
+ int op = PGSTAT_EXCLUSIVE;
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
+
+ /* return if lock failed. */
+ if (cxt->mydbentry == NULL)
+ return false;
+
+ /* we use this generation of table /function stats in this turn */
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ LWLockAcquire(&cxt->mydbentry->lock, LW_EXCLUSIVE);
+ if (HAVE_PENDING_CONFLICTS())
+ pgstat_flush_recovery_conflict(cxt->mydbentry);
+ if (BeDBStats.n_deadlocks != 0)
+ pgstat_flush_deadlock(cxt->mydbentry);
+ if (BeDBStats.n_tmpfiles != 0)
+ pgstat_flush_tempfile(cxt->mydbentry);
+ if (BeDBStats.checksum_failures != NULL)
+ pgstat_flush_checksum_failure(cxt->mydbentry);
+ LWLockRelease(&cxt->mydbentry->lock);
+
+ return true;
+}
 
 /* ----------
  * pgstat_vacuum_stat() -
  *
- * Will tell the collector about objects he can get rid of.
+ * Remove objects we can get rid of.
  * ----------
  */
 void
 pgstat_vacuum_stat(void)
 {
- HTAB   *htab;
- PgStat_MsgTabpurge msg;
- PgStat_MsgFuncpurge f_msg;
- HASH_SEQ_STATUS hstat;
+ HTAB   *oidtab;
+ dshash_seq_status dshstat;
  PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- PgStat_StatFuncEntry *funcentry;
- int len;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* we don't collect stats under standalone mode */
+ if (!IsUnderPostmaster)
  return;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
-
  /*
  * Read pg_database and make a list of OIDs of all existing databases
  */
- htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
+ oidtab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
 
  /*
- * Search the database hash table for dead databases and tell the
- * collector to drop them.
+ * Search the database hash table for dead databases and drop them
+ * from the hash.
  */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+
+ dshash_seq_init(&dshstat, pgStatDBHash, false, true);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&dshstat)) != NULL)
  {
  Oid dbid = dbentry->databaseid;
 
@@ -1058,137 +1259,43 @@ pgstat_vacuum_stat(void)
 
  /* the DB entry for shared tables (with InvalidOid) is never dropped */
  if (OidIsValid(dbid) &&
- hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
+ hash_search(oidtab, (void *) &dbid, HASH_FIND, NULL) == NULL)
  pgstat_drop_database(dbid);
  }
 
  /* Clean up */
- hash_destroy(htab);
+ hash_destroy(oidtab);
 
  /*
  * Lookup our own database entry; if not found, nothing more to do.
  */
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &MyDatabaseId,
- HASH_FIND, NULL);
- if (dbentry == NULL || dbentry->tables == NULL)
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
  return;
 
  /*
  * Similarly to above, make a list of all known relations in this DB.
  */
- htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
-
- /*
- * Initialize our messages table counter to zero
- */
- msg.m_nentries = 0;
+ oidtab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
 
  /*
  * Check for all tables listed in stats hashtable if they still exist.
+ * Stats cache is useless here so directly search the shared hash.
  */
- hash_seq_init(&hstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid tabid = tabentry->tableid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this table's Oid to the message
- */
- msg.m_tableid[msg.m_nentries++] = tabid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
- {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
-
- msg.m_nentries = 0;
- }
- }
+ pgstat_remove_useless_entries(dbentry->tables, &dsh_tblparams, oidtab);
 
  /*
- * Send the rest
+ * Repeat the above but we needn't bother in the common case where no
+ * function stats are being collected.
  */
- if (msg.m_nentries > 0)
+ if (dbentry->functions != DSM_HANDLE_INVALID)
  {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
+ oidtab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
- }
-
- /* Clean up */
- hash_destroy(htab);
-
- /*
- * Now repeat the above steps for functions.  However, we needn't bother
- * in the common case where no function stats are being collected.
- */
- if (dbentry->functions != NULL &&
- hash_get_num_entries(dbentry->functions) > 0)
- {
- htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
-
- pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
- f_msg.m_databaseid = MyDatabaseId;
- f_msg.m_nentries = 0;
-
- hash_seq_init(&hstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid funcid = funcentry->functionid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this function's Oid to the message
- */
- f_msg.m_functionid[f_msg.m_nentries++] = funcid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
-
- f_msg.m_nentries = 0;
- }
- }
-
- /*
- * Send the rest
- */
- if (f_msg.m_nentries > 0)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
- }
-
- hash_destroy(htab);
+ pgstat_remove_useless_entries(dbentry->functions, &dsh_funcparams,
+  oidtab);
  }
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
 
@@ -1242,66 +1349,99 @@ pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid)
  return htab;
 }
 
+/*
+ * pgstat_remove_useless_entries - Remove useless entries from per
+ * table/function dshashes.
+ *
+ *  Scan the dshash specified by dshhandle removing entries that are not in
+ *  oidtab. oidtab is destroyed before returning.
+ */
+void
+pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab)
+{
+ dshash_table *dshtable;
+ dshash_seq_status dshstat;
+ void *ent;
+
+ dshtable = dshash_attach(area, dshparams, dshhandle, 0);
+ dshash_seq_init(&dshstat, dshtable, false, true);
+
+ while ((ent = dshash_seq_next(&dshstat)) != NULL)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* The first member of the entries must be Oid */
+ if (hash_search(oidtab, ent, HASH_FIND, NULL) != NULL)
+ continue;
+
+ /* Not there, so purge this entry */
+ dshash_delete_entry(dshtable, ent);
+ }
+ dshash_detach(dshtable);
+ hash_destroy(oidtab);
+}
 
 /* ----------
  * pgstat_drop_database() -
  *
- * Tell the collector that we just dropped a database.
- * (If the message gets lost, we will still clean the dead DB eventually
- * via future invocations of pgstat_vacuum_stat().)
+ * Remove entry for the database that we just dropped.
+ *
+ * If some stats are flushed after this, this entry will be re-created but we
+ * will still clean the dead DB eventually via future invocations of
+ * pgstat_vacuum_stat().
  * ----------
  */
 void
 pgstat_drop_database(Oid databaseid)
 {
- PgStat_MsgDropdb msg;
+ PgStat_StatDBEntry *dbentry;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ Assert (OidIsValid(databaseid));
+
+ if (!IsUnderPostmaster || !pgStatDBHash)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
- msg.m_databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
+ /*
+ * Lookup the database in the hashtable with exclusive lock.
+ */
+ dbentry = pgstat_get_db_entry(databaseid, PGSTAT_EXCLUSIVE, NULL);
+
+ /*
+ * If found, remove it.
+ */
+ if (dbentry)
+ {
+ /* LWLock is needed to rewrite */
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* No one is using tables/functions in this dbentry */
+ Assert(dbentry->refcnt == 0);
+
+ /* Remove table/function stats dshash first. */
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ }
+ LWLockRelease(&dbentry->lock);
+
+ dshash_delete_entry(pgStatDBHash, (void *)dbentry);
+ }
 }
 
-
-/* ----------
- * pgstat_drop_relation() -
- *
- * Tell the collector that we just dropped a relation.
- * (If the message gets lost, we will still clean the dead entry eventually
- * via future invocations of pgstat_vacuum_stat().)
- *
- * Currently not used for lack of any good place to call it; we rely
- * entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
- * ----------
- */
-#ifdef NOT_USED
-void
-pgstat_drop_relation(Oid relid)
-{
- PgStat_MsgTabpurge msg;
- int len;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
- msg.m_tableid[0] = relid;
- msg.m_nentries = 1;
-
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
-}
-#endif /* NOT_USED */
-
-
 /* ----------
  * pgstat_reset_counters() -
  *
- * Tell the statistics collector to reset counters for our database.
+ * Reset counters for our database.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1310,20 +1450,32 @@ pgstat_drop_relation(Oid relid)
 void
 pgstat_reset_counters(void)
 {
- PgStat_MsgResetcounter msg;
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ if (!pgStatDBHash)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
+ /*
+ * Lookup the database in the hashtable.  Nothing to do if not there.
+ */
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, &status);
+
+ if (!dbentry)
+ return;
+
+ /* This database is active, safe to release the lock immediately. */
+ dshash_release_lock(pgStatDBHash, dbentry);
+
+ /* Reset database-level stats. */
+ reset_dbentry_counters(dbentry);
+
 }
 
 /* ----------
  * pgstat_reset_shared_counters() -
  *
- * Tell the statistics collector to reset cluster-wide shared counters.
+ * Reset cluster-wide shared counters.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1332,29 +1484,37 @@ pgstat_reset_counters(void)
 void
 pgstat_reset_shared_counters(const char *target)
 {
- PgStat_MsgResetsharedcounter msg;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
+ /* Reset the archiver statistics for the cluster. */
  if (strcmp(target, "archiver") == 0)
- msg.m_resettarget = RESET_ARCHIVER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+ shared_archiverStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ /* Reset the bgwriter statistics for the cluster. */
  else if (strcmp(target, "bgwriter") == 0)
- msg.m_resettarget = RESET_BGWRITER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
+ shared_globalStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
  else
  ereport(ERROR,
  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  errmsg("unrecognized reset target: \"%s\"", target),
  errhint("Target must be \"archiver\" or \"bgwriter\".")));
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
- pgstat_send(&msg, sizeof(msg));
 }
 
 /* ----------
  * pgstat_reset_single_counter() -
  *
- * Tell the statistics collector to reset a single counter.
+ * Reset a single counter.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1363,17 +1523,42 @@ pgstat_reset_shared_counters(const char *target)
 void
 pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 {
- PgStat_MsgResetsinglecounter msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
+
+ if (!dbentry)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER);
- msg.m_databaseid = MyDatabaseId;
- msg.m_resettype = type;
- msg.m_objectid = objoid;
+ /* This database is active, safe to release the lock immediately. */
+ generation = pin_hashes(dbentry);
 
- pgstat_send(&msg, sizeof(msg));
+ /* Set the reset timestamp for the whole database */
+ ts = GetCurrentTimestamp();
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->stat_reset_timestamp = ts;
+ LWLockRelease(&dbentry->lock);
+
+ /* Remove object if it exists, ignore if not */
+ if (type == RESET_TABLE)
+ {
+ dshash_table *t = attach_table_hash(dbentry, generation);
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+
+ if (type == RESET_FUNCTION)
+ {
+ dshash_table *t = attach_function_hash(dbentry, generation);
+ if (t)
+ {
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+ }
+ unpin_hashes(dbentry, generation);
 }
 
 /* ----------
@@ -1387,48 +1572,81 @@ pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 void
 pgstat_report_autovac(Oid dboid)
 {
- PgStat_MsgAutovacStart msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
- msg.m_databaseid = dboid;
- msg.m_start_time = GetCurrentTimestamp();
+ /*
+ * Store the last autovacuum time in the database's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ dshash_release_lock(pgStatDBHash, dbentry);
 
- pgstat_send(&msg, sizeof(msg));
+ ts = GetCurrentTimestamp();
+
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->last_autovac_time = ts;
+ LWLockRelease(&dbentry->lock);
 }
 
 
 /* ---------
  * pgstat_report_vacuum() -
  *
- * Tell the collector about the table we just vacuumed.
+ * Report about the table we just vacuumed.
  * ---------
  */
 void
 pgstat_report_vacuum(Oid tableoid, bool shared,
  PgStat_Counter livetuples, PgStat_Counter deadtuples)
 {
- PgStat_MsgVacuum msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
- msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = tableoid;
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_vacuumtime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = shared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hash table entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+
+ tabentry = pgstat_get_tab_entry(table, tableoid, true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_vacuum_count++;
+ }
+ else
+ {
+ tabentry->vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->vacuum_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_analyze() -
  *
- * Tell the collector about the table we just analyzed.
+ * Report about the table we just analyzed.
  *
  * Caller must provide new live- and dead-tuples estimates, as well as a
  * flag indicating whether to reset the changes_since_analyze counter.
@@ -1439,9 +1657,14 @@ pgstat_report_analyze(Relation rel,
   PgStat_Counter livetuples, PgStat_Counter deadtuples,
   bool resetcounter)
 {
- PgStat_MsgAnalyze msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table   *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
  /*
@@ -1470,78 +1693,153 @@ pgstat_report_analyze(Relation rel,
  deadtuples = Max(deadtuples, 0);
  }
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
- msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = RelationGetRelid(rel);
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_resetcounter = resetcounter;
- msg.m_analyzetime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+ tabentry = pgstat_get_tab_entry(table, RelationGetRelid(rel), true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ /*
+ * If commanded, reset changes_since_analyze to zero.  This forgets any
+ * changes that were committed while the ANALYZE was in progress, but we
+ * have no good way to estimate how many of those there were.
+ */
+ if (resetcounter)
+ tabentry->changes_since_analyze = 0;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_analyze_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_analyze_count++;
+ }
+ else
+ {
+ tabentry->analyze_timestamp = GetCurrentTimestamp();
+ tabentry->analyze_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_recovery_conflict() -
  *
- * Tell the collector about a Hot Standby recovery conflict.
+ * Report a Hot Standby recovery conflict.
  * --------
  */
 void
 pgstat_report_recovery_conflict(int reason)
 {
- PgStat_MsgRecoveryConflict msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_reason = reason;
- pgstat_send(&msg, sizeof(msg));
+ switch (reason)
+ {
+ case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+
+ /*
+ * Since we drop the information about the database as soon as it
+ * replicates, there is no point in counting these conflicts.
+ */
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ BeDBStats.n_conflict_tablespace++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ BeDBStats.n_conflict_lock++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ BeDBStats.n_conflict_snapshot++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ BeDBStats.n_conflict_bufferpin++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ BeDBStats.n_conflict_startup_deadlock++;
+ break;
+ }
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ /* We had a chance to flush immediately */
+ pgstat_flush_recovery_conflict(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
+
+/*
+ * flush recovery conflict stats
+ */
+static void
+pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry)
+{
+ dbentry->n_conflict_tablespace += BeDBStats.n_conflict_tablespace;
+ dbentry->n_conflict_lock += BeDBStats.n_conflict_lock;
+ dbentry->n_conflict_snapshot += BeDBStats.n_conflict_snapshot;
+ dbentry->n_conflict_bufferpin += BeDBStats.n_conflict_bufferpin;
+ dbentry->n_conflict_startup_deadlock += BeDBStats.n_conflict_startup_deadlock;
+
+ BeDBStats.n_conflict_tablespace = 0;
+ BeDBStats.n_conflict_lock = 0;
+ BeDBStats.n_conflict_snapshot = 0;
+ BeDBStats.n_conflict_bufferpin = 0;
+ BeDBStats.n_conflict_startup_deadlock = 0;
 }
 
 /* --------
  * pgstat_report_deadlock() -
  *
- * Tell the collector about a deadlock detected.
+ * Report a deadlock detected.
  * --------
  */
 void
 pgstat_report_deadlock(void)
 {
- PgStat_MsgDeadlock msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
+ BeDBStats.n_deadlocks++;
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
-
-
-/* --------
- * pgstat_report_checksum_failures_in_db() -
- *
- * Tell the collector about one or more checksum failures.
- * --------
+/*
+ * flush deadlock stats
  */
-void
-pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+static void
+pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry)
 {
- PgStat_MsgChecksumFailure msg;
-
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
- return;
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
- msg.m_databaseid = dboid;
- msg.m_failurecount = failurecount;
- msg.m_failure_time = GetCurrentTimestamp();
-
- pgstat_send(&msg, sizeof(msg));
+ dbentry->n_deadlocks += BeDBStats.n_deadlocks;
+ BeDBStats.n_deadlocks = 0;
 }
 
 /* --------
@@ -1559,60 +1857,153 @@ pgstat_report_checksum_failure(void)
 /* --------
  * pgstat_report_tempfile() -
  *
- * Tell the collector about a temporary file.
+ * Report a temporary file.
  * --------
  */
 void
 pgstat_report_tempfile(size_t filesize)
 {
- PgStat_MsgTempFile msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE);
- msg.m_databaseid = MyDatabaseId;
- msg.m_filesize = filesize;
- pgstat_send(&msg, sizeof(msg));
-}
+ if (filesize > 0) /* Is there a case where filesize is really 0? */
+ {
+ BeDBStats.tmpfilesize += filesize; /* needs check overflow */
+ BeDBStats.n_tmpfiles++;
+ }
 
-
-/* ----------
- * pgstat_ping() -
- *
- * Send some junk data to the collector to increase traffic.
- * ----------
- */
-void
-pgstat_ping(void)
-{
- PgStat_MsgDummy msg;
-
- if (pgStatSock == PGINVALID_SOCKET)
+ if (BeDBStats.n_tmpfiles == 0)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
- pgstat_send(&msg, sizeof(msg));
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ /* We had a chance to flush immediately */
+ pgstat_flush_tempfile(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
-/* ----------
- * pgstat_send_inquiry() -
- *
- * Notify collector that we need fresh data.
- * ----------
+/*
+ * flush temporary file stats
  */
 static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
+pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry)
 {
- PgStat_MsgInquiry msg;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
- msg.clock_time = clock_time;
- msg.cutoff_time = cutoff_time;
- msg.databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
+ dbentry->n_temp_bytes += BeDBStats.tmpfilesize;
+ dbentry->n_temp_files += BeDBStats.n_tmpfiles;
+ BeDBStats.tmpfilesize = 0;
+ BeDBStats.n_tmpfiles = 0;
 }
 
+/* --------
+ * pgstat_report_checksum_failures_in_db(dboid, failure_count) -
+ *
+ * Report one or more checksum failures in the given database.
+ * --------
+ */
+void
+pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+{
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
+ ChecksumFailureEnt   *failent = NULL;
+
+ /* return if we are not collecting stats */
+ if (!area)
+ return;
+
+ /* Add in any count previously stashed for this database. */
+ if (BeDBStats.checksum_failures != NULL)
+ {
+ failent = hash_search(BeDBStats.checksum_failures, &dboid,
+  HASH_FIND, NULL);
+ if (failent)
+ failurecount += failent->count;
+ }
+
+ if (failurecount == 0)
+ return;
+
+ /* Look up the target database, which is not necessarily ours. */
+ dbentry = pgstat_get_db_entry(dboid,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ {
+ /* Couldn't get the lock; stash the count locally to flush later. */
+ if (!failent)
+ {
+ if (!BeDBStats.checksum_failures)
+ {
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(ChecksumFailureEnt);
+ BeDBStats.checksum_failures =
+ hash_create("pgstat checksum failure count hash",
+ 32, &ctl, HASH_ELEM | HASH_BLOBS);
+ }
+
+ failent = hash_search(BeDBStats.checksum_failures,
+  &dboid, HASH_ENTER, NULL);
+ }
+
+ failent->count = failurecount;
+ return;
+ }
+
+ /* We have a chance to flush immediately */
+ dbentry->n_checksum_failures += failurecount;
+
+ /*
+ * Drop only this database's pending entry; other databases may still
+ * have counts stashed in the local hash awaiting a flush.
+ */
+ if (failent)
+ hash_search(BeDBStats.checksum_failures, &dboid, HASH_REMOVE, NULL);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
+
+/*
+ * flush checksum failure counts for all databases
+ */
+static void
+pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry)
+{
+ HASH_SEQ_STATUS stat;
+ ChecksumFailureEnt *ent;
+ bool release_dbent;
+
+ if (BeDBStats.checksum_failures == NULL)
+ return;
+
+ hash_seq_init(&stat, BeDBStats.checksum_failures);
+ while ((ent = (ChecksumFailureEnt *) hash_seq_search(&stat)) != NULL)
+ {
+ release_dbent = false;
+
+ if (dbentry->databaseid != ent->dboid)
+ {
+ dbentry = pgstat_get_db_entry(ent->dboid,
+  PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
+ continue;
+
+ release_dbent = true;
+ }
+
+ dbentry->n_checksum_failures += ent->count;
+
+ if (release_dbent)
+ dshash_release_lock(pgStatDBHash, dbentry);
+ }
+
+ hash_destroy(BeDBStats.checksum_failures);
+ BeDBStats.checksum_failures = NULL;
+}
 
 /*
  * Initialize function call usage data.
@@ -1764,7 +2155,8 @@ pgstat_initstats(Relation rel)
  return;
  }
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  {
  /* We're not counting at all */
  rel->pgstat_info = NULL;
@@ -1783,6 +2175,24 @@ pgstat_initstats(Relation rel)
  rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
 }
 
+/*
+ * create_tabstat_hash - create local hash as transactional storage
+ */
+static HTAB *
+create_tabstat_hash(void)
+{
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(ctl));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(TabStatHashEntry);
+
+ return hash_create("pgstat TabStatusArray lookup hash table",
+   TABSTAT_QUANTUM,
+   &ctl,
+   HASH_ELEM | HASH_BLOBS);
+}
+
 /*
  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
  */
@@ -1798,18 +2208,7 @@ get_tabstat_entry(Oid rel_id, bool isshared)
  * Create hash table if we don't have it already.
  */
  if (pgStatTabHash == NULL)
- {
- HASHCTL ctl;
-
- memset(&ctl, 0, sizeof(ctl));
- ctl.keysize = sizeof(Oid);
- ctl.entrysize = sizeof(TabStatHashEntry);
-
- pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
- TABSTAT_QUANTUM,
- &ctl,
- HASH_ELEM | HASH_BLOBS);
- }
+ pgStatTabHash = create_tabstat_hash();
 
  /*
  * Find an entry or create a new one.
@@ -2422,30 +2821,33 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info,
 /* ----------
  * pgstat_fetch_stat_dbentry() -
  *
- * Support function for the SQL-callable pgstat* functions. Returns
- * the collected statistics for one database or NULL. NULL doesn't mean
- * that the database doesn't exist, it is just not yet known by the
- * collector, so the caller is better off to report ZERO instead.
- * ----------
+ * Find database stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
  */
 PgStat_StatDBEntry *
 pgstat_fetch_stat_dbentry(Oid dbid)
 {
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "local database stats hash",
+ .hash_entsize = sizeof(PgStat_StatDBEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,   /* already attached */
+ .dsh_params = &dsh_dbparams,
+ .hash = &pgStatLocalHash,
+ .dshash = &pgStatDBHash
+ };
 
- /*
- * Lookup the requested database; return NULL if not found
- */
- return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-  (void *) &dbid,
-  HASH_FIND, NULL);
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* If not done for this transaction, take a snapshot of global stats */
+ pgstat_snapshot_global_stats();
+
+ /* the caller has no business with snapshot-local members */
+ return (PgStat_StatDBEntry *) snapshot_statentry(&param, dbid);
 }
 
-
 /* ----------
  * pgstat_fetch_stat_tabentry() -
  *
@@ -2458,51 +2860,66 @@ pgstat_fetch_stat_dbentry(Oid dbid)
 PgStat_StatTabEntry *
 pgstat_fetch_stat_tabentry(Oid relid)
 {
- Oid dbid;
  PgStat_StatDBEntry *dbentry;
  PgStat_StatTabEntry *tabentry;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
+ /* Lookup our database, then look in its table hash table. */
+ dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
+ if (dbentry == NULL)
+ return NULL;
 
- /*
- * Lookup our database, then look in its table hash table.
- */
- dbid = MyDatabaseId;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  /*
  * If we didn't find it, maybe it's a shared table.
  */
- dbid = InvalidOid;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ dbentry = pgstat_fetch_stat_dbentry(InvalidOid);
+ if (dbentry == NULL)
+ return NULL;
+
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  return NULL;
 }
 
+/* ----------
+ * pgstat_fetch_stat_tabentry_extended() -
+ *
+ * Find table stats entry on backends. The returned entries are cached until
+ * transaction end or pgstat_clear_snapshot() is called.
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid reloid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "table stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatTabEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_tblparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->tables;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_tables;
+ param.dshash = &dbent->dshash_tables;
+
+ return (PgStat_StatTabEntry *)
+ snapshot_statentry(&param, reloid);
+}
+
 
 /* ----------
  * pgstat_fetch_stat_funcentry() -
@@ -2517,21 +2934,90 @@ pgstat_fetch_stat_funcentry(Oid func_id)
  PgStat_StatDBEntry *dbentry;
  PgStat_StatFuncEntry *funcentry = NULL;
 
- /* load the stats file if needed */
- backend_read_statsfile();
-
- /* Lookup our database, then find the requested function.  */
+ /* Lookup our database, then find the requested function */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
- if (dbentry != NULL && dbentry->functions != NULL)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &func_id,
- HASH_FIND, NULL);
- }
+ if (dbentry == NULL)
+ return NULL;
+
+ funcentry = pgstat_fetch_stat_funcentry_extended(dbentry, func_id);
 
  return funcentry;
 }
 
+/* ----------
+ * pgstat_fetch_stat_funcentry_extended() -
+ *
+ * Find function stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
+ *
+ *  dbent is of type (PgStat_StatDBEntry *), and its body must be a
+ *  PgStat_StatDBEntry returned from pgstat_fetch_stat_dbentry().
+ */
+static PgStat_StatFuncEntry *
+pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "function stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatFuncEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_funcparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ return NULL;
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->functions;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_functions;
+ param.dshash = &dbent->dshash_functions;
+
+ return (PgStat_StatFuncEntry *)
+ snapshot_statentry(&param, funcid);
+}
+
+/*
+ * pgstat_snapshot_global_stats() -
+ *
+ * Makes a snapshot of global stats if not done yet.  They will be kept until
+ * subsequent call of pgstat_clear_snapshot() or the end of the current
+ * memory context (typically TopTransactionContext).
+ */
+static void
+pgstat_snapshot_global_stats(void)
+{
+ MemoryContext oldcontext;
+
+ pgstat_attach_shared_stats();
+
+ /* Nothing to do if already done */
+ if (global_snapshot_is_valid)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(pgStatSnapshotContext);
+
+ LWLockAcquire(StatsLock, LW_SHARED);
+ memcpy(&snapshot_globalStats, shared_globalStats,
+   sizeof(PgStat_GlobalStats));
+
+ memcpy(&snapshot_archiverStats, shared_archiverStats,
+   sizeof(PgStat_ArchiverStats));
+ LWLockRelease(StatsLock);
+
+ global_snapshot_is_valid = true;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return;
+}
 
 /* ----------
  * pgstat_fetch_stat_beentry() -
@@ -2603,9 +3089,10 @@ pgstat_fetch_stat_numbackends(void)
 PgStat_ArchiverStats *
 pgstat_fetch_stat_archiver(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &archiverStats;
+ return &snapshot_archiverStats;
 }
 
 
@@ -2620,9 +3107,10 @@ pgstat_fetch_stat_archiver(void)
 PgStat_GlobalStats *
 pgstat_fetch_global(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &globalStats;
+ return &snapshot_globalStats;
 }
 
 
@@ -2836,8 +3324,8 @@ pgstat_initialize(void)
  MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
  }
 
- /* Set up a process-exit hook to clean up */
- on_shmem_exit(pgstat_beshutdown_hook, 0);
+ /* needs to be called before dsm shutdown */
+ before_shmem_exit(pgstat_beshutdown_hook, 0);
 }
 
 /* ----------
@@ -2935,7 +3423,7 @@ pgstat_bestart(void)
  lbeentry.st_backendType = B_STARTUP;
  break;
  case ArchiverProcess:
- beentry->st_backendType = B_ARCHIVER;
+ lbeentry.st_backendType = B_ARCHIVER;
  break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
@@ -3071,6 +3559,10 @@ pgstat_bestart(void)
  /* Update app name to current GUC setting */
  if (application_name)
  pgstat_report_appname(application_name);
+
+
+ /* attach shared database stats area */
+ pgstat_attach_shared_stats();
 }
 
 /*
@@ -3106,6 +3598,8 @@ pgstat_beshutdown_hook(int code, Datum arg)
  beentry->st_procpid = 0; /* mark invalid */
 
  PGSTAT_END_WRITE_ACTIVITY(beentry);
+
+ pgstat_detach_shared_stats(true);
 }
 
 
@@ -3366,7 +3860,8 @@ pgstat_read_current_status(void)
 #endif
  int i;
 
- Assert(!pgStatRunningInCollector);
+ Assert(IsUnderPostmaster);
+
  if (localBackendStatusTable)
  return; /* already done */
 
@@ -3661,9 +4156,6 @@ pgstat_get_wait_activity(WaitEventActivity w)
  case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
  event_name = "LogicalLauncherMain";
  break;
- case WAIT_EVENT_PGSTAT_MAIN:
- event_name = "PgStatMain";
- break;
  case WAIT_EVENT_RECOVERY_WAL_ALL:
  event_name = "RecoveryWalAll";
  break;
@@ -4323,75 +4815,43 @@ pgstat_get_backend_desc(BackendType backendType)
  * ------------------------------------------------------------
  */
 
-
-/* ----------
- * pgstat_setheader() -
- *
- * Set common header fields in a statistics message
- * ----------
- */
-static void
-pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
-{
- hdr->m_type = mtype;
-}
-
-
-/* ----------
- * pgstat_send() -
- *
- * Send out one statistics message to the collector
- * ----------
- */
-static void
-pgstat_send(void *msg, int len)
-{
- int rc;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
- ((PgStat_MsgHdr *) msg)->m_size = len;
-
- /* We'll retry after EINTR, but ignore all other failures */
- do
- {
- rc = send(pgStatSock, msg, len, 0);
- } while (rc < 0 && errno == EINTR);
-
-#ifdef USE_ASSERT_CHECKING
- /* In debug builds, log send failures ... */
- if (rc < 0)
- elog(LOG, "could not send to statistics collector: %m");
-#endif
-}
-
 /* ----------
  * pgstat_send_archiver() -
  *
- * Tell the collector about the WAL file that we successfully
- * archived or failed to archive.
+ * Report archiver statistics
  * ----------
  */
 void
 pgstat_send_archiver(const char *xlog, bool failed)
 {
- PgStat_MsgArchiver msg;
+ TimestampTz now = GetCurrentTimestamp();
 
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER);
- msg.m_failed = failed;
- StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog));
- msg.m_timestamp = GetCurrentTimestamp();
- pgstat_send(&msg, sizeof(msg));
+ if (failed)
+ {
+ /* Failed archival attempt */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->failed_count;
+ memcpy(shared_archiverStats->last_failed_wal, xlog,
+   sizeof(shared_archiverStats->last_failed_wal));
+ shared_archiverStats->last_failed_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ else
+ {
+ /* Successful archival operation */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->archived_count;
+ memcpy(shared_archiverStats->last_archived_wal, xlog,
+   sizeof(shared_archiverStats->last_archived_wal));
+ shared_archiverStats->last_archived_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
 }
 
 /* ----------
  * pgstat_send_bgwriter() -
  *
- * Send bgwriter statistics to the collector
+ * Report bgwriter statistics
  * ----------
  */
 void
@@ -4400,6 +4860,8 @@ pgstat_send_bgwriter(void)
  /* We assume this initializes to zeroes */
  static const PgStat_MsgBgWriter all_zeroes;
 
+ PgStat_MsgBgWriter *s = &BgWriterStats;
+
  /*
  * This function can be called even if nothing at all has happened. In
  * this case, avoid sending a completely empty message to the stats
@@ -4408,11 +4870,18 @@ pgstat_send_bgwriter(void)
  if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
  return;
 
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
- pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ shared_globalStats->timed_checkpoints += s->m_timed_checkpoints;
+ shared_globalStats->requested_checkpoints += s->m_requested_checkpoints;
+ shared_globalStats->checkpoint_write_time += s->m_checkpoint_write_time;
+ shared_globalStats->checkpoint_sync_time += s->m_checkpoint_sync_time;
+ shared_globalStats->buf_written_checkpoints += s->m_buf_written_checkpoints;
+ shared_globalStats->buf_written_clean += s->m_buf_written_clean;
+ shared_globalStats->maxwritten_clean += s->m_maxwritten_clean;
+ shared_globalStats->buf_written_backend += s->m_buf_written_backend;
+ shared_globalStats->buf_fsync_backend += s->m_buf_fsync_backend;
+ shared_globalStats->buf_alloc += s->m_buf_alloc;
+ LWLockRelease(StatsLock);
 
  /*
  * Clear out the statistics buffer, so it can be re-used.
@@ -4421,305 +4890,164 @@ pgstat_send_bgwriter(void)
 }
 
 
-/* ----------
- * PgstatCollectorMain() -
+/*
+ * Pin and Unpin dbentry.
  *
- * Start up the statistics collector process.  This is the body of the
- * postmaster child process.
- *
- * The argc/argv parameters are valid only in EXEC_BACKEND case.
- * ----------
+ * To reduce memory usage and for speed, counters are reset by recreating the
+ * dshash instead of removing entries one-by-one under the whole-dshash lock.
+ * On the other hand, a dshash cannot be destroyed until all referrers have
+ * gone, so other backends may be kept waiting for the counter reset for a
+ * non-negligible time. We isolate the hashes under destruction as another
+ * "generation", meaning they are no longer used but cannot be removed yet.
+ *
+ * When we start accessing hashes on a dbentry, call pin_hashes() to acquire
+ * the current generation. unpin_hashes() removes the older generation's
+ * hashes once all referrers have gone.
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+static int
+pin_hashes(PgStat_StatDBEntry *dbentry)
 {
- int len;
- PgStat_Msg msg;
- int wr;
+ int generation;
 
- /*
- * Ignore all signals usually bound to some action in the postmaster,
- * except SIGHUP and SIGQUIT.  Note we don't need a SIGUSR1 handler to
- * support latch operations, because we only use a local latch.
- */
- pqsignal(SIGHUP, pgstat_sighup_handler);
- pqsignal(SIGINT, SIG_IGN);
- pqsignal(SIGTERM, SIG_IGN);
- pqsignal(SIGQUIT, pgstat_exit);
- pqsignal(SIGALRM, SIG_IGN);
- pqsignal(SIGPIPE, SIG_IGN);
- pqsignal(SIGUSR1, SIG_IGN);
- pqsignal(SIGUSR2, SIG_IGN);
- /* Reset some signals that are accepted by postmaster but not here */
- pqsignal(SIGCHLD, SIG_DFL);
- PG_SETMASK(&UnBlockSig);
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->refcnt++;
+ generation = dbentry->generation;
+ LWLockRelease(&dbentry->lock);
 
- /*
- * Identify myself via ps
- */
- init_ps_display("stats collector", "", "", "");
+ dshash_release_lock(pgStatDBHash, dbentry);
 
- /*
- * Read in existing stats files or initialize the stats to zero.
- */
- pgStatRunningInCollector = true;
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
+ return generation;
+}
 
- /*
- * Loop to process messages until we get SIGQUIT or detect ungraceful
- * death of our parent postmaster.
- *
- * For performance reasons, we don't want to do ResetLatch/WaitLatch after
- * every message; instead, do that only after a recv() fails to obtain a
- * message.  (This effectively means that if backends are sending us stuff
- * like mad, we won't notice postmaster death until things slack off a
- * bit; which seems fine.) To do that, we have an inner loop that
- * iterates as long as recv() succeeds.  We do recognize got_SIGHUP inside
- * the inner loop, which means that such interrupts will get serviced but
- * the latch won't get cleared until next time there is a break in the
- * action.
- */
- for (;;)
+/*
+ * Unpin hashes in dbentry. If the given generation is isolated, destroy it
+ * after all referrers have gone. Otherwise just decrease the reference count.
+ */
+static void
+unpin_hashes(PgStat_StatDBEntry *dbentry, int generation)
+{
+ dshash_table *tables;
+ dshash_table *funcs = NULL;
+
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* using current generation, just decrease refcount */
+ if (dbentry->generation == generation)
  {
- /* Clear any already-pending wakeups */
- ResetLatch(MyLatch);
-
- /*
- * Quit if we get SIGQUIT from the postmaster.
- */
- if (need_exit)
- break;
-
- /*
- * Inner loop iterates as long as we keep getting messages, or until
- * need_exit becomes set.
- */
- while (!need_exit)
- {
- /*
- * Reload configuration if we got SIGHUP from the postmaster.
- */
- if (got_SIGHUP)
- {
- got_SIGHUP = false;
- ProcessConfigFile(PGC_SIGHUP);
- }
-
- /*
- * Write the stats file(s) if a new request has arrived that is
- * not satisfied by existing file(s).
- */
- if (pgstat_write_statsfile_needed())
- pgstat_write_statsfiles(false, false);
-
- /*
- * Try to receive and process a message.  This will not block,
- * since the socket is set to non-blocking mode.
- *
- * XXX On Windows, we have to force pgwin32_recv to cooperate,
- * despite the previous use of pg_set_noblock() on the socket.
- * This is extremely broken and should be fixed someday.
- */
-#ifdef WIN32
- pgwin32_noblock = 1;
-#endif
-
- len = recv(pgStatSock, (char *) &msg,
-   sizeof(PgStat_Msg), 0);
-
-#ifdef WIN32
- pgwin32_noblock = 0;
-#endif
-
- if (len < 0)
- {
- if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
- break; /* out of inner loop */
- ereport(ERROR,
- (errcode_for_socket_access(),
- errmsg("could not read statistics message: %m")));
- }
-
- /*
- * We ignore messages that are smaller than our common header
- */
- if (len < sizeof(PgStat_MsgHdr))
- continue;
-
- /*
- * The received length must match the length in the header
- */
- if (msg.msg_hdr.m_size != len)
- continue;
-
- /*
- * O.K. - we accept this message.  Process it.
- */
- switch (msg.msg_hdr.m_type)
- {
- case PGSTAT_MTYPE_DUMMY:
- break;
-
- case PGSTAT_MTYPE_INQUIRY:
- pgstat_recv_inquiry(&msg.msg_inquiry, len);
- break;
-
- case PGSTAT_MTYPE_TABSTAT:
- pgstat_recv_tabstat(&msg.msg_tabstat, len);
- break;
-
- case PGSTAT_MTYPE_TABPURGE:
- pgstat_recv_tabpurge(&msg.msg_tabpurge, len);
- break;
-
- case PGSTAT_MTYPE_DROPDB:
- pgstat_recv_dropdb(&msg.msg_dropdb, len);
- break;
-
- case PGSTAT_MTYPE_RESETCOUNTER:
- pgstat_recv_resetcounter(&msg.msg_resetcounter, len);
- break;
-
- case PGSTAT_MTYPE_RESETSHAREDCOUNTER:
- pgstat_recv_resetsharedcounter(
-   &msg.msg_resetsharedcounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_RESETSINGLECOUNTER:
- pgstat_recv_resetsinglecounter(
-   &msg.msg_resetsinglecounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_AUTOVAC_START:
- pgstat_recv_autovac(&msg.msg_autovacuum_start, len);
- break;
-
- case PGSTAT_MTYPE_VACUUM:
- pgstat_recv_vacuum(&msg.msg_vacuum, len);
- break;
-
- case PGSTAT_MTYPE_ANALYZE:
- pgstat_recv_analyze(&msg.msg_analyze, len);
- break;
-
- case PGSTAT_MTYPE_ARCHIVER:
- pgstat_recv_archiver(&msg.msg_archiver, len);
- break;
-
- case PGSTAT_MTYPE_BGWRITER:
- pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
- break;
-
- case PGSTAT_MTYPE_FUNCSTAT:
- pgstat_recv_funcstat(&msg.msg_funcstat, len);
- break;
-
- case PGSTAT_MTYPE_FUNCPURGE:
- pgstat_recv_funcpurge(&msg.msg_funcpurge, len);
- break;
-
- case PGSTAT_MTYPE_RECOVERYCONFLICT:
- pgstat_recv_recoveryconflict(
- &msg.msg_recoveryconflict,
- len);
- break;
-
- case PGSTAT_MTYPE_DEADLOCK:
- pgstat_recv_deadlock(&msg.msg_deadlock, len);
- break;
-
- case PGSTAT_MTYPE_TEMPFILE:
- pgstat_recv_tempfile(&msg.msg_tempfile, len);
- break;
-
- case PGSTAT_MTYPE_CHECKSUMFAILURE:
- pgstat_recv_checksum_failure(
- &msg.msg_checksumfailure,
- len);
- break;
-
- default:
- break;
- }
- } /* end of inner message-processing loop */
-
- /* Sleep until there's something to do */
-#ifndef WIN32
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-   pgStatSock, -1L,
-   WAIT_EVENT_PGSTAT_MAIN);
-#else
-
- /*
- * Windows, at least in its Windows Server 2003 R2 incarnation,
- * sometimes loses FD_READ events.  Waking up and retrying the recv()
- * fixes that, so don't sleep indefinitely.  This is a crock of the
- * first water, but until somebody wants to debug exactly what's
- * happening there, this is the best we can do.  The two-second
- * timeout matches our pre-9.2 behavior, and needs to be short enough
- * to not provoke "using stale statistics" complaints from
- * backend_read_statsfile.
- */
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-   pgStatSock,
-   2 * 1000L /* msec */ ,
-   WAIT_EVENT_PGSTAT_MAIN);
-#endif
-
- /*
- * Emergency bailout if postmaster has died.  This is to avoid the
- * necessity for manual cleanup of all postmaster children.
- */
- if (wr & WL_POSTMASTER_DEATH)
- break;
- } /* end of outer loop */
+ dbentry->refcnt--;
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
 
  /*
- * Save the final stats to reuse at next startup.
+ * It is isolated, waiting for all referrers to end.
  */
- pgstat_write_statsfiles(true, true);
+ Assert(dbentry->generation == generation + 1);
 
- exit(0);
+ if (--dbentry->prev_refcnt > 0)
+ {
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
+
+ /* no referrer remains, remove the hashes */
+ tables = dshash_attach(area, &dsh_tblparams, dbentry->prev_tables, 0);
+ if (dbentry->prev_functions != DSM_HANDLE_INVALID)
+ funcs = dshash_attach(area, &dsh_funcparams,
+  dbentry->prev_functions, 0);
+
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
+
+ /* release the entry immediately */
+ LWLockRelease(&dbentry->lock);
+
+ dshash_destroy(tables);
+ if (funcs)
+ dshash_destroy(funcs);
+
+ return;
 }
 
-
-/* SIGQUIT signal handler for collector process */
-static void
-pgstat_exit(SIGNAL_ARGS)
+/*
+ * Attach and return the specified generation of the table hash for the
+ * given dbentry, taking and releasing the dbentry lock internally.
+ */
+static dshash_table *
+attach_table_hash(PgStat_StatDBEntry *dbent, int gen)
 {
- int save_errno = errno;
+ dshash_table *ret;
 
- need_exit = true;
- SetLatch(MyLatch);
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
 
- errno = save_errno;
+ if (dbent->generation == gen)
+ ret = dshash_attach(area, &dsh_tblparams, dbent->tables, 0);
+ else
+ {
+ Assert (dbent->generation == gen + 1);
+ Assert (dbent->prev_tables != DSM_HANDLE_INVALID);
+ ret = dshash_attach(area, &dsh_tblparams, dbent->prev_tables, 0);
+ }
+ LWLockRelease(&dbent->lock);
+
+ return ret;
 }
 
-/* SIGHUP handler for collector process */
-static void
-pgstat_sighup_handler(SIGNAL_ARGS)
+/* attach and return the specified generation of function hash */
+static dshash_table *
+attach_function_hash(PgStat_StatDBEntry *dbent, int gen)
 {
- int save_errno = errno;
+ dshash_table *ret = NULL;
 
- got_SIGHUP = true;
- SetLatch(MyLatch);
 
- errno = save_errno;
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
+
+ if (dbent->generation == gen)
+ {
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ {
+ dshash_table *funchash =
+ dshash_create(area, &dsh_funcparams, 0);
+ dbent->functions = dshash_get_hash_table_handle(funchash);
+
+ ret = funchash;
+ }
+ else
+ ret =  dshash_attach(area, &dsh_funcparams, dbent->functions, 0);
+ }
+ /* don't bother creating useless hash */
+
+ LWLockRelease(&dbent->lock);
+
+ return  ret;
+}
+
+static void
+init_dbentry(PgStat_StatDBEntry *dbentry)
+{
+ LWLockInitialize(&dbentry->lock, LWTRANCHE_STATS);
+ dbentry->generation = 0;
+ dbentry->refcnt = 0;
+ dbentry->prev_refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
 }
 
 /*
  * Subroutine to clear stats in a database entry
  *
- * Tables and functions hashes are initialized to empty.
+ * Reset all counters in the dbentry. The tables and functions dshashes are
+ * destroyed.  If any backend is pinning this dbentry, the current dshashes
+ * are stashed away as the previous "generation" until all accessors are
+ * gone. If the previous generation is already occupied, the current dshashes
+ * are so fresh that they don't need to be cleared.
  */
 static void
 reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 {
- HASHCTL hash_ctl;
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
 
  dbentry->n_xact_commit = 0;
  dbentry->n_xact_rollback = 0;
@@ -4744,72 +5072,865 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
  dbentry->n_block_read_time = 0;
  dbentry->n_block_write_time = 0;
 
+ if (dbentry->refcnt == 0)
+ {
+ /*
+ * No one is referring to the current hash. It is very costly to remove
+ * entries from a dshash individually, so just destroy the whole hash.
+ * If someone pinned this entry just after this, pin_hashes() returns
+ * the current generation and the attach happens after the LWLock below
+ * is released.
+ */
+ dshash_table *tbl;
+
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ dbentry->tables = DSM_HANDLE_INVALID;
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ dbentry->functions = DSM_HANDLE_INVALID;
+ }
+ }
+ else if (dbentry->prev_refcnt == 0)
+ {
+ /*
+ * Someone is still referring to the current hash and previous slot is
+ * vacant. Stash out the current hash to the previous slot.
+ */
+ dbentry->prev_refcnt = dbentry->refcnt;
+ dbentry->prev_tables = dbentry->tables;
+ dbentry->prev_functions = dbentry->functions;
+ dbentry->refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->generation++;
+ }
+ else
+ {
+ Assert(dbentry->prev_refcnt > 0 && dbentry->refcnt > 0);
+ /*
+ * If we get here, we have just received another reset request while
+ * the old hashes are still waiting for all referrers to be released.
+ * That should take only a short time, so just ignore this request.
+ *
+ * As a side effect, the resetter may see non-zero values before
+ * anyone updates them, but that is indistinguishable from someone
+ * having updated them just before the read.
+ */
+ }
+
+ /* Create new table hash if not exists */
+ if (dbentry->tables == DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
+
+ /* Create new function hash if not exists and needed. */
+ if (dbentry->functions == DSM_HANDLE_INVALID &&
+ pgstat_track_functions != TRACK_FUNC_OFF)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_funcparams, 0);
+ dbentry->functions = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
+
  dbentry->stat_reset_timestamp = GetCurrentTimestamp();
- dbentry->stats_timestamp = 0;
 
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS);
+ LWLockRelease(&dbentry->lock);
 }
 
 /*
- * Lookup the hash table entry for the specified database. If no hash
- * table entry exists, initialize it, if the create parameter is true.
- * Else, return NULL.
+ * Create the filename for a DB stat file; "filename" is an output
+ * parameter pointing to a character buffer of length "len".
  */
-static PgStat_StatDBEntry *
-pgstat_get_db_entry(Oid databaseid, bool create)
+static void
+get_dbstat_filename(bool tempname, Oid databaseid, char *filename, int len)
 {
- PgStat_StatDBEntry *result;
- bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
+ int printed;
 
- /* Lookup or create the hash table entry for this database */
- result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- &databaseid,
- action, &found);
+ /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
+ printed = snprintf(filename, len, "%s/db_%u.%s",
+   PGSTAT_STAT_PERMANENT_DIRECTORY,
+   databaseid,
+   tempname ? "tmp" : "stat");
+ if (printed >= len)
+ elog(ERROR, "overlength pgstat path");
+}
 
- if (!create && !found)
- return NULL;
+/* ----------
+ * pgstat_write_statsfiles() -
+ * Write the global statistics file, as well as DB files.
+ * ----------
+ */
+void
+pgstat_write_statsfiles(void)
+{
+ dshash_seq_status hstat;
+ PgStat_StatDBEntry *dbentry;
+ FILE   *fpout;
+ int32 format_id;
+ const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+ int rc;
+
+ /* stats is not initialized yet. just return. */
+ if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID)
+ return;
+
+ elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
  /*
- * If not found, initialize the new one.  This creates empty hash tables
- * for tables and functions, too.
+ * Open the statistics temp file to write out the current values.
  */
+ fpout = AllocateFile(tmpfile, PG_BINARY_W);
+ if (fpout == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary statistics file \"%s\": %m",
+ tmpfile)));
+ return;
+ }
+
+ /*
+ * Set the timestamp of the stats file.
+ */
+ shared_globalStats->stats_timestamp = GetCurrentTimestamp();
+
+ /*
+ * Write the file header --- currently just a format ID.
+ */
+ format_id = PGSTAT_FILE_FORMAT_ID;
+ rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Write global stats struct
+ */
+ rc = fwrite(shared_globalStats, sizeof(*shared_globalStats), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Write archiver stats struct
+ */
+ rc = fwrite(shared_archiverStats, sizeof(*shared_archiverStats), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Walk through the database table.
+ */
+ dshash_seq_init(&hstat, pgStatDBHash, false, false);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
+ {
+ /*
+ * Write out the table and function stats for this DB into the
+ * appropriate per-DB stat file, if required.
+ */
+ /* Make DB's timestamp consistent with the global stats */
+ dbentry->stats_timestamp = shared_globalStats->stats_timestamp;
+
+ pgstat_write_pgStatDBHashfile(dbentry);
+
+ /*
+ * Write out the DB entry. We don't write the tables or functions
+ * pointers, since they're of no use to any other process.
+ */
+ fputc('D', fpout);
+ rc = fwrite(dbentry,
+ offsetof(PgStat_StatDBEntry, generation), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+
+ /*
+ * No more output to be done. Close the temp file and replace the old
+ * pgstat.stat with it.  The ferror() check replaces testing for error
+ * after each individual fputc or fwrite above.
+ */
+ fputc('E', fpout);
+
+ if (ferror(fpout))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write temporary statistics file \"%s\": %m",
+ tmpfile)));
+ FreeFile(fpout);
+ unlink(tmpfile);
+ }
+ else if (FreeFile(fpout) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not close temporary statistics file \"%s\": %m",
+ tmpfile)));
+ unlink(tmpfile);
+ }
+ else if (rename(tmpfile, statfile) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+ tmpfile, statfile)));
+ unlink(tmpfile);
+ }
+}
+
+/* ----------
+ * pgstat_write_pgStatDBHashfile() -
+ * Write the stat file for a single database.
+ * ----------
+ */
+static void
+pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
+{
+ dshash_seq_status tstat;
+ dshash_seq_status fstat;
+ PgStat_StatTabEntry *tabentry;
+ PgStat_StatFuncEntry *funcentry;
+ FILE   *fpout;
+ int32 format_id;
+ Oid dbid = dbentry->databaseid;
+ int rc;
+ char tmpfile[MAXPGPATH];
+ char statfile[MAXPGPATH];
+ dshash_table *tbl;
+
+ get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH);
+ get_dbstat_filename(false, dbid, statfile, MAXPGPATH);
+
+ elog(DEBUG2, "writing stats file \"%s\"", statfile);
+
+ /*
+ * Open the statistics temp file to write out the current values.
+ */
+ fpout = AllocateFile(tmpfile, PG_BINARY_W);
+ if (fpout == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary statistics file \"%s\": %m",
+ tmpfile)));
+ return;
+ }
+
+ /*
+ * Write the file header --- currently just a format ID.
+ */
+ format_id = PGSTAT_FILE_FORMAT_ID;
+ rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Walk through the database's access stats per table.
+ */
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_seq_init(&tstat, tbl, false, false);
+ while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL)
+ {
+ fputc('T', fpout);
+ rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+ dshash_detach(tbl);
+
+ /*
+ * Walk through the database's function stats table.
+ */
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_seq_init(&fstat, tbl, false, false);
+ while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL)
+ {
+ fputc('F', fpout);
+ rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+ dshash_detach(tbl);
+ }
+
+ /*
+ * No more output to be done. Close the temp file and replace the old
+ * pgstat.stat with it.  The ferror() check replaces testing for error
+ * after each individual fputc or fwrite above.
+ */
+ fputc('E', fpout);
+
+ if (ferror(fpout))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write temporary statistics file \"%s\": %m",
+ tmpfile)));
+ FreeFile(fpout);
+ unlink(tmpfile);
+ }
+ else if (FreeFile(fpout) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not close temporary statistics file \"%s\": %m",
+ tmpfile)));
+ unlink(tmpfile);
+ }
+ else if (rename(tmpfile, statfile) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+ tmpfile, statfile)));
+ unlink(tmpfile);
+ }
+}
+
+/* ----------
+ * pgstat_read_statsfiles() -
+ *
+ * Reads in existing statistics collector files into the shared stats hash.
+ *
+ * ----------
+ */
+void
+pgstat_read_statsfiles(void)
+{
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatDBEntry dbbuf;
+ FILE   *fpin;
+ int32 format_id;
+ bool found;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+
+ /* shouldn't be called from postmaster  */
+ Assert(IsUnderPostmaster);
+
+ elog(DEBUG2, "reading stats file \"%s\"", statfile);
+
+ /*
+ * Set the current timestamp (will be kept only in case we can't load an
+ * existing statsfile).
+ */
+ shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
+ shared_archiverStats->stat_reset_timestamp =
+ shared_globalStats->stat_reset_timestamp;
+
+ /*
+ * Try to open the stats file. If it doesn't exist, the backends simply
+ * return zero for anything and the collector simply starts from scratch
+ * with empty counters.
+ *
+ * ENOENT is a possibility if the stats collector is not running or has
+ * not yet written the stats file the first time.  Any other failure
+ * condition is suspicious.
+ */
+ if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+ {
+ if (errno != ENOENT)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open statistics file \"%s\": %m",
+ statfile)));
+ return;
+ }
+
+ /*
+ * Verify it's of the expected format.
+ */
+ if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
+ format_id != PGSTAT_FILE_FORMAT_ID)
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ goto done;
+ }
+
+ /*
+ * Read global stats struct
+ */
+ if (fread(shared_globalStats, 1, sizeof(*shared_globalStats), fpin) !=
+ sizeof(*shared_globalStats))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
+ goto done;
+ }
+
+ /*
+ * Read archiver stats struct
+ */
+ if (fread(shared_archiverStats, 1, sizeof(*shared_archiverStats), fpin) !=
+ sizeof(*shared_archiverStats))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+ goto done;
+ }
+
+ /*
+ * We found an existing collector stats file. Read it and put all the
+ * hashtable entries into place.
+ */
+ for (;;)
+ {
+ switch (fgetc(fpin))
+ {
+ /*
+ * 'D' A PgStat_StatDBEntry struct describing a database
+ * follows.
+ */
+ case 'D':
+ if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, generation),
+  fpin) != offsetof(PgStat_StatDBEntry, generation))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ /*
+ * Add to the DB hash
+ */
+ dbentry = (PgStat_StatDBEntry *)
+ dshash_find_or_insert(pgStatDBHash, (void *) &dbbuf.databaseid,
+  &found);
+
+ /* don't allow duplicate dbentries */
+ if (found)
+ {
+ dshash_release_lock(pgStatDBHash, dbentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ /* initialize the new shared entry */
+ init_dbentry(dbentry);
+
+ memcpy(dbentry, &dbbuf,
+   offsetof(PgStat_StatDBEntry, generation));
+
+ /* Read the data from the database-specific file. */
+ pgstat_read_pgStatDBHashfile(dbentry);
+ dshash_release_lock(pgStatDBHash, dbentry);
+ break;
+
+ case 'E':
+ goto done;
+
+ default:
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+ }
+
+done:
+ FreeFile(fpin);
+
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
+
+ return;
+}
+
+
+/* ----------
+ * pgstat_read_pgStatDBHashfile() -
+ *
+ * Reads in the at-rest statistics file and creates the shared statistics
+ * tables. The file is removed after reading.
+ * ----------
+ */
+static void
+pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
+{
+ PgStat_StatTabEntry *tabentry;
+ PgStat_StatTabEntry tabbuf;
+ PgStat_StatFuncEntry funcbuf;
+ PgStat_StatFuncEntry *funcentry;
+ dshash_table *tabhash = NULL;
+ dshash_table *funchash = NULL;
+ FILE   *fpin;
+ int32 format_id;
+ bool found;
+ char statfile[MAXPGPATH];
+
+ get_dbstat_filename(false, dbentry->databaseid, statfile, MAXPGPATH);
+
+ /*
+ * Try to open the stats file. If it doesn't exist, the backends simply
+ * return zero for anything and the collector simply starts from scratch
+ * with empty counters.
+ *
+ * ENOENT is a possibility if the stats collector is not running or has
+ * not yet written the stats file the first time.  Any other failure
+ * condition is suspicious.
+ */
+ if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+ {
+ if (errno != ENOENT)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open statistics file \"%s\": %m",
+ statfile)));
+ return;
+ }
+
+ /*
+ * Verify it's of the expected format.
+ */
+ if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
+ format_id != PGSTAT_FILE_FORMAT_ID)
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ goto done;
+ }
+
+ /*
+ * We found an existing statistics file. Read it and put all the hashtable
+ * entries into place.
+ */
+ for (;;)
+ {
+ switch (fgetc(fpin))
+ {
+ /*
+ * 'T' A PgStat_StatTabEntry follows.
+ */
+ case 'T':
+ if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
+  fpin) != sizeof(PgStat_StatTabEntry))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ if (tabhash == NULL)
+ {
+ tabhash = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables =
+ dshash_get_hash_table_handle(tabhash);
+ }
+
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(tabhash,
+  (void *) &tabbuf.tableid, &found);
+
+ /* don't allow duplicate entries */
+ if (found)
+ {
+ dshash_release_lock(tabhash, tabentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+ dshash_release_lock(tabhash, tabentry);
+ break;
+
+ /*
+ * 'F' A PgStat_StatFuncEntry follows.
+ */
+ case 'F':
+ if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
+  fpin) != sizeof(PgStat_StatFuncEntry))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ if (funchash == NULL)
+ {
+ funchash = dshash_create(area, &dsh_funcparams, 0);
+ dbentry->functions =
+ dshash_get_hash_table_handle(funchash);
+ }
+
+ funcentry = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert(funchash,
+  (void *) &funcbuf.functionid, &found);
+
+ if (found)
+ {
+ dshash_release_lock(funchash, funcentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ memcpy(funcentry, &funcbuf, sizeof(funcbuf));
+ dshash_release_lock(funchash, funcentry);
+ break;
+
+ /*
+ * 'E' The EOF marker of a complete stats file.
+ */
+ case 'E':
+ goto done;
+
+ default:
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+ }
+
+done:
+ if (tabhash)
+ dshash_detach(tabhash);
+ if (funchash)
+ dshash_detach(funchash);
+
+ FreeFile(fpin);
+
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
+}
+
+/* ----------
+ * pgstat_setup_memcxt() -
+ *
+ * Create pgStatLocalContext and pgStatSnapshotContext, if not already done.
+ * ----------
+ */
+static void
+pgstat_setup_memcxt(void)
+{
+ if (!pgStatLocalContext)
+ pgStatLocalContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Backend statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
+
+ if (!pgStatSnapshotContext)
+ pgStatSnapshotContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Database statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
+}
+
+/* ----------
+ * pgstat_clear_snapshot() -
+ *
+ * Discard any data collected in the current transaction.  Any subsequent
+ * request will cause new snapshots to be read.
+ *
+ * This is also invoked during transaction commit or abort to discard
+ * the no-longer-wanted snapshot.
+ * ----------
+ */
+void
+pgstat_clear_snapshot(void)
+{
+ /* Release memory, if any was allocated */
+ if (pgStatLocalContext)
+ {
+ MemoryContextDelete(pgStatLocalContext);
+
+ /* Reset variables */
+ pgStatLocalContext = NULL;
+ localBackendStatusTable = NULL;
+ localNumBackends = 0;
+ }
+
+ if (pgStatSnapshotContext)
+ clear_snapshot  = true;
+}
+
+static bool
+pgstat_update_tabentry(dshash_table *tabhash, PgStat_TableStatus *stat,
+   bool nowait)
+{
+ PgStat_StatTabEntry *tabentry;
+ bool found;
+
+ if (tabhash == NULL)
+ return false;
+
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert_extended(tabhash, (void *) &(stat->t_id),
+   &found, nowait);
+
+ /* failed to acquire lock */
+ if (tabentry == NULL)
+ return false;
+
  if (!found)
- reset_dbentry_counters(result);
+ {
+ /*
+ * If it's a new table entry, initialize counters to the values we
+ * just got.
+ */
+ tabentry->numscans = stat->t_counts.t_numscans;
+ tabentry->tuples_returned = stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched = stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted = stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated = stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted = stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated = stat->t_counts.t_tuples_hot_updated;
+ tabentry->n_live_tuples = stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples = stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze = stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched = stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit = stat->t_counts.t_blocks_hit;
+
+ tabentry->vacuum_timestamp = 0;
+ tabentry->vacuum_count = 0;
+ tabentry->autovac_vacuum_timestamp = 0;
+ tabentry->autovac_vacuum_count = 0;
+ tabentry->analyze_timestamp = 0;
+ tabentry->analyze_count = 0;
+ tabentry->autovac_analyze_timestamp = 0;
+ tabentry->autovac_analyze_count = 0;
+ }
+ else
+ {
+ /*
+ * Otherwise add the values to the existing entry.
+ */
+ tabentry->numscans += stat->t_counts.t_numscans;
+ tabentry->tuples_returned += stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched += stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted += stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated += stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted += stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated += stat->t_counts.t_tuples_hot_updated;
+ /* If table was truncated, first reset the live/dead counters */
+ if (stat->t_counts.t_truncated)
+ {
+ tabentry->n_live_tuples = 0;
+ tabentry->n_dead_tuples = 0;
+ }
+ tabentry->n_live_tuples += stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples += stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze += stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched += stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit += stat->t_counts.t_blocks_hit;
+ }
+
+ /* Clamp n_live_tuples in case of negative delta_live_tuples */
+ tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+ /* Likewise for n_dead_tuples */
+ tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+
+ dshash_release_lock(tabhash, tabentry);
+
+ return true;
+}
+
+static void
+pgstat_update_dbentry(PgStat_StatDBEntry *dbentry, PgStat_TableStatus *stat)
+{
+ /*
+ * Add per-table stats to the per-database entry, too.
+ */
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->n_tuples_returned += stat->t_counts.t_tuples_returned;
+ dbentry->n_tuples_fetched += stat->t_counts.t_tuples_fetched;
+ dbentry->n_tuples_inserted += stat->t_counts.t_tuples_inserted;
+ dbentry->n_tuples_updated += stat->t_counts.t_tuples_updated;
+ dbentry->n_tuples_deleted += stat->t_counts.t_tuples_deleted;
+ dbentry->n_blocks_fetched += stat->t_counts.t_blocks_fetched;
+ dbentry->n_blocks_hit += stat->t_counts.t_blocks_hit;
+ LWLockRelease(&dbentry->lock);
+}
+
+/*
+ * Lookup shared stats hash table for the specified database. Returns NULL
+ * when PGSTAT_NOWAIT and required lock cannot be acquired.
+ */
+static PgStat_StatDBEntry *
+pgstat_get_db_entry(Oid databaseid, int op, PgStat_TableLookupResult *status)
+{
+ PgStat_StatDBEntry *result;
+ bool nowait = ((op & PGSTAT_NOWAIT) != 0);
+ bool lock_acquired = true;
+ bool found = true;
+
+ if (!IsUnderPostmaster || !pgStatDBHash)
+ return NULL;
+
+ /* Lookup or create the hash table entry for this database */
+ if (op & PGSTAT_EXCLUSIVE)
+ {
+ result = (PgStat_StatDBEntry *)
+ dshash_find_or_insert_extended(pgStatDBHash, &databaseid,
+   &found, nowait);
+ if (result == NULL)
+ lock_acquired = false;
+ else if (!found)
+ {
+ /*
+ * If not found, initialize the new one.  This creates an empty
+ * tables dshash, too.
+ */
+ init_dbentry(result);
+ reset_dbentry_counters(result);
+ }
+ }
+ else
+ {
+ result = (PgStat_StatDBEntry *)
+ dshash_find_extended(pgStatDBHash, &databaseid, true, nowait,
+ nowait ? &lock_acquired : NULL);
+ if (result == NULL)
+ found = false;
+ }
+
+ /* Set return status if requested */
+ if (status)
+ {
+ if (!lock_acquired)
+ {
+ Assert(nowait);
+ *status = LOCK_FAILED;
+ }
+ else if (!found)
+ *status = NOT_FOUND;
+ else
+ *status = FOUND;
+ }
 
  return result;
 }
 
-
 /*
  * Lookup the hash table entry for the specified table. If no hash
  * table entry exists, initialize it, if the create parameter is true.
  * Else, return NULL.
  */
 static PgStat_StatTabEntry *
-pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create)
 {
  PgStat_StatTabEntry *result;
  bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
 
  /* Lookup or create the hash table entry for this table */
- result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
- &tableoid,
- action, &found);
+ if (create)
+ result = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(table, &tableoid, &found);
+ else
+ result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false);
 
  if (!create && !found)
  return NULL;
@@ -4842,1702 +5963,6 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
  return result;
 }
 
-
-/* ----------
- * pgstat_write_statsfiles() -
- * Write the global statistics file, as well as requested DB files.
- *
- * 'permanent' specifies writing to the permanent files not temporary ones.
- * When true (happens only when the collector is shutting down), also remove
- * the temporary files so that backends starting up under a new postmaster
- * can't read old data before the new collector is ready.
- *
- * When 'allDbs' is false, only the requested databases (listed in
- * pending_write_requests) will be written; otherwise, all databases
- * will be written.
- * ----------
- */
-static void
-pgstat_write_statsfiles(bool permanent, bool allDbs)
-{
- HASH_SEQ_STATUS hstat;
- PgStat_StatDBEntry *dbentry;
- FILE   *fpout;
- int32 format_id;
- const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
- int rc;
-
- elog(DEBUG2, "writing stats file \"%s\"", statfile);
-
- /*
- * Open the statistics temp file to write out the current values.
- */
- fpout = AllocateFile(tmpfile, PG_BINARY_W);
- if (fpout == NULL)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not open temporary statistics file \"%s\": %m",
- tmpfile)));
- return;
- }
-
- /*
- * Set the timestamp of the stats file.
- */
- globalStats.stats_timestamp = GetCurrentTimestamp();
-
- /*
- * Write the file header --- currently just a format ID.
- */
- format_id = PGSTAT_FILE_FORMAT_ID;
- rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Write global stats struct
- */
- rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Write archiver stats struct
- */
- rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Walk through the database table.
- */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
- {
- /*
- * Write out the table and function stats for this DB into the
- * appropriate per-DB stat file, if required.
- */
- if (allDbs || pgstat_db_requested(dbentry->databaseid))
- {
- /* Make DB's timestamp consistent with the global stats */
- dbentry->stats_timestamp = globalStats.stats_timestamp;
-
- pgstat_write_db_statsfile(dbentry, permanent);
- }
-
- /*
- * Write out the DB entry. We don't write the tables or functions
- * pointers, since they're of no use to any other process.
- */
- fputc('D', fpout);
- rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * No more output to be done. Close the temp file and replace the old
- * pgstat.stat with it.  The ferror() check replaces testing for error
- * after each individual fputc or fwrite above.
- */
- fputc('E', fpout);
-
- if (ferror(fpout))
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not write temporary statistics file \"%s\": %m",
- tmpfile)));
- FreeFile(fpout);
- unlink(tmpfile);
- }
- else if (FreeFile(fpout) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not close temporary statistics file \"%s\": %m",
- tmpfile)));
- unlink(tmpfile);
- }
- else if (rename(tmpfile, statfile) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
- tmpfile, statfile)));
- unlink(tmpfile);
- }
-
- if (permanent)
- unlink(pgstat_stat_filename);
-
- /*
- * Now throw away the list of requests.  Note that requests sent after we
- * started the write are still waiting on the network socket.
- */
- list_free(pending_write_requests);
- pending_write_requests = NIL;
-}
-
-/*
- * return the filename for a DB stat file; filename is the output buffer,
- * of length len.
- */
-static void
-get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
- char *filename, int len)
-{
- int printed;
-
- /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
- printed = snprintf(filename, len, "%s/db_%u.%s",
-   permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
-   pgstat_stat_directory,
-   databaseid,
-   tempname ? "tmp" : "stat");
- if (printed >= len)
- elog(ERROR, "overlength pgstat path");
-}
-
-/* ----------
- * pgstat_write_db_statsfile() -
- * Write the stat file for a single database.
- *
- * If writing to the permanent file (happens when the collector is
- * shutting down only), remove the temporary file so that backends
- * starting up under a new postmaster can't read the old data before
- * the new collector is ready.
- * ----------
- */
-static void
-pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
-{
- HASH_SEQ_STATUS tstat;
- HASH_SEQ_STATUS fstat;
- PgStat_StatTabEntry *tabentry;
- PgStat_StatFuncEntry *funcentry;
- FILE   *fpout;
- int32 format_id;
- Oid dbid = dbentry->databaseid;
- int rc;
- char tmpfile[MAXPGPATH];
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
- get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "writing stats file \"%s\"", statfile);
-
- /*
- * Open the statistics temp file to write out the current values.
- */
- fpout = AllocateFile(tmpfile, PG_BINARY_W);
- if (fpout == NULL)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not open temporary statistics file \"%s\": %m",
- tmpfile)));
- return;
- }
-
- /*
- * Write the file header --- currently just a format ID.
- */
- format_id = PGSTAT_FILE_FORMAT_ID;
- rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Walk through the database's access stats per table.
- */
- hash_seq_init(&tstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
- {
- fputc('T', fpout);
- rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * Walk through the database's function stats table.
- */
- hash_seq_init(&fstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
- {
- fputc('F', fpout);
- rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * No more output to be done. Close the temp file and replace the old
- * pgstat.stat with it.  The ferror() check replaces testing for error
- * after each individual fputc or fwrite above.
- */
- fputc('E', fpout);
-
- if (ferror(fpout))
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not write temporary statistics file \"%s\": %m",
- tmpfile)));
- FreeFile(fpout);
- unlink(tmpfile);
- }
- else if (FreeFile(fpout) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not close temporary statistics file \"%s\": %m",
- tmpfile)));
- unlink(tmpfile);
- }
- else if (rename(tmpfile, statfile) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
- tmpfile, statfile)));
- unlink(tmpfile);
- }
-
- if (permanent)
- {
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
- unlink(statfile);
- }
-}
-
-/* ----------
- * pgstat_read_statsfiles() -
- *
- * Reads in some existing statistics collector files and returns the
- * databases hash table that is the top level of the data.
- *
- * If 'onlydb' is not InvalidOid, it means we only want data for that DB
- * plus the shared catalogs ("DB 0").  We'll still populate the DB hash
- * table for all databases, but we don't bother even creating table/function
- * hash tables for other databases.
- *
- * 'permanent' specifies reading from the permanent files not temporary ones.
- * When true (happens only when the collector is starting up), remove the
- * files after reading; the in-memory status is now authoritative, and the
- * files would be out of date in case somebody else reads them.
- *
- * If a 'deep' read is requested, table/function stats are read, otherwise
- * the table/function hash tables remain empty.
- * ----------
- */
-static HTAB *
-pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatDBEntry dbbuf;
- HASHCTL hash_ctl;
- HTAB   *dbhash;
- FILE   *fpin;
- int32 format_id;
- bool found;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
- /*
- * The tables will live in pgStatLocalContext.
- */
- pgstat_setup_memcxt();
-
- /*
- * Create the DB hashtable
- */
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- /*
- * Clear out global and archiver statistics so they start from zero in
- * case we can't load an existing statsfile.
- */
- memset(&globalStats, 0, sizeof(globalStats));
- memset(&archiverStats, 0, sizeof(archiverStats));
-
- /*
- * Set the current timestamp (will be kept only in case we can't load an
- * existing statsfile).
- */
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
-
- /*
- * Try to open the stats file. If it doesn't exist, the backends simply
- * return zero for anything and the collector simply starts from scratch
- * with empty counters.
- *
- * ENOENT is a possibility if the stats collector is not running or has
- * not yet written the stats file the first time.  Any other failure
- * condition is suspicious.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return dbhash;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- goto done;
- }
-
- /*
- * Read global stats struct
- */
- if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&globalStats, 0, sizeof(globalStats));
- goto done;
- }
-
- /*
- * In the collector, disregard the timestamp we read from the permanent
- * stats file; we should be willing to write a temp stats file immediately
- * upon the first request from any backend.  This only matters if the old
- * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
- * an unusual scenario.
- */
- if (pgStatRunningInCollector)
- globalStats.stats_timestamp = 0;
-
- /*
- * Read archiver stats struct
- */
- if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&archiverStats, 0, sizeof(archiverStats));
- goto done;
- }
-
- /*
- * We found an existing collector stats file. Read it and put all the
- * hashtable entries into place.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'D' A PgStat_StatDBEntry struct describing a database
- * follows.
- */
- case 'D':
- if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Add to the DB hash
- */
- dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
- (void *) &dbbuf.databaseid,
- HASH_ENTER,
- &found);
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * In the collector, disregard the timestamp we read from the
- * permanent stats file; we should be willing to write a temp
- * stats file immediately upon the first request from any
- * backend.
- */
- if (pgStatRunningInCollector)
- dbentry->stats_timestamp = 0;
-
- /*
- * Don't create tables/functions hashtables for uninteresting
- * databases.
- */
- if (onlydb != InvalidOid)
- {
- if (dbbuf.databaseid != onlydb &&
- dbbuf.databaseid != InvalidOid)
- break;
- }
-
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- /*
- * If requested, read the data from the database-specific
- * file.  Otherwise we just leave the hashtables empty.
- */
- if (deep)
- pgstat_read_db_statsfile(dbentry->databaseid,
- dbentry->tables,
- dbentry->functions,
- permanent);
-
- break;
-
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
-
- /* If requested to read the permanent file, also get rid of it. */
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
-
- return dbhash;
-}
-
-
-/* ----------
- * pgstat_read_db_statsfile() -
- *
- * Reads in the existing statistics collector file for the given database,
- * filling the passed-in tables and functions hash tables.
- *
- * As in pgstat_read_statsfiles, if the permanent file is requested, it is
- * removed after reading.
- *
- * Note: this code has the ability to skip storing per-table or per-function
- * data, if NULL is passed for the corresponding hashtable.  That's not used
- * at the moment though.
- * ----------
- */
-static void
-pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
- bool permanent)
-{
- PgStat_StatTabEntry *tabentry;
- PgStat_StatTabEntry tabbuf;
- PgStat_StatFuncEntry funcbuf;
- PgStat_StatFuncEntry *funcentry;
- FILE   *fpin;
- int32 format_id;
- bool found;
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
-
- /*
- * Try to open the stats file. If it doesn't exist, the backends simply
- * return zero for anything and the collector simply starts from scratch
- * with empty counters.
- *
- * ENOENT is a possibility if the stats collector is not running or has
- * not yet written the stats file the first time.  Any other failure
- * condition is suspicious.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- goto done;
- }
-
- /*
- * We found an existing collector stats file. Read it and put all the
- * hashtable entries into place.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'T' A PgStat_StatTabEntry follows.
- */
- case 'T':
- if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
-  fpin) != sizeof(PgStat_StatTabEntry))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Skip if table data not wanted.
- */
- if (tabhash == NULL)
- break;
-
- tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-   (void *) &tabbuf.tableid,
-   HASH_ENTER, &found);
-
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(tabentry, &tabbuf, sizeof(tabbuf));
- break;
-
- /*
- * 'F' A PgStat_StatFuncEntry follows.
- */
- case 'F':
- if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
-  fpin) != sizeof(PgStat_StatFuncEntry))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Skip if function data not wanted.
- */
- if (funchash == NULL)
- break;
-
- funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
- (void *) &funcbuf.functionid,
- HASH_ENTER, &found);
-
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(funcentry, &funcbuf, sizeof(funcbuf));
- break;
-
- /*
- * 'E' The EOF marker of a complete stats file.
- */
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
-
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
-}
-
-/* ----------
- * pgstat_read_db_statsfile_timestamp() -
- *
- * Attempt to determine the timestamp of the last db statfile write.
- * Returns true if successful; the timestamp is stored in *ts.
- *
- * This needs to be careful about handling databases for which no stats file
- * exists, such as databases without a stat entry or those not yet written:
- *
- * - if there's a database entry in the global file, return the corresponding
- * stats_timestamp value.
- *
- * - if there's no db stat entry (e.g. for a new or inactive database),
- * there's no stats_timestamp value, but also nothing to write so we return
- * the timestamp of the global statfile.
- * ----------
- */
-static bool
-pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
-   TimestampTz *ts)
-{
- PgStat_StatDBEntry dbentry;
- PgStat_GlobalStats myGlobalStats;
- PgStat_ArchiverStats myArchiverStats;
- FILE   *fpin;
- int32 format_id;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
- /*
- * Try to open the stats file.  As above, anything but ENOENT is worthy of
- * complaining about.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return false;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read global stats struct
- */
- if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-  fpin) != sizeof(myGlobalStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read archiver stats struct
- */
- if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-  fpin) != sizeof(myArchiverStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /* By default, we're going to return the timestamp of the global file. */
- *ts = myGlobalStats.stats_timestamp;
-
- /*
- * We found an existing collector stats file.  Read it and look for a
- * record for the requested database.  If found, use its timestamp.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'D' A PgStat_StatDBEntry struct describing a database
- * follows.
- */
- case 'D':
- if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * If this is the DB we're looking for, save its timestamp and
- * we're done.
- */
- if (dbentry.databaseid == databaseid)
- {
- *ts = dbentry.stats_timestamp;
- goto done;
- }
-
- break;
-
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
- return true;
-}
-
-/*
- * If not already done, read the statistics collector stats file into
- * some hash tables.  The results will be kept until pgstat_clear_snapshot()
- * is called (typically, at end of transaction).
- */
-static void
-backend_read_statsfile(void)
-{
- TimestampTz min_ts = 0;
- TimestampTz ref_ts = 0;
- Oid inquiry_db;
- int count;
-
- /* already read it? */
- if (pgStatDBHash)
- return;
- Assert(!pgStatRunningInCollector);
-
- /*
- * In a normal backend, we check staleness of the data for our own DB, and
- * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
- * check staleness of the shared-catalog data, and send InvalidOid in
- * inquiry messages so as not to force writing unnecessary data.
- */
- if (IsAutoVacuumLauncherProcess())
- inquiry_db = InvalidOid;
- else
- inquiry_db = MyDatabaseId;
-
- /*
- * Loop until fresh enough stats file is available or we ran out of time.
- * The stats inquiry message is sent repeatedly in case collector drops
- * it; but not every single time, as that just swamps the collector.
- */
- for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
- {
- bool ok;
- TimestampTz file_ts = 0;
- TimestampTz cur_ts;
-
- CHECK_FOR_INTERRUPTS();
-
- ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
-
- cur_ts = GetCurrentTimestamp();
- /* Calculate min acceptable timestamp, if we didn't already */
- if (count == 0 || cur_ts < ref_ts)
- {
- /*
- * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
- * msec before now.  This indirectly ensures that the collector
- * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
- * an autovacuum worker, however, we want a lower delay to avoid
- * using stale data, so we use PGSTAT_RETRY_DELAY (since the
- * number of workers is low, this shouldn't be a problem).
- *
- * We don't recompute min_ts after sleeping, except in the
- * unlikely case that cur_ts went backwards.  So we might end up
- * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In
- * practice that shouldn't happen, though, as long as the sleep
- * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
- * tell the collector that our cutoff time is less than what we'd
- * actually accept.
- */
- ref_ts = cur_ts;
- if (IsAutoVacuumWorkerProcess())
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_RETRY_DELAY);
- else
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_STAT_INTERVAL);
- }
-
- /*
- * If the file timestamp is actually newer than cur_ts, we must have
- * had a clock glitch (system time went backwards) or there is clock
- * skew between our processor and the stats collector's processor.
- * Accept the file, but send an inquiry message anyway to make
- * pgstat_recv_inquiry do a sanity check on the collector's time.
- */
- if (ok && file_ts > cur_ts)
- {
- /*
- * A small amount of clock skew between processors isn't terribly
- * surprising, but a large difference is worth logging.  We
- * arbitrarily define "large" as 1000 msec.
- */
- if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
- {
- char   *filetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- filetime = pstrdup(timestamptz_to_str(file_ts));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG, "stats collector's time %s is later than backend local time %s",
- filetime, mytime);
- pfree(filetime);
- pfree(mytime);
- }
-
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
- break;
- }
-
- /* Normal acceptance case: file is not older than cutoff time */
- if (ok && file_ts >= min_ts)
- break;
-
- /* Not there or too old, so kick the collector and wait a bit */
- if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-
- pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
- }
-
- if (count >= PGSTAT_POLL_LOOP_COUNT)
- ereport(LOG,
- (errmsg("using stale statistics instead of current ones "
- "because stats collector is not responding")));
-
- /*
- * Autovacuum launcher wants stats about all databases, but a shallow read
- * is sufficient.  Regular backends want a deep read for just the tables
- * they can see (MyDatabaseId + shared catalogs).
- */
- if (IsAutoVacuumLauncherProcess())
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
- else
- pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
-}
-
-
-/* ----------
- * pgstat_setup_memcxt() -
- *
- * Create pgStatLocalContext, if not already done.
- * ----------
- */
-static void
-pgstat_setup_memcxt(void)
-{
- if (!pgStatLocalContext)
- pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
-   "Statistics snapshot",
-   ALLOCSET_SMALL_SIZES);
-}
-
-
-/* ----------
- * pgstat_clear_snapshot() -
- *
- * Discard any data collected in the current transaction.  Any subsequent
- * request will cause new snapshots to be read.
- *
- * This is also invoked during transaction commit or abort to discard
- * the no-longer-wanted snapshot.
- * ----------
- */
-void
-pgstat_clear_snapshot(void)
-{
- /* Release memory, if any was allocated */
- if (pgStatLocalContext)
- MemoryContextDelete(pgStatLocalContext);
-
- /* Reset variables */
- pgStatLocalContext = NULL;
- pgStatDBHash = NULL;
- localBackendStatusTable = NULL;
- localNumBackends = 0;
-}
-
-
-/* ----------
- * pgstat_recv_inquiry() -
- *
- * Process stat inquiry requests.
- * ----------
- */
-static void
-pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
-
- /*
- * If there's already a write request for this DB, there's nothing to do.
- *
- * Note that if a request is found, we return early and skip the below
- * check for clock skew.  This is okay, since the only way for a DB
- * request to be present in the list is that we have been here since the
- * last write round.  It seems sufficient to check for clock skew once per
- * write round.
- */
- if (list_member_oid(pending_write_requests, msg->databaseid))
- return;
-
- /*
- * Check to see if we last wrote this database at a time >= the requested
- * cutoff time.  If so, this is a stale request that was generated before
- * we updated the DB file, and we don't need to do so again.
- *
- * If the requestor's local clock time is older than stats_timestamp, we
- * should suspect a clock glitch, ie system time going backwards; though
- * the more likely explanation is just delayed message receipt.  It is
- * worth expending a GetCurrentTimestamp call to be sure, since a large
- * retreat in the system clock reading could otherwise cause us to neglect
- * to update the stats file for a long time.
- */
- dbentry = pgstat_get_db_entry(msg->databaseid, false);
- if (dbentry == NULL)
- {
- /*
- * We have no data for this DB.  Enter a write request anyway so that
- * the global stats will get updated.  This is needed to prevent
- * backend_read_statsfile from waiting for data that we cannot supply,
- * in the case of a new DB that nobody has yet reported any stats for.
- * See the behavior of pgstat_read_db_statsfile_timestamp.
- */
- }
- else if (msg->clock_time < dbentry->stats_timestamp)
- {
- TimestampTz cur_ts = GetCurrentTimestamp();
-
- if (cur_ts < dbentry->stats_timestamp)
- {
- /*
- * Sure enough, time went backwards.  Force a new stats file write
- * to get back in sync; but first, log a complaint.
- */
- char   *writetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG,
- "stats_timestamp %s is later than collector's time %s for database %u",
- writetime, mytime, dbentry->databaseid);
- pfree(writetime);
- pfree(mytime);
- }
- else
- {
- /*
- * Nope, it's just an old request.  Assuming msg's clock_time is
- * >= its cutoff_time, it must be stale, so we can ignore it.
- */
- return;
- }
- }
- else if (msg->cutoff_time <= dbentry->stats_timestamp)
- {
- /* Stale request, ignore it */
- return;
- }
-
- /*
- * We need to write this DB, so create a request.
- */
- pending_write_requests = lappend_oid(pending_write_requests,
- msg->databaseid);
-}
-
-
-/* ----------
- * pgstat_recv_tabstat() -
- *
- * Count what the backend has done.
- * ----------
- */
-static void
-pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- /*
- * Update database-wide stats.
- */
- dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
- dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
- dbentry->n_block_read_time += msg->m_block_read_time;
- dbentry->n_block_write_time += msg->m_block_write_time;
-
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
-
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &(tabmsg->t_id),
-   HASH_ENTER, &found);
-
- if (!found)
- {
- /*
- * If it's a new table entry, initialize counters to the values we
- * just got.
- */
- tabentry->numscans = tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated;
- tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit;
-
- tabentry->vacuum_timestamp = 0;
- tabentry->vacuum_count = 0;
- tabentry->autovac_vacuum_timestamp = 0;
- tabentry->autovac_vacuum_count = 0;
- tabentry->analyze_timestamp = 0;
- tabentry->analyze_count = 0;
- tabentry->autovac_analyze_timestamp = 0;
- tabentry->autovac_analyze_count = 0;
- }
- else
- {
- /*
- * Otherwise add the values to the existing entry.
- */
- tabentry->numscans += tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated;
- /* If table was truncated, first reset the live/dead counters */
- if (tabmsg->t_counts.t_truncated)
- {
- tabentry->n_live_tuples = 0;
- tabentry->n_dead_tuples = 0;
- }
- tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit;
- }
-
- /* Clamp n_live_tuples in case of negative delta_live_tuples */
- tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
- /* Likewise for n_dead_tuples */
- tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
-
- /*
- * Add per-table stats to the per-database entry, too.
- */
- dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned;
- dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated;
- dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
- }
-}
-
-
-/* ----------
- * pgstat_recv_tabpurge() -
- *
- * Arrange for dead table removal.
- * ----------
- */
-static void
-pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- int i;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->tables)
- return;
-
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->tables,
-   (void *) &(msg->m_tableid[i]),
-   HASH_REMOVE, NULL);
- }
-}
-
-
-/* ----------
- * pgstat_recv_dropdb() -
- *
- * Arrange for dead database removal
- * ----------
- */
-static void
-pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
-{
- Oid dbid = msg->m_databaseid;
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.
- */
- dbentry = pgstat_get_db_entry(dbid, false);
-
- /*
- * If found, remove it (along with the db statfile).
- */
- if (dbentry)
- {
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing stats file \"%s\"", statfile);
- unlink(statfile);
-
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- if (hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_REMOVE, NULL) == NULL)
- ereport(ERROR,
- (errmsg("database hash table corrupted during cleanup --- abort")));
- }
-}
-
-
-/* ----------
- * pgstat_recv_resetcounter() -
- *
- * Reset the statistics for the specified database.
- * ----------
- */
-static void
-pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.  Nothing to do if not there.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /*
- * We simply throw away all the database's table entries by recreating a
- * new hash table for them.
- */
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * Reset database-level stats, too.  This creates empty hash tables for
- * tables and functions.
- */
- reset_dbentry_counters(dbentry);
-}
-
-/* ----------
- * pgstat_recv_resetsharedcounter() -
- *
- * Reset some shared statistics of the cluster.
- * ----------
- */
-static void
-pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
-{
- if (msg->m_resettarget == RESET_BGWRITER)
- {
- /* Reset the global background writer statistics for the cluster. */
- memset(&globalStats, 0, sizeof(globalStats));
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
- else if (msg->m_resettarget == RESET_ARCHIVER)
- {
- /* Reset the archiver statistics for the cluster. */
- memset(&archiverStats, 0, sizeof(archiverStats));
- archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
-
- /*
- * Presumably the sender of this message validated the target, don't
- * complain here if it's not valid
- */
-}
-
-/* ----------
- * pgstat_recv_resetsinglecounter() -
- *
- * Reset a statistics for a single object
- * ----------
- */
-static void
-pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /* Set the reset timestamp for the whole database */
- dbentry->stat_reset_timestamp = GetCurrentTimestamp();
-
- /* Remove object if it exists, ignore it if not */
- if (msg->m_resettype == RESET_TABLE)
- (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
- else if (msg->m_resettype == RESET_FUNCTION)
- (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
-}
-
-/* ----------
- * pgstat_recv_autovac() -
- *
- * Process an autovacuum signalling message.
- * ----------
- */
-static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Store the last autovacuum time in the database's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->last_autovac_time = msg->m_start_time;
-}
-
-/* ----------
- * pgstat_recv_vacuum() -
- *
- * Process a VACUUM message.
- * ----------
- */
-static void
-pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- if (msg->m_autovacuum)
- {
- tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
- tabentry->autovac_vacuum_count++;
- }
- else
- {
- tabentry->vacuum_timestamp = msg->m_vacuumtime;
- tabentry->vacuum_count++;
- }
-}
-
-/* ----------
- * pgstat_recv_analyze() -
- *
- * Process an ANALYZE message.
- * ----------
- */
-static void
-pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- /*
- * If commanded, reset changes_since_analyze to zero.  This forgets any
- * changes that were committed while the ANALYZE was in progress, but we
- * have no good way to estimate how many of those there were.
- */
- if (msg->m_resetcounter)
- tabentry->changes_since_analyze = 0;
-
- if (msg->m_autovacuum)
- {
- tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
- tabentry->autovac_analyze_count++;
- }
- else
- {
- tabentry->analyze_timestamp = msg->m_analyzetime;
- tabentry->analyze_count++;
- }
-}
-
-
-/* ----------
- * pgstat_recv_archiver() -
- *
- * Process a ARCHIVER message.
- * ----------
- */
-static void
-pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
-{
- if (msg->m_failed)
- {
- /* Failed archival attempt */
- ++archiverStats.failed_count;
- memcpy(archiverStats.last_failed_wal, msg->m_xlog,
-   sizeof(archiverStats.last_failed_wal));
- archiverStats.last_failed_timestamp = msg->m_timestamp;
- }
- else
- {
- /* Successful archival operation */
- ++archiverStats.archived_count;
- memcpy(archiverStats.last_archived_wal, msg->m_xlog,
-   sizeof(archiverStats.last_archived_wal));
- archiverStats.last_archived_timestamp = msg->m_timestamp;
- }
-}
-
-/* ----------
- * pgstat_recv_bgwriter() -
- *
- * Process a BGWRITER message.
- * ----------
- */
-static void
-pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
-{
- globalStats.timed_checkpoints += msg->m_timed_checkpoints;
- globalStats.requested_checkpoints += msg->m_requested_checkpoints;
- globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
- globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
- globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
- globalStats.buf_written_clean += msg->m_buf_written_clean;
- globalStats.maxwritten_clean += msg->m_maxwritten_clean;
- globalStats.buf_written_backend += msg->m_buf_written_backend;
- globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
- globalStats.buf_alloc += msg->m_buf_alloc;
-}
-
-/* ----------
- * pgstat_recv_recoveryconflict() -
- *
- * Process a RECOVERYCONFLICT message.
- * ----------
- */
-static void
-pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- switch (msg->m_reason)
- {
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
-
- /*
- * Since we drop the information about the database as soon as it
- * replicates, there is no point in counting these conflicts.
- */
- break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
- dbentry->n_conflict_tablespace++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
- dbentry->n_conflict_lock++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
- dbentry->n_conflict_snapshot++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
- dbentry->n_conflict_bufferpin++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
- dbentry->n_conflict_startup_deadlock++;
- break;
- }
-}
-
-/* ----------
- * pgstat_recv_deadlock() -
- *
- * Process a DEADLOCK message.
- * ----------
- */
-static void
-pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_deadlocks++;
-}
-
-/* ----------
- * pgstat_recv_checksum_failure() -
- *
- * Process a CHECKSUMFAILURE message.
- * ----------
- */
-static void
-pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_checksum_failures += msg->m_failurecount;
- dbentry->last_checksum_failure = msg->m_failure_time;
-}
-
-/* ----------
- * pgstat_recv_tempfile() -
- *
- * Process a TEMPFILE message.
- * ----------
- */
-static void
-pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_temp_bytes += msg->m_filesize;
- dbentry->n_temp_files += 1;
-}
-
-/* ----------
- * pgstat_recv_funcstat() -
- *
- * Count what the backend has done.
- * ----------
- */
-static void
-pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
-{
- PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
- PgStat_StatDBEntry *dbentry;
- PgStat_StatFuncEntry *funcentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++, funcmsg++)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &(funcmsg->f_id),
- HASH_ENTER, &found);
-
- if (!found)
- {
- /*
- * If it's a new function entry, initialize counters to the values
- * we just got.
- */
- funcentry->f_numcalls = funcmsg->f_numcalls;
- funcentry->f_total_time = funcmsg->f_total_time;
- funcentry->f_self_time = funcmsg->f_self_time;
- }
- else
- {
- /*
- * Otherwise add the values to the existing entry.
- */
- funcentry->f_numcalls += funcmsg->f_numcalls;
- funcentry->f_total_time += funcmsg->f_total_time;
- funcentry->f_self_time += funcmsg->f_self_time;
- }
- }
-}
-
-/* ----------
- * pgstat_recv_funcpurge() -
- *
- * Arrange for dead function removal.
- * ----------
- */
-static void
-pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- int i;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->functions)
- return;
-
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->functions,
-   (void *) &(msg->m_functionid[i]),
-   HASH_REMOVE, NULL);
- }
-}
-
-/* ----------
- * pgstat_write_statsfile_needed() -
- *
- * Do we need to write out any stats files?
- * ----------
- */
-static bool
-pgstat_write_statsfile_needed(void)
-{
- if (pending_write_requests != NIL)
- return true;
-
- /* Everything was written recently */
- return false;
-}
-
-/* ----------
- * pgstat_db_requested() -
- *
- * Checks whether stats for a particular DB need to be written to a file.
- * ----------
- */
-static bool
-pgstat_db_requested(Oid databaseid)
-{
- /*
- * If any requests are outstanding at all, we should write the stats for
- * shared catalogs (the "database" with OID 0).  This ensures that
- * backends will see up-to-date stats for shared catalogs, even though
- * they send inquiry messages mentioning only their own DB.
- */
- if (databaseid == InvalidOid && pending_write_requests != NIL)
- return true;
-
- /* Search to see if there's an open request to write this database. */
- if (list_member_oid(pending_write_requests, databaseid))
- return true;
-
- return false;
-}
-
 /*
  * Convert a potentially unsafely truncated activity string (see
  * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 582434252f..bb438df2fc 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -255,7 +255,6 @@ static pid_t StartupPID = 0,
  WalReceiverPID = 0,
  AutoVacPID = 0,
  PgArchPID = 0,
- PgStatPID = 0,
  SysLoggerPID = 0;
 
 /* Startup process's status */
@@ -503,7 +502,6 @@ typedef struct
  PGPROC   *AuxiliaryProcs;
  PGPROC   *PreparedXactProcs;
  PMSignalData *PMSignalState;
- InheritableSocket pgStatSock;
  pid_t PostmasterPid;
  TimestampTz PgStartTime;
  TimestampTz PgReloadTime;
@@ -1249,66 +1247,6 @@ PostmasterMain(int argc, char *argv[])
  */
  RemovePgTempFiles();
 
- /*
- * Forcibly remove the files signaling a standby promotion request.
- * Otherwise, the existence of those files triggers a promotion too early,
- * whether a user wants that or not.
- *
- * This removal of files is usually unnecessary because they can exist
- * only during a few moments during a standby promotion. However there is
- * a race condition: if pg_ctl promote is executed and creates the files
- * during a promotion, the files can stay around even after the server is
- * brought up to new master. Then, if new standby starts by using the
- * backup taken from that master, the files can exist at the server
- * startup and should be removed in order to avoid an unexpected
- * promotion.
- *
- * Note that promotion signal files need to be removed before the startup
- * process is invoked. Because, after that, they can be used by
- * postmaster's SIGUSR1 signal handler.
- */
- RemovePromoteSignalFiles();
-
- /* Do the same for logrotate signal file */
- RemoveLogrotateSignalFiles();
-
- /* Remove any outdated file holding the current log filenames. */
- if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT)
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m",
- LOG_METAINFO_DATAFILE)));
-
- /*
- * If enabled, start up syslogger collection subprocess
- */
- SysLoggerPID = SysLogger_Start();
-
- /*
- * Reset whereToSendOutput from DestDebug (its starting state) to
- * DestNone. This stops ereport from sending log messages to stderr unless
- * Log_destination permits.  We don't do this until the postmaster is
- * fully launched, since startup failures may as well be reported to
- * stderr.
- *
- * If we are in fact disabling logging to stderr, first emit a log message
- * saying so, to provide a breadcrumb trail for users who may not remember
- * that their logging is configured to go somewhere else.
- */
- if (!(Log_destination & LOG_DESTINATION_STDERR))
- ereport(LOG,
- (errmsg("ending log output to stderr"),
- errhint("Future log output will go to log destination \"%s\".",
- Log_destination_string)));
-
- whereToSendOutput = DestNone;
-
- /*
- * Initialize stats collection subsystem (this does NOT start the
- * collector process!)
- */
- pgstat_init();
-
  /*
  * Initialize the autovacuum subsystem (again, no process start yet)
  */
@@ -1757,11 +1695,6 @@ ServerLoop(void)
  start_autovac_launcher = false; /* signal processed */
  }
 
- /* If we have lost the stats collector, try to start a new one */
- if (PgStatPID == 0 &&
- (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
- PgStatPID = pgstat_start();
-
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
  PgArchPID = StartArchiver();
@@ -2646,8 +2579,6 @@ SIGHUP_handler(SIGNAL_ARGS)
  signal_child(PgArchPID, SIGHUP);
  if (SysLoggerPID != 0)
  signal_child(SysLoggerPID, SIGHUP);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGHUP);
 
  /* Reload authentication config files too */
  if (!load_hba())
@@ -2994,8 +2925,6 @@ reaper(SIGNAL_ARGS)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
  PgArchPID = StartArchiver();
- if (PgStatPID == 0)
- PgStatPID = pgstat_start();
 
  /* workers may be scheduled to start now */
  maybe_start_bgworkers();
@@ -3062,13 +2991,6 @@ reaper(SIGNAL_ARGS)
  SignalChildren(SIGUSR2);
 
  pmState = PM_SHUTDOWN_2;
-
- /*
- * We can also shut down the stats collector now; there's
- * nothing left for it to do.
- */
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  else
  {
@@ -3143,22 +3065,6 @@ reaper(SIGNAL_ARGS)
  continue;
  }
 
- /*
- * Was it the statistics collector?  If so, just try to start a new
- * one; no need to force reset of the rest of the system.  (If fail,
- * we'll try again in future cycles of the main loop.)
- */
- if (pid == PgStatPID)
- {
- PgStatPID = 0;
- if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("statistics collector process"),
- pid, exitstatus);
- if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
- PgStatPID = pgstat_start();
- continue;
- }
-
  /* Was it the system logger?  If so, try to start a new one */
  if (pid == SysLoggerPID)
  {
@@ -3617,22 +3523,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(PgArchPID, SIGQUIT);
  }
 
- /*
- * Force a power-cycle of the pgstat process too.  (This isn't absolutely
- * necessary, but it seems like a good idea for robustness, and it
- * simplifies the state-machine logic in the case where a shutdown request
- * arrives during crash processing.)
- */
- if (PgStatPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- "SIGQUIT",
- (int) PgStatPID)));
- signal_child(PgStatPID, SIGQUIT);
- allow_immediate_pgstat_restart();
- }
-
  /* We do NOT restart the syslogger */
 
  if (Shutdown != ImmediateShutdown)
@@ -3828,8 +3718,6 @@ PostmasterStateMachine(void)
  SignalChildren(SIGQUIT);
  if (PgArchPID != 0)
  signal_child(PgArchPID, SIGQUIT);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  }
  }
@@ -3864,8 +3752,7 @@ PostmasterStateMachine(void)
  * normal state transition leading up to PM_WAIT_DEAD_END, or during
  * FatalError processing.
  */
- if (dlist_is_empty(&BackendList) &&
- PgArchPID == 0 && PgStatPID == 0)
+ if (dlist_is_empty(&BackendList) && PgArchPID == 0)
  {
  /* These other guys should be dead already */
  Assert(StartupPID == 0);
@@ -4066,8 +3953,6 @@ TerminateChildren(int signal)
  signal_child(AutoVacPID, signal);
  if (PgArchPID != 0)
  signal_child(PgArchPID, signal);
- if (PgStatPID != 0)
- signal_child(PgStatPID, signal);
 }
 
 /*
@@ -5040,18 +4925,6 @@ SubPostmasterMain(int argc, char *argv[])
 
  StartBackgroundWorker();
  }
- if (strcmp(argv[1], "--forkarch") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgArchiverMain(argc, argv); /* does not return */
- }
- if (strcmp(argv[1], "--forkcol") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgstatCollectorMain(argc, argv); /* does not return */
- }
  if (strcmp(argv[1], "--forklog") == 0)
  {
  /* Do not want to attach to shared memory */
@@ -5164,12 +5037,6 @@ sigusr1_handler(SIGNAL_ARGS)
  if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
  pmState == PM_RECOVERY && Shutdown == NoShutdown)
  {
- /*
- * Likewise, start other special children as needed.
- */
- Assert(PgStatPID == 0);
- PgStatPID = pgstat_start();
-
  ereport(LOG,
  (errmsg("database system is ready to accept read only connections")));
 
@@ -6058,7 +5925,6 @@ extern slock_t *ShmemLock;
 extern slock_t *ProcStructLock;
 extern PGPROC *AuxiliaryProcs;
 extern PMSignalData *PMSignalState;
-extern pgsocket pgStatSock;
 extern pg_time_t first_syslogger_file_time;
 
 #ifndef WIN32
@@ -6114,8 +5980,6 @@ save_backend_variables(BackendParameters *param, Port *port,
  param->AuxiliaryProcs = AuxiliaryProcs;
  param->PreparedXactProcs = PreparedXactProcs;
  param->PMSignalState = PMSignalState;
- if (!write_inheritable_socket(&param->pgStatSock, pgStatSock, childPid))
- return false;
 
  param->PostmasterPid = PostmasterPid;
  param->PgStartTime = PgStartTime;
@@ -6350,7 +6214,6 @@ restore_backend_variables(BackendParameters *param, Port *port)
  AuxiliaryProcs = param->AuxiliaryProcs;
  PreparedXactProcs = param->PreparedXactProcs;
  PMSignalState = param->PMSignalState;
- read_inheritable_socket(&pgStatSock, &param->pgStatSock);
 
  PostmasterPid = param->PostmasterPid;
  PgStartTime = param->PgStartTime;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 885370698f..cfb3b91b11 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -147,6 +147,7 @@ CreateSharedMemoryAndSemaphores(void)
  size = add_size(size, BTreeShmemSize());
  size = add_size(size, SyncScanShmemSize());
  size = add_size(size, AsyncShmemSize());
+ size = add_size(size, StatsShmemSize());
 #ifdef EXEC_BACKEND
  size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -263,6 +264,7 @@ CreateSharedMemoryAndSemaphores(void)
  BTreeShmemInit();
  SyncScanShmemInit();
  AsyncShmemInit();
+ StatsShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index fb0bf44264..b423aaaf02 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -522,6 +522,7 @@ RegisterLWLockTranches(void)
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join");
  LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact");
+ LWLockRegisterTranche(LWTRANCHE_STATS, "activity stats");
 
  /* Register named tranches. */
  for (i = 0; i < NamedLWLockTrancheRequests; i++)
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index e8d8e6f828..bec27c3034 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3159,6 +3159,12 @@ ProcessInterrupts(void)
 
  if (ParallelMessagePending)
  HandleParallelMessages();
+
+ if (IdleStatsUpdateTimeoutPending)
+ {
+ IdleStatsUpdateTimeoutPending = false;
+ pgstat_report_stat(true);
+ }
 }
 
 
@@ -3733,6 +3739,7 @@ PostgresMain(int argc, char *argv[],
  sigjmp_buf local_sigjmp_buf;
  volatile bool send_ready_for_query = true;
  bool disable_idle_in_transaction_timeout = false;
+ bool disable_idle_stats_update_timeout = false;
 
  /* Initialize startup process environment if necessary. */
  if (!IsUnderPostmaster)
@@ -4173,9 +4180,17 @@ PostgresMain(int argc, char *argv[],
  }
  else
  {
- ProcessCompletedNotifies();
- pgstat_report_stat(false);
+ long stats_timeout;
 
+ ProcessCompletedNotifies();
+
+ stats_timeout = pgstat_report_stat(false);
+ if (stats_timeout > 0)
+ {
+ disable_idle_stats_update_timeout = true;
+ enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT,
+ stats_timeout);
+ }
  set_ps_display("idle", false);
  pgstat_report_activity(STATE_IDLE, NULL);
  }
@@ -4210,7 +4225,7 @@ PostgresMain(int argc, char *argv[],
  DoingCommandRead = false;
 
  /*
- * (5) turn off the idle-in-transaction timeout
+ * (5) turn off the idle-in-transaction timeout and stats update timeout
  */
  if (disable_idle_in_transaction_timeout)
  {
@@ -4218,6 +4233,12 @@ PostgresMain(int argc, char *argv[],
  disable_idle_in_transaction_timeout = false;
  }
 
+ if (disable_idle_stats_update_timeout)
+ {
+ disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false);
+ disable_idle_stats_update_timeout = false;
+ }
+
  /*
  * (6) check for any other interesting events that happened while we
  * slept.
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 3bf96de256..9c694f20c9 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -32,6 +32,7 @@ volatile sig_atomic_t QueryCancelPending = false;
 volatile sig_atomic_t ProcDiePending = false;
 volatile sig_atomic_t ClientConnectionLost = false;
 volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
+volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false;
 volatile sig_atomic_t ConfigReloadPending = false;
 volatile uint32 InterruptHoldoffCount = 0;
 volatile uint32 QueryCancelHoldoffCount = 0;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 29c5ec7b58..66c6a2b1e8 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -74,6 +74,7 @@ static void ShutdownPostgres(int code, Datum arg);
 static void StatementTimeoutHandler(void);
 static void LockTimeoutHandler(void);
 static void IdleInTransactionSessionTimeoutHandler(void);
+static void IdleStatsUpdateTimeoutHandler(void);
 static bool ThereIsAtLeastOneRole(void);
 static void process_startup_options(Port *port, bool am_superuser);
 static void process_settings(Oid databaseid, Oid roleid);
@@ -631,6 +632,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
  RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
  RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
  IdleInTransactionSessionTimeoutHandler);
+ RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT,
+ IdleStatsUpdateTimeoutHandler);
  }
 
  /*
@@ -1241,6 +1244,14 @@ IdleInTransactionSessionTimeoutHandler(void)
  SetLatch(MyLatch);
 }
 
+static void
+IdleStatsUpdateTimeoutHandler(void)
+{
+ IdleStatsUpdateTimeoutPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
 /*
  * Returns true if at least one role is defined in this database cluster.
  */
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b7d36b65dd..13be46c172 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 106;
+use Test::More tests => 105;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -123,7 +123,7 @@ is_deeply(
 
 # Contents of these directories should not be copied.
 foreach my $dirname (
- qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans)
+ qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans)
   )
 {
  is_deeply(
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1f4db67f3f..43250c3885 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -82,6 +82,7 @@ extern PGDLLIMPORT volatile sig_atomic_t InterruptPending;
 extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending;
 extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
 extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
+extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending;
 extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
 
 extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 65713abc2b..c9fbcead3f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1,7 +1,7 @@
 /* ----------
  * pgstat.h
  *
- * Definitions for the PostgreSQL statistics collector daemon.
+ * Definitions for the PostgreSQL statistics collector facility.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -13,10 +13,11 @@
 
 #include "datatype/timestamp.h"
 #include "libpq/pqcomm.h"
-#include "port/atomics.h"
+#include "lib/dshash.h"
 #include "portability/instr_time.h"
 #include "postmaster/pgarch.h"
 #include "storage/proc.h"
+#include "storage/lwlock.h"
 #include "utils/hsearch.h"
 #include "utils/relcache.h"
 
@@ -40,33 +41,6 @@ typedef enum TrackFunctionsLevel
  TRACK_FUNC_ALL
 } TrackFunctionsLevel;
 
-/* ----------
- * The types of backend -> collector messages
- * ----------
- */
-typedef enum StatMsgType
-{
- PGSTAT_MTYPE_DUMMY,
- PGSTAT_MTYPE_INQUIRY,
- PGSTAT_MTYPE_TABSTAT,
- PGSTAT_MTYPE_TABPURGE,
- PGSTAT_MTYPE_DROPDB,
- PGSTAT_MTYPE_RESETCOUNTER,
- PGSTAT_MTYPE_RESETSHAREDCOUNTER,
- PGSTAT_MTYPE_RESETSINGLECOUNTER,
- PGSTAT_MTYPE_AUTOVAC_START,
- PGSTAT_MTYPE_VACUUM,
- PGSTAT_MTYPE_ANALYZE,
- PGSTAT_MTYPE_ARCHIVER,
- PGSTAT_MTYPE_BGWRITER,
- PGSTAT_MTYPE_FUNCSTAT,
- PGSTAT_MTYPE_FUNCPURGE,
- PGSTAT_MTYPE_RECOVERYCONFLICT,
- PGSTAT_MTYPE_TEMPFILE,
- PGSTAT_MTYPE_DEADLOCK,
- PGSTAT_MTYPE_CHECKSUMFAILURE
-} StatMsgType;
-
 /* ----------
  * The data type used for counters.
  * ----------
@@ -77,9 +51,8 @@ typedef int64 PgStat_Counter;
  * PgStat_TableCounts The actual per-table counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
- * It is a component of PgStat_TableStatus (within-backend state) and
- * PgStat_TableEntry (the transmitted message format).
+ * it against zeroes to detect whether there are any counts to write.
+ * It is a component of PgStat_TableStatus (within-backend state).
  *
  * Note: for a table, tuples_returned is the number of tuples successfully
  * fetched by heap_getnext, while tuples_fetched is the number of tuples
@@ -115,13 +88,6 @@ typedef struct PgStat_TableCounts
  PgStat_Counter t_blocks_hit;
 } PgStat_TableCounts;
 
-/* Possible targets for resetting cluster-wide shared values */
-typedef enum PgStat_Shared_Reset_Target
-{
- RESET_ARCHIVER,
- RESET_BGWRITER
-} PgStat_Shared_Reset_Target;
-
 /* Possible object types for resetting single counters */
 typedef enum PgStat_Single_Reset_Type
 {
@@ -180,236 +146,12 @@ typedef struct PgStat_TableXactStatus
 } PgStat_TableXactStatus;
 
 
-/* ------------------------------------------------------------
- * Message formats follow
- * ------------------------------------------------------------
- */
-
-
 /* ----------
- * PgStat_MsgHdr The common message header
- * ----------
- */
-typedef struct PgStat_MsgHdr
-{
- StatMsgType m_type;
- int m_size;
-} PgStat_MsgHdr;
-
-/* ----------
- * Space available in a message.  This will keep the UDP packets below 1K,
- * which should fit unfragmented into the MTU of the loopback interface.
- * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most
- * platforms, but we're being conservative here.)
- * ----------
- */
-#define PGSTAT_MAX_MSG_SIZE 1000
-#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr))
-
-
-/* ----------
- * PgStat_MsgDummy A dummy message, ignored by the collector
- * ----------
- */
-typedef struct PgStat_MsgDummy
-{
- PgStat_MsgHdr m_hdr;
-} PgStat_MsgDummy;
-
-
-/* ----------
- * PgStat_MsgInquiry Sent by a backend to ask the collector
- * to write the stats file(s).
- *
- * Ordinarily, an inquiry message prompts writing of the global stats file,
- * the stats file for shared catalogs, and the stats file for the specified
- * database.  If databaseid is InvalidOid, only the first two are written.
- *
- * New file(s) will be written only if the existing file has a timestamp
- * older than the specified cutoff_time; this prevents duplicated effort
- * when multiple requests arrive at nearly the same time, assuming that
- * backends send requests with cutoff_times a little bit in the past.
- *
- * clock_time should be the requestor's current local time; the collector
- * uses this to check for the system clock going backward, but it has no
- * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
- * ----------
- */
-
-typedef struct PgStat_MsgInquiry
-{
- PgStat_MsgHdr m_hdr;
- TimestampTz clock_time; /* observed local clock time */
- TimestampTz cutoff_time; /* minimum acceptable file timestamp */
- Oid databaseid; /* requested DB (InvalidOid => shared only) */
-} PgStat_MsgInquiry;
-
-
-/* ----------
- * PgStat_TableEntry Per-table info in a MsgTabstat
- * ----------
- */
-typedef struct PgStat_TableEntry
-{
- Oid t_id;
- PgStat_TableCounts t_counts;
-} PgStat_TableEntry;
-
-/* ----------
- * PgStat_MsgTabstat Sent by the backend to report table
- * and buffer access statistics.
- * ----------
- */
-#define PGSTAT_NUM_TABENTRIES  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \
- / sizeof(PgStat_TableEntry))
-
-typedef struct PgStat_MsgTabstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- int m_xact_commit;
- int m_xact_rollback;
- PgStat_Counter m_block_read_time; /* times in microseconds */
- PgStat_Counter m_block_write_time;
- PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES];
-} PgStat_MsgTabstat;
-
-
-/* ----------
- * PgStat_MsgTabpurge Sent by the backend to tell the collector
- * about dead tables.
- * ----------
- */
-#define PGSTAT_NUM_TABPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgTabpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_tableid[PGSTAT_NUM_TABPURGE];
-} PgStat_MsgTabpurge;
-
-
-/* ----------
- * PgStat_MsgDropdb Sent by the backend to tell the collector
- * about a dropped database
- * ----------
- */
-typedef struct PgStat_MsgDropdb
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDropdb;
-
-
-/* ----------
- * PgStat_MsgResetcounter Sent by the backend to tell the collector
- * to reset counters
- * ----------
- */
-typedef struct PgStat_MsgResetcounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgResetcounter;
-
-/* ----------
- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
- * to reset a shared counter
- * ----------
- */
-typedef struct PgStat_MsgResetsharedcounter
-{
- PgStat_MsgHdr m_hdr;
- PgStat_Shared_Reset_Target m_resettarget;
-} PgStat_MsgResetsharedcounter;
-
-/* ----------
- * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector
- * to reset a single counter
- * ----------
- */
-typedef struct PgStat_MsgResetsinglecounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- PgStat_Single_Reset_Type m_resettype;
- Oid m_objectid;
-} PgStat_MsgResetsinglecounter;
-
-/* ----------
- * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal
- * that a database is going to be processed
- * ----------
- */
-typedef struct PgStat_MsgAutovacStart
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- TimestampTz m_start_time;
-} PgStat_MsgAutovacStart;
-
-
-/* ----------
- * PgStat_MsgVacuum Sent by the backend or autovacuum daemon
- * after VACUUM
- * ----------
- */
-typedef struct PgStat_MsgVacuum
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- TimestampTz m_vacuumtime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgVacuum;
-
-
-/* ----------
- * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon
- * after ANALYZE
- * ----------
- */
-typedef struct PgStat_MsgAnalyze
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- bool m_resetcounter;
- TimestampTz m_analyzetime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgAnalyze;
-
-
-/* ----------
- * PgStat_MsgArchiver Sent by the archiver to update statistics.
- * ----------
- */
-typedef struct PgStat_MsgArchiver
-{
- PgStat_MsgHdr m_hdr;
- bool m_failed; /* Failed attempt */
- char m_xlog[MAX_XFN_CHARS + 1];
- TimestampTz m_timestamp;
-} PgStat_MsgArchiver;
-
-/* ----------
- * PgStat_MsgBgWriter Sent by the bgwriter to update statistics.
+ * PgStat_MsgBgWriter bgwriter statistics
  * ----------
  */
 typedef struct PgStat_MsgBgWriter
 {
- PgStat_MsgHdr m_hdr;
-
  PgStat_Counter m_timed_checkpoints;
  PgStat_Counter m_requested_checkpoints;
  PgStat_Counter m_buf_written_checkpoints;
@@ -422,38 +164,14 @@ typedef struct PgStat_MsgBgWriter
  PgStat_Counter m_checkpoint_sync_time;
 } PgStat_MsgBgWriter;
 
-/* ----------
- * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict
- * ----------
- */
-typedef struct PgStat_MsgRecoveryConflict
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- int m_reason;
-} PgStat_MsgRecoveryConflict;
-
-/* ----------
- * PgStat_MsgTempFile Sent by the backend upon creating a temp file
- * ----------
- */
-typedef struct PgStat_MsgTempFile
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- size_t m_filesize;
-} PgStat_MsgTempFile;
-
 /* ----------
  * PgStat_FunctionCounts The actual per-function counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
+ * it against zeroes to detect whether there are any counts to write.
  *
  * Note that the time counters are in instr_time format here.  We convert to
- * microseconds in PgStat_Counter format when transmitting to the collector.
+ * microseconds in PgStat_Counter format when writing to shared statistics.
  * ----------
  */
 typedef struct PgStat_FunctionCounts
@@ -485,96 +203,8 @@ typedef struct PgStat_FunctionEntry
  PgStat_Counter f_self_time;
 } PgStat_FunctionEntry;
 
-/* ----------
- * PgStat_MsgFuncstat Sent by the backend to report function
- * usage statistics.
- * ----------
- */
-#define PGSTAT_NUM_FUNCENTRIES \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(PgStat_FunctionEntry))
-
-typedef struct PgStat_MsgFuncstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES];
-} PgStat_MsgFuncstat;
-
-/* ----------
- * PgStat_MsgFuncpurge Sent by the backend to tell the collector
- * about dead functions.
- * ----------
- */
-#define PGSTAT_NUM_FUNCPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgFuncpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_functionid[PGSTAT_NUM_FUNCPURGE];
-} PgStat_MsgFuncpurge;
-
-/* ----------
- * PgStat_MsgDeadlock Sent by the backend to tell the collector
- * about a deadlock that occurred.
- * ----------
- */
-typedef struct PgStat_MsgDeadlock
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDeadlock;
-
-/* ----------
- * PgStat_MsgChecksumFailure Sent by the backend to tell the collector
- * about checksum failures noticed.
- * ----------
- */
-typedef struct PgStat_MsgChecksumFailure
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_failurecount;
- TimestampTz m_failure_time;
-} PgStat_MsgChecksumFailure;
-
-
-/* ----------
- * PgStat_Msg Union over all possible messages.
- * ----------
- */
-typedef union PgStat_Msg
-{
- PgStat_MsgHdr msg_hdr;
- PgStat_MsgDummy msg_dummy;
- PgStat_MsgInquiry msg_inquiry;
- PgStat_MsgTabstat msg_tabstat;
- PgStat_MsgTabpurge msg_tabpurge;
- PgStat_MsgDropdb msg_dropdb;
- PgStat_MsgResetcounter msg_resetcounter;
- PgStat_MsgResetsharedcounter msg_resetsharedcounter;
- PgStat_MsgResetsinglecounter msg_resetsinglecounter;
- PgStat_MsgAutovacStart msg_autovacuum_start;
- PgStat_MsgVacuum msg_vacuum;
- PgStat_MsgAnalyze msg_analyze;
- PgStat_MsgArchiver msg_archiver;
- PgStat_MsgBgWriter msg_bgwriter;
- PgStat_MsgFuncstat msg_funcstat;
- PgStat_MsgFuncpurge msg_funcpurge;
- PgStat_MsgRecoveryConflict msg_recoveryconflict;
- PgStat_MsgDeadlock msg_deadlock;
- PgStat_MsgTempFile msg_tempfile;
- PgStat_MsgChecksumFailure msg_checksumfailure;
-} PgStat_Msg;
-
-
 /* ------------------------------------------------------------
- * Statistic collector data structures follow
+ * Statistic collector data structures on file and shared memory follow
  *
  * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these
  * data structures change.
@@ -614,16 +244,29 @@ typedef struct PgStat_StatDBEntry
  PgStat_Counter n_block_write_time;
 
  TimestampTz stat_reset_timestamp;
- TimestampTz stats_timestamp; /* time of db stats file update */
+ TimestampTz stats_timestamp; /* time of db stats update */
 
  /*
- * tables and functions must be last in the struct, because we don't write
- * the pointers out to the stats file.
+ * The following members must be last in the struct, because we don't write
+ * them out to the stats file.
  */
- HTAB   *tables;
- HTAB   *functions;
+ int generation; /* current generation of the below */
+ int refcnt; /* current gen reference count */
+ dshash_table_handle tables; /* current gen tables hash */
+ dshash_table_handle functions; /* current gen functions hash */
+ int prev_refcnt; /* prev gen reference count */
+ dshash_table_handle prev_tables; /* prev gen tables hash */
+ dshash_table_handle prev_functions; /* prev gen functions hash */
+ LWLock lock; /* Lock for the above members */
+
+ /* non-shared members */
+ HTAB *snapshot_tables; /* table entry snapshot */
+ HTAB *snapshot_functions; /* function entry snapshot */
+ dshash_table *dshash_tables; /* attached tables dshash */
+ dshash_table *dshash_functions; /* attached functions dshash */
 } PgStat_StatDBEntry;
 
+#define SHARED_DBENT_SIZE offsetof(PgStat_StatDBEntry, snapshot_tables)
 
 /* ----------
  * PgStat_StatTabEntry The collector's data per table (or index)
@@ -662,7 +305,7 @@ typedef struct PgStat_StatTabEntry
 
 
 /* ----------
- * PgStat_StatFuncEntry The collector's data per function
+ * PgStat_StatFuncEntry per function stats data
  * ----------
  */
 typedef struct PgStat_StatFuncEntry
@@ -677,7 +320,7 @@ typedef struct PgStat_StatFuncEntry
 
 
 /*
- * Archiver statistics kept in the stats collector
+ * Archiver statistics kept in the shared stats
  */
 typedef struct PgStat_ArchiverStats
 {
@@ -693,7 +336,7 @@ typedef struct PgStat_ArchiverStats
 } PgStat_ArchiverStats;
 
 /*
- * Global statistics kept in the stats collector
+ * Global statistics kept in the shared stats
  */
 typedef struct PgStat_GlobalStats
 {
@@ -779,7 +422,6 @@ typedef enum
  WAIT_EVENT_CHECKPOINTER_MAIN,
  WAIT_EVENT_LOGICAL_APPLY_MAIN,
  WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
- WAIT_EVENT_PGSTAT_MAIN,
  WAIT_EVENT_RECOVERY_WAL_ALL,
  WAIT_EVENT_RECOVERY_WAL_STREAM,
  WAIT_EVENT_SYSLOGGER_MAIN,
@@ -1214,6 +856,8 @@ extern bool pgstat_track_counts;
 extern int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
+
+/* No longer used, but will be removed with GUC */
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;
 
@@ -1235,29 +879,26 @@ extern PgStat_Counter pgStatBlockWriteTime;
 extern Size BackendStatusShmemSize(void);
 extern void CreateSharedBackendStatus(void);
 
-extern void pgstat_init(void);
-extern int pgstat_start(void);
+extern Size StatsShmemSize(void);
+extern void StatsShmemInit(void);
+
 extern void pgstat_reset_all(void);
-extern void allow_immediate_pgstat_restart(void);
-
-#ifdef EXEC_BACKEND
-extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
 
+/* File input/output functions  */
+extern void pgstat_read_statsfiles(void);
+extern void pgstat_write_statsfiles(void);
 
 /* ----------
  * Functions called from backends
  * ----------
  */
-extern void pgstat_ping(void);
-
-extern void pgstat_report_stat(bool force);
+extern long pgstat_report_stat(bool force);
 extern void pgstat_vacuum_stat(void);
 extern void pgstat_drop_database(Oid databaseid);
 
 extern void pgstat_clear_snapshot(void);
 extern void pgstat_reset_counters(void);
-extern void pgstat_reset_shared_counters(const char *);
+extern void pgstat_reset_shared_counters(const char *target);
 extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type);
 
 extern void pgstat_report_autovac(Oid dboid);
@@ -1429,11 +1070,13 @@ extern void pgstat_send_bgwriter(void);
  */
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
+extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid relid);
 extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
 extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid);
 extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern void pgstat_clear_snapshot(void);
 
 #endif /* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f627dfedc5..97801f4791 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -220,6 +220,7 @@ typedef enum BuiltinTrancheIds
  LWTRANCHE_TBM,
  LWTRANCHE_PARALLEL_APPEND,
  LWTRANCHE_SXACT,
+ LWTRANCHE_STATS,
  LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
 
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 9244a2a7b7..a9b625211b 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -31,6 +31,7 @@ typedef enum TimeoutId
  STANDBY_TIMEOUT,
  STANDBY_LOCK_TIMEOUT,
  IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
+ IDLE_STATS_UPDATE_TIMEOUT,
  /* First user-definable timeout reason */
  USER_TIMEOUT,
  /* Maximum number of timeout reasons */
--
2.16.3


From 1b7207c2069debf4888a3d526554b2354ccca855 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Tue, 27 Nov 2018 14:42:12 +0900
Subject: [PATCH v22 5/5] Remove the GUC stats_temp_directory

The GUC used to specify the directory in which temporary statistics
files were stored. It is no longer needed by the stats collector but is
still used by the programs in bin and contrib, and maybe other
extensions. Thus this patch removes the GUC, but some backing variables
and macro definitions are left alone for backward compatibility.
---
 doc/src/sgml/backup.sgml                      |  2 --
 doc/src/sgml/config.sgml                      | 19 -------------
 doc/src/sgml/monitoring.sgml                  |  7 +----
 doc/src/sgml/storage.sgml                     |  3 +-
 src/backend/postmaster/pgstat.c               | 13 ++++-----
 src/backend/replication/basebackup.c          | 13 ++-------
 src/backend/utils/misc/guc.c                  | 41 ---------------------------
 src/backend/utils/misc/postgresql.conf.sample |  1 -
 src/include/pgstat.h                          |  5 +++-
 src/test/perl/PostgresNode.pm                 |  4 ---
 10 files changed, 14 insertions(+), 94 deletions(-)

diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml
index bdc9026c62..2885540362 100644
--- a/doc/src/sgml/backup.sgml
+++ b/doc/src/sgml/backup.sgml
@@ -1146,8 +1146,6 @@ SELECT pg_stop_backup();
     <filename>pg_snapshots/</filename>, <filename>pg_stat_tmp/</filename>,
     and <filename>pg_subtrans/</filename> (but not the directories themselves) can be
     omitted from the backup as they will be initialized on postmaster startup.
-    If <xref linkend="guc-stats-temp-directory"/> is set and is under the data
-    directory then the contents of that directory can also be omitted.
    </para>
 
    <para>
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 7f9ce8fcba..a8bed31232 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6818,25 +6818,6 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-stats-temp-directory" xreflabel="stats_temp_directory">
-      <term><varname>stats_temp_directory</varname> (<type>string</type>)
-      <indexterm>
-       <primary><varname>stats_temp_directory</varname> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        Sets the directory to store temporary statistics data in. This can be
-        a path relative to the data directory or an absolute path. The default
-        is <filename>pg_stat_tmp</filename>. Pointing this at a RAM-based
-        file system will decrease physical I/O requirements and can lead to
-        improved performance.
-        This parameter can only be set in the <filename>postgresql.conf</filename>
-        file or on the server command line.
-       </para>
-      </listitem>
-     </varlistentry>
-
      </variablelist>
     </sect2>
 
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index ea6aad4d1e..33ad2b8be8 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -195,12 +195,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
 
   <para>
    The statistics collector transmits the collected information to other
-   <productname>PostgreSQL</productname> processes through temporary files.
-   These files are stored in the directory named by the
-   <xref linkend="guc-stats-temp-directory"/> parameter,
-   <filename>pg_stat_tmp</filename> by default.
-   For better performance, <varname>stats_temp_directory</varname> can be
-   pointed at a RAM-based file system, decreasing physical I/O requirements.
+   <productname>PostgreSQL</productname> processes through shared memory.
    When the server shuts down cleanly, a permanent copy of the statistics
    data is stored in the <filename>pg_stat</filename> subdirectory, so that
    statistics can be retained across server restarts.  When recovery is
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 1c19e863d2..2f04bb68bb 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -122,8 +122,7 @@ Item
 
 <row>
  <entry><filename>pg_stat_tmp</filename></entry>
- <entry>Subdirectory containing temporary files for the statistics
-  subsystem</entry>
+ <entry>Subdirectory containing ephemeral files for extensions</entry>
 </row>
 
 <row>
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index c0b20763b0..6b8025ad13 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -108,15 +108,12 @@ bool pgstat_track_counts = false;
 int pgstat_track_functions = TRACK_FUNC_OFF;
 int pgstat_track_activity_query_size = 1024;
 
-/* ----------
- * Built from GUC parameter
- * ----------
+/*
+ * This used to be a GUC variable and is no longer used in this file, but left
+ * alone just for backward compatibility for extensions, having the default
+ * value.
  */
-char   *pgstat_stat_directory = NULL;
-
-/* No longer used, but will be removed with GUC */
-char   *pgstat_stat_filename = NULL;
-char   *pgstat_stat_tmpname = NULL;
+char   *pgstat_stat_directory = PG_STAT_TMP_DIR;
 
 #define StatsLock (&StatsShmem->StatsMainLock)
 
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 6aab8d7b5f..2eb49924b9 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -242,11 +242,8 @@ perform_base_backup(basebackup_options *opt)
  TimeLineID endtli;
  StringInfo labelfile;
  StringInfo tblspc_map_file = NULL;
- int datadirpathlen;
  List   *tablespaces = NIL;
 
- datadirpathlen = strlen(DataDir);
-
  backup_started_in_recovery = RecoveryInProgress();
 
  labelfile = makeStringInfo();
@@ -277,13 +274,9 @@ perform_base_backup(basebackup_options *opt)
  * Calculate the relative path of temporary statistics directory in
  * order to skip the files which are located in that directory later.
  */
- if (is_absolute_path(pgstat_stat_directory) &&
- strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
- else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory);
- else
- statrelpath = pgstat_stat_directory;
+
+ Assert(strchr(PG_STAT_TMP_DIR, '/') == NULL);
+ statrelpath = psprintf("./%s", PG_STAT_TMP_DIR);
 
  /* Add a node for the base directory at the end */
  ti = palloc0(sizeof(tablespaceinfo));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 90ffd89339..753e30ebb7 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -194,7 +194,6 @@ static bool check_max_wal_senders(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_effective_io_concurrency(int newval, void *extra);
-static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
 static bool check_cluster_name(char **newval, void **extra, GucSource source);
@@ -4072,17 +4071,6 @@ static struct config_string ConfigureNamesString[] =
  NULL, NULL, NULL
  },
 
- {
- {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR,
- gettext_noop("Writes temporary statistics files to the specified directory."),
- NULL,
- GUC_SUPERUSER_ONLY
- },
- &pgstat_temp_directory,
- PG_STAT_TMP_DIR,
- check_canonical_path, assign_pgstat_temp_directory, NULL
- },
-
  {
  {"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER,
  gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
@@ -11352,35 +11340,6 @@ assign_effective_io_concurrency(int newval, void *extra)
 #endif /* USE_PREFETCH */
 }
 
-static void
-assign_pgstat_temp_directory(const char *newval, void *extra)
-{
- /* check_canonical_path already canonicalized newval for us */
- char   *dname;
- char   *tname;
- char   *fname;
-
- /* directory */
- dname = guc_malloc(ERROR, strlen(newval) + 1); /* runtime dir */
- sprintf(dname, "%s", newval);
-
- /* global stats */
- tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */
- sprintf(tname, "%s/global.tmp", newval);
- fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */
- sprintf(fname, "%s/global.stat", newval);
-
- if (pgstat_stat_directory)
- free(pgstat_stat_directory);
- pgstat_stat_directory = dname;
- if (pgstat_stat_tmpname)
- free(pgstat_stat_tmpname);
- pgstat_stat_tmpname = tname;
- if (pgstat_stat_filename)
- free(pgstat_stat_filename);
- pgstat_stat_filename = fname;
-}
-
 static bool
 check_application_name(char **newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0fc23e3a61..66f539c4bb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -558,7 +558,6 @@
 #track_io_timing = off
 #track_functions = none # none, pl, all
 #track_activity_query_size = 1024 # (change requires restart)
-#stats_temp_directory = 'pg_stat_tmp'
 
 
 # - Monitoring -
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c9fbcead3f..e9e18ed27a 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -30,7 +30,10 @@
 #define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat"
 #define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp"
 
-/* Default directory to store temporary statistics data in */
+/*
+ * This used to be the directory to store temporary statistics data in but is
+ * no longer used. Defined here for backward compatibility.
+ */
 #define PG_STAT_TMP_DIR "pg_stat_tmp"
 
 /* Values for track_functions GUC variable --- order is significant! */
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 270bd6c856..c604c5e90b 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -455,10 +455,6 @@ sub init
  print $conf TestLib::slurp_file($ENV{TEMP_CONFIG})
   if defined $ENV{TEMP_CONFIG};
 
- # XXX Neutralize any stats_temp_directory in TEMP_CONFIG.  Nodes running
- # concurrently must not share a stats_temp_directory.
- print $conf "stats_temp_directory = 'pg_stat_tmp'\n";
-
  if ($params{allows_streaming})
  {
  if ($params{allows_streaming} eq "logical")
--
2.16.3

Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Alvaro Herrera-9
On 2019-Sep-10, Kyotaro Horiguchi wrote:

> At Tue, 3 Sep 2019 18:28:05 -0400, Alvaro Herrera <[hidden email]> wrote in <[hidden email]>
> > > Found a bug in initialization. StatsShememInit() was placed at a
> > > wrong place and stats code on child processes accessed
> > > uninitialized pointer. It is a leftover from the previous shape
> > > where dsm was activated on postmaster.
> >
> > This doesn't apply anymore.  Can you please rebase?
>
> Thanks! I forgot to post rebased version after doing. Here it is.
>
> - (Re)Rebased to the current master.
> - Passed all tests for me.

This seems to have very trivial conflicts -- please rebase again?

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Kyotaro Horiguchi-4
At Wed, 25 Sep 2019 18:01:02 -0300, Alvaro Herrera <[hidden email]> wrote in <[hidden email]>

> On 2019-Sep-10, Kyotaro Horiguchi wrote:
>
> > At Tue, 3 Sep 2019 18:28:05 -0400, Alvaro Herrera <[hidden email]> wrote in <[hidden email]>
> > > > Found a bug in initialization. StatsShememInit() was placed at a
> > > > wrong place and stats code on child processes accessed
> > > > uninitialized pointer. It is a leftover from the previous shape
> > > > where dsm was activated on postmaster.
> > >
> > > This doesn't apply anymore.  Can you please rebase?
> >
> > Thanks! I forgot to post rebased version after doing. Here it is.
> >
> > - (Re)Rebased to the current master.
> > - Passed all tests for me.
>
> This seems to have very trivial conflicts -- please rebase again?
Affected by the code movement in 9a86f03b4e. Just
rebased. Thanks.

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

From 70dfe750e365fa9ba15312662b72a13326fb22e3 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Fri, 29 Jun 2018 16:41:04 +0900
Subject: [PATCH v23 1/5] sequential scan for dshash

Add sequential scan feature to dshash.
---
 src/backend/lib/dshash.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/include/lib/dshash.h |  23 +++++-
 2 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 350f8c0a66..4f0c7ec840 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -112,6 +112,7 @@ struct dshash_table
  size_t size_log2; /* log2(number of buckets) */
  bool find_locked; /* Is any partition lock held by 'find'? */
  bool find_exclusively_locked; /* ... exclusively? */
+ bool seqscan_running;/* now under sequential scan */
 };
 
 /* Given a pointer to an item, find the entry (user data) it holds. */
@@ -127,6 +128,10 @@ struct dshash_table
 #define NUM_SPLITS(size_log2) \
  (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
 
+/* How many buckets are there in a given size? */
+#define NUM_BUCKETS(size_log2) \
+ (((size_t) 1) << (size_log2))
+
 /* How many buckets are there in each partition at a given size? */
 #define BUCKETS_PER_PARTITION(size_log2) \
  (((size_t) 1) << NUM_SPLITS(size_log2))
@@ -153,6 +158,10 @@ struct dshash_table
 #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
  ((partition) << NUM_SPLITS(size_log2))
 
+/* Choose partition based on bucket index. */
+#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
+ ((bucket_idx) >> NUM_SPLITS(size_log2))
+
 /* The head of the active bucket for a given hash value (lvalue). */
 #define BUCKET_FOR_HASH(hash_table, hash) \
  (hash_table->buckets[ \
@@ -228,6 +237,7 @@ dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
 
  /*
  * Set up the initial array of buckets.  Our initial size is the same as
@@ -279,6 +289,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params,
  hash_table->control = dsa_get_address(area, control);
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
  Assert(hash_table->control->magic == DSHASH_MAGIC);
 
  /*
@@ -324,7 +335,7 @@ dshash_destroy(dshash_table *hash_table)
  ensure_valid_bucket_pointers(hash_table);
 
  /* Free all the entries. */
- size = ((size_t) 1) << hash_table->size_log2;
+ size = NUM_BUCKETS(hash_table->size_log2);
  for (i = 0; i < size; ++i)
  {
  dsa_pointer item_pointer = hash_table->buckets[i];
@@ -549,9 +560,14 @@ dshash_delete_entry(dshash_table *hash_table, void *entry)
  LW_EXCLUSIVE));
 
  delete_item(hash_table, item);
- hash_table->find_locked = false;
- hash_table->find_exclusively_locked = false;
- LWLockRelease(PARTITION_LOCK(hash_table, partition));
+
+ /* We need to keep the partition lock during a sequential scan */
+ if (!hash_table->seqscan_running)
+ {
+ hash_table->find_locked = false;
+ hash_table->find_exclusively_locked = false;
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+ }
 }
 
 /*
@@ -568,6 +584,8 @@ dshash_release_lock(dshash_table *hash_table, void *entry)
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index),
  hash_table->find_exclusively_locked
  ? LW_EXCLUSIVE : LW_SHARED));
+ /* lock is under control of sequential scan */
+ Assert(!hash_table->seqscan_running);
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
@@ -592,6 +610,168 @@ dshash_memhash(const void *v, size_t size, void *arg)
  return tag_hash(v, size);
 }
 
+/*
+ * dshash_seq_init/_next/_term
+ *           Sequentially scan through the dshash table and return all the
+ *           elements one by one, return NULL when no more.
+ *
+ * dshash_seq_term should be called if and only if the scan is abandoned
+ * before completion; if dshash_seq_next returns NULL then it has already done
+ * the end-of-scan cleanup.
+ *
+ * On returning element, it is locked as is the case with dshash_find.
+ * However, the caller must not release the lock. The lock is released as
+ * necessary in continued scan.
+ *
+ * As opposed to the equivalent for dynahash, the caller is not supposed to
+ * delete the returned element before continuing the scan.
+ *
+ * If consistent is set for dshash_seq_init, the whole hash table is
+ * non-exclusively locked. Otherwise a part of the hash table is locked in the
+ * same mode (partition lock).
+ */
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive)
+{
+ /* allowed at most one scan at once */
+ Assert(!hash_table->seqscan_running);
+
+ status->hash_table = hash_table;
+ status->curbucket = 0;
+ status->nbuckets = 0;
+ status->curitem = NULL;
+ status->pnextitem = InvalidDsaPointer;
+ status->curpartition = -1;
+ status->consistent = consistent;
+ status->exclusive = exclusive;
+ hash_table->seqscan_running = true;
+
+ /*
+ * Protect all partitions from modification if the caller wants a
+ * consistent result.
+ */
+ if (consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, i),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ }
+ ensure_valid_bucket_pointers(hash_table);
+ }
+}
+
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+ dsa_pointer next_item_pointer;
+
+ Assert(status->hash_table->seqscan_running);
+ if (status->curitem == NULL)
+ {
+ int partition;
+
+ Assert (status->curbucket == 0);
+ Assert(!status->hash_table->find_locked);
+
+ /* first shot. grab the first item. */
+ if (!status->consistent)
+ {
+ partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+ LWLockAcquire(PARTITION_LOCK(status->hash_table, partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ status->curpartition = partition;
+
+ /* resize doesn't happen from now until seq scan ends */
+ status->nbuckets =
+ NUM_BUCKETS(status->hash_table->control->size_log2);
+ ensure_valid_bucket_pointers(status->hash_table);
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+ else
+ next_item_pointer = status->pnextitem;
+
+ /* Move to the next bucket if we finished the current bucket */
+ while (!DsaPointerIsValid(next_item_pointer))
+ {
+ if (++status->curbucket >= status->nbuckets)
+ {
+ /* all buckets have been scanned. finish. */
+ dshash_seq_term(status);
+ return NULL;
+ }
+
+ /* Also move partition lock if needed */
+ if (!status->consistent)
+ {
+ int next_partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+
+ /* Move lock along with partition for the bucket */
+ if (status->curpartition != next_partition)
+ {
+ /*
+ * Take lock on the next partition then release the current,
+ * not in the reverse order. This is required to avoid
+ * resizing from happening during a sequential scan. Locks are
+ * taken in partition order so no deadlock happens with other
+ * seq scans or resizing.
+ */
+ LWLockAcquire(PARTITION_LOCK(status->hash_table,
+ next_partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ LWLockRelease(PARTITION_LOCK(status->hash_table,
+ status->curpartition));
+ status->curpartition = next_partition;
+ }
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+
+ status->curitem =
+ dsa_get_address(status->hash_table->area, next_item_pointer);
+ status->hash_table->find_locked = true;
+ status->hash_table->find_exclusively_locked = status->exclusive;
+
+ /*
+ * This item can be deleted by the caller. Store the next item for the
+ * next iteration for the occasion.
+ */
+ status->pnextitem = status->curitem->next;
+
+ return ENTRY_FROM_ITEM(status->curitem);
+}
+
+void
+dshash_seq_term(dshash_seq_status *status)
+{
+ Assert(status->hash_table->seqscan_running);
+ status->hash_table->find_locked = false;
+ status->hash_table->find_exclusively_locked = false;
+ status->hash_table->seqscan_running = false;
+
+ if (status->consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, i));
+ }
+ else if (status->curpartition >= 0)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
+}
+
 /*
  * Print debugging information about the internal state of the hash table to
  * stderr.  The caller must hold no partition locks.
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index fa2e28ff3e..79698a6ad6 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -59,6 +59,23 @@ typedef struct dshash_parameters
 struct dshash_table_item;
 typedef struct dshash_table_item dshash_table_item;
 
+/*
+ * Sequential scan state of dshash. The detail is exposed since the storage
+ * size should be known to users but it should be considered as an opaque
+ * type by callers.
+ */
+typedef struct dshash_seq_status
+{
+ dshash_table   *hash_table;
+ int curbucket;
+ int nbuckets;
+ dshash_table_item  *curitem;
+ dsa_pointer pnextitem;
+ int curpartition;
+ bool consistent;
+ bool exclusive;
+} dshash_seq_status;
+
 /* Creating, sharing and destroying from hash tables. */
 extern dshash_table *dshash_create(dsa_area *area,
    const dshash_parameters *params,
@@ -70,7 +87,6 @@ extern dshash_table *dshash_attach(dsa_area *area,
 extern void dshash_detach(dshash_table *hash_table);
 extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table);
 extern void dshash_destroy(dshash_table *hash_table);
-
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
@@ -80,6 +96,11 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 
+/* seq scan support */
+extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive);
+extern void *dshash_seq_next(dshash_seq_status *status);
+extern void dshash_seq_term(dshash_seq_status *status);
 /* Convenience hash and compare functions wrapping memcmp and tag_hash. */
 extern int dshash_memcmp(const void *a, const void *b, size_t size, void *arg);
 extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg);
--
2.16.3


From ce97ee23026c410b199358cbe472dff06177dc40 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 27 Sep 2018 11:15:19 +0900
Subject: [PATCH v23 2/5] Add conditional lock feature to dshash

Dshash currently waits for lock unconditionally. This commit adds new
interfaces for dshash_find and dshash_find_or_insert. The new
interfaces have an extra parameter "nowait" that commands not to wait
for lock.
---
 src/backend/lib/dshash.c | 69 +++++++++++++++++++++++++++++++++++++++++++-----
 src/include/lib/dshash.h |  6 +++++
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 4f0c7ec840..60a6e3c0bc 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -394,19 +394,48 @@ dshash_get_hash_table_handle(dshash_table *hash_table)
  */
 void *
 dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
+{
+ return dshash_find_extended(hash_table, key, exclusive, false, NULL);
+}
+
+/*
+ * Addition to dshash_find, returns immediately when nowait is true and lock
+ * was not acquired. Lock status is set to *lock_acquired if it is non-NULL.
+ */
+void *
+dshash_find_extended(dshash_table *hash_table, const void *key,
+ bool exclusive, bool nowait, bool *lock_acquired)
 {
  dshash_hash hash;
  size_t partition;
  dshash_table_item *item;
 
+ /* allowing !nowait returning the result is just not sensible */
+ Assert(nowait || !lock_acquired);
+
  hash = hash_key(hash_table, key);
  partition = PARTITION_FOR_HASH(hash);
 
  Assert(hash_table->control->magic == DSHASH_MAGIC);
  Assert(!hash_table->find_locked);
 
- LWLockAcquire(PARTITION_LOCK(hash_table, partition),
-  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED))
+ {
+ if (lock_acquired)
+ *lock_acquired = false;
+ return NULL;
+ }
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+
+ if (lock_acquired)
+ *lock_acquired = true;
+
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -441,6 +470,22 @@ void *
 dshash_find_or_insert(dshash_table *hash_table,
   const void *key,
   bool *found)
+{
+ return dshash_find_or_insert_extended(hash_table, key, found, false);
+}
+
+/*
+ * Addition to dshash_find_or_insert, returns NULL if nowait is true and lock
+ * was not acquired.
+ *
+ * Notes above dshash_find_extended() regarding locking and error handling
+ * equally apply here.
+ */
+void *
+dshash_find_or_insert_extended(dshash_table *hash_table,
+   const void *key,
+   bool *found,
+   bool nowait)
 {
  dshash_hash hash;
  size_t partition_index;
@@ -455,8 +500,16 @@ dshash_find_or_insert(dshash_table *hash_table,
  Assert(!hash_table->find_locked);
 
 restart:
- LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
-  LW_EXCLUSIVE);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(
+ PARTITION_LOCK(hash_table, partition_index),
+ LW_EXCLUSIVE))
+ return NULL;
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
+  LW_EXCLUSIVE);
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -626,9 +679,11 @@ dshash_memhash(const void *v, size_t size, void *arg)
  * As opposed to the equivalent for dynanash, the caller is not supposed to
  * delete the returned element before continuing the scan.
  *
- * If consistent is set for dshash_seq_init, the whole hash table is
- * non-exclusively locked. Otherwise a part of the hash table is locked in the
- * same mode (partition lock).
+ * If consistent is set for dshash_seq_init, the all hash table
+ * partitions are locked in the requested mode (as determined by the
+ * exclusive flag), and the locks are held until the end of the scan.
+ * Otherwise the partition locks are acquired and released as needed
+ * during the scan (up to two partitions may be locked at the same time).
  */
 void
 dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 79698a6ad6..67f7d77f71 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -90,8 +90,14 @@ extern void dshash_destroy(dshash_table *hash_table);
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
+extern void *dshash_find_extended(dshash_table *hash_table, const void *key,
+  bool exclusive, bool nowait,
+  bool *lock_acquired);
 extern void *dshash_find_or_insert(dshash_table *hash_table,
    const void *key, bool *found);
+extern void *dshash_find_or_insert_extended(dshash_table *hash_table,
+ const void *key, bool *found,
+ bool nowait);
 extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
--
2.16.3


From 1db30db706a405b03048194a7c25216149e1abaf Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Wed, 7 Nov 2018 16:53:49 +0900
Subject: [PATCH v23 3/5] Make archiver process an auxiliary process

This is a preliminary patch for shared-memory based stats collector.
Archiver process must be a auxiliary process since it uses shared
memory after stats data wes moved onto shared-memory. Make the process
an auxiliary process in order to make it work.
---
 src/backend/bootstrap/bootstrap.c   |  8 +++
 src/backend/postmaster/pgarch.c     | 98 +++++++++----------------------------
 src/backend/postmaster/pgstat.c     |  6 +++
 src/backend/postmaster/postmaster.c | 35 +++++++++----
 src/include/miscadmin.h             |  2 +
 src/include/pgstat.h                |  1 +
 src/include/postmaster/pgarch.h     |  4 +-
 7 files changed, 67 insertions(+), 87 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 9238fbe98d..dde2485b14 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -329,6 +329,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
  case BgWriterProcess:
  statmsg = pgstat_get_backend_desc(B_BG_WRITER);
  break;
+ case ArchiverProcess:
+ statmsg = pgstat_get_backend_desc(B_ARCHIVER);
+ break;
  case CheckpointerProcess:
  statmsg = pgstat_get_backend_desc(B_CHECKPOINTER);
  break;
@@ -456,6 +459,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
  BackgroundWriterMain();
  proc_exit(1); /* should never return */
 
+ case ArchiverProcess:
+ /* don't set signals, archiver has its own agenda */
+ PgArchiverMain();
+ proc_exit(1); /* should never return */
+
  case CheckpointerProcess:
  /* don't set signals, checkpointer has its own agenda */
  CheckpointerMain();
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index f84f882c4c..4342ebdab4 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -77,7 +77,6 @@
  * Local data
  * ----------
  */
-static time_t last_pgarch_start_time;
 static time_t last_sigterm_time = 0;
 
 /*
@@ -96,7 +95,6 @@ static volatile sig_atomic_t ready_to_stop = false;
 static pid_t pgarch_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
 static void pgarch_exit(SIGNAL_ARGS);
 static void ArchSigHupHandler(SIGNAL_ARGS);
 static void ArchSigTermHandler(SIGNAL_ARGS);
@@ -114,75 +112,6 @@ static void pgarch_archiveDone(char *xlog);
  * ------------------------------------------------------------
  */
 
-/*
- * pgarch_start
- *
- * Called from postmaster at startup or after an existing archiver
- * died.  Attempt to fire up a fresh archiver process.
- *
- * Returns PID of child process, or 0 if fail.
- *
- * Note: if fail, we will be called again from the postmaster main loop.
- */
-int
-pgarch_start(void)
-{
- time_t curtime;
- pid_t pgArchPid;
-
- /*
- * Do nothing if no archiver needed
- */
- if (!XLogArchivingActive())
- return 0;
-
- /*
- * Do nothing if too soon since last archiver start.  This is a safety
- * valve to protect against continuous respawn attempts if the archiver is
- * dying immediately at launch. Note that since we will be re-called from
- * the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgarch_start_time) <
- (unsigned int) PGARCH_RESTART_INTERVAL)
- return 0;
- last_pgarch_start_time = curtime;
-
-#ifdef EXEC_BACKEND
- switch ((pgArchPid = pgarch_forkexec()))
-#else
- switch ((pgArchPid = fork_process()))
-#endif
- {
- case -1:
- ereport(LOG,
- (errmsg("could not fork archiver: %m")));
- return 0;
-
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
-
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
-
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
-
- PgArchiverMain(0, NULL);
- break;
-#endif
-
- default:
- return (int) pgArchPid;
- }
-
- /* shouldn't get here */
- return 0;
-}
-
 /* ------------------------------------------------------------
  * Local functions called by archiver follow
  * ------------------------------------------------------------
@@ -222,8 +151,8 @@ pgarch_forkexec(void)
  * The argc/argv parameters are valid only in EXEC_BACKEND case.  However,
  * since we don't use 'em, it hardly matters...
  */
-NON_EXEC_STATIC void
-PgArchiverMain(int argc, char *argv[])
+void
+PgArchiverMain(void)
 {
  /*
  * Ignore all signals usually bound to some action in the postmaster,
@@ -255,8 +184,27 @@ PgArchiverMain(int argc, char *argv[])
 static void
 pgarch_exit(SIGNAL_ARGS)
 {
- /* SIGQUIT means curl up and die ... */
- exit(1);
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * We DO NOT want to run proc_exit() callbacks -- we're here because
+ * shared memory may be corrupted, so we don't want to try to clean up our
+ * transaction.  Just nail the windows shut and get out of town.  Now that
+ * there's an atexit callback to prevent third-party code from breaking
+ * things by calling exit() directly, we have to reset the callbacks
+ * explicitly to make this work as intended.
+ */
+ on_exit_reset();
+
+ /*
+ * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+ * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+ * backend.  This is necessary precisely because we don't clean up our
+ * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+ * should ensure the postmaster sees this as a crash, too, but no harm in
+ * being doubly sure.)
+ */
+ exit(2);
 }
 
 /* SIGHUP signal handler for archiver process */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 011076c3e3..043e3ff9d2 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -2934,6 +2934,9 @@ pgstat_bestart(void)
  case StartupProcess:
  lbeentry.st_backendType = B_STARTUP;
  break;
+ case ArchiverProcess:
+ beentry->st_backendType = B_ARCHIVER;
+ break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
  break;
@@ -4277,6 +4280,9 @@ pgstat_get_backend_desc(BackendType backendType)
 
  switch (backendType)
  {
+ case B_ARCHIVER:
+ backendDesc = "archiver";
+ break;
  case B_AUTOVAC_LAUNCHER:
  backendDesc = "autovacuum launcher";
  break;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index eb9e0221f8..27a9e45074 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -146,7 +146,8 @@
 #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */
 #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */
 #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */
-#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */
+#define BACKEND_TYPE_ARCHIVER 0x0010 /* archiver process */
+#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */
 
 #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER)
 
@@ -539,6 +540,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #endif /* EXEC_BACKEND */
 
 #define StartupDataBase() StartChildProcess(StartupProcess)
+#define StartArchiver() StartChildProcess(ArchiverProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
 #define StartCheckpointer() StartChildProcess(CheckpointerProcess)
 #define StartWalWriter() StartChildProcess(WalWriterProcess)
@@ -1776,7 +1778,7 @@ ServerLoop(void)
 
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /* If we need to signal the autovacuum launcher, do so now */
  if (avlauncher_needs_signal)
@@ -3005,7 +3007,7 @@ reaper(SIGNAL_ARGS)
  if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
  if (PgStatPID == 0)
  PgStatPID = pgstat_start();
 
@@ -3150,10 +3152,8 @@ reaper(SIGNAL_ARGS)
  {
  PgArchPID = 0;
  if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("archiver process"),
- pid, exitstatus);
- if (PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ HandleChildCrash(pid, exitstatus,
+ _("archiver process"));
  continue;
  }
 
@@ -3399,7 +3399,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, archiver or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -3604,6 +3604,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
  }
 
+ /* Take care of the archiver too */
+ if (pid == PgArchPID)
+ PgArchPID = 0;
+ else if (PgArchPID != 0 && take_action)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) PgArchPID)));
+ signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
  /*
  * Force a power-cycle of the pgarch process too.  (This isn't absolutely
  * necessary, but it seems like a good idea for robustness, and it
@@ -3876,6 +3888,7 @@ PostmasterStateMachine(void)
  Assert(CheckpointerPID == 0);
  Assert(WalWriterPID == 0);
  Assert(AutoVacPID == 0);
+ Assert(PgArchPID == 0);
  /* syslogger is not considered here */
  pmState = PM_NO_CHILDREN;
  }
@@ -5145,7 +5158,7 @@ sigusr1_handler(SIGNAL_ARGS)
  */
  Assert(PgArchPID == 0);
  if (XLogArchivingAlways())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /*
  * If we aren't planning to enter hot standby mode later, treat
@@ -5428,6 +5441,10 @@ StartChildProcess(AuxProcType type)
  ereport(LOG,
  (errmsg("could not fork startup process: %m")));
  break;
+ case ArchiverProcess:
+ ereport(LOG,
+ (errmsg("could not fork archiver process: %m")));
+ break;
  case BgWriterProcess:
  ereport(LOG,
  (errmsg("could not fork background writer process: %m")));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index bc6e03fbc7..1f4db67f3f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -399,6 +399,7 @@ typedef enum
  BootstrapProcess,
  StartupProcess,
  BgWriterProcess,
+ ArchiverProcess,
  CheckpointerProcess,
  WalWriterProcess,
  WalReceiverProcess,
@@ -411,6 +412,7 @@ extern AuxProcType MyAuxProcType;
 #define AmBootstrapProcess() (MyAuxProcType == BootstrapProcess)
 #define AmStartupProcess() (MyAuxProcType == StartupProcess)
 #define AmBackgroundWriterProcess() (MyAuxProcType == BgWriterProcess)
+#define AmArchiverProcess() (MyAuxProcType == ArchiverProcess)
 #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess)
 #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d823d..65713abc2b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -718,6 +718,7 @@ typedef struct PgStat_GlobalStats
  */
 typedef enum BackendType
 {
+ B_ARCHIVER,
  B_AUTOVAC_LAUNCHER,
  B_AUTOVAC_WORKER,
  B_BACKEND,
diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h
index 2474eac26a..88f16863d4 100644
--- a/src/include/postmaster/pgarch.h
+++ b/src/include/postmaster/pgarch.h
@@ -32,8 +32,6 @@
  */
 extern int pgarch_start(void);
 
-#ifdef EXEC_BACKEND
-extern void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
+extern void PgArchiverMain(void) pg_attribute_noreturn();
 
 #endif /* _PGARCH_H */
--
2.16.3


From c0d5b123c79995507fb9b340723e7152980d158b Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 21 Feb 2019 12:44:56 +0900
Subject: [PATCH v23 4/5] Shared-memory based stats collector

Previously activity statistics is shared via files on disk. Every
backend sends the numbers to the stats collector process via a socket.
It makes snapshots as a set of files on disk with a certain interval
then every backend reads them as necessary. It worked fine for
a comparatively small set of statistics, but the set is under
pressure to grow and the file size has reached the order of
megabytes. To deal with a larger statistics set, this patch lets backends
directly share the statistics via shared memory.
---
 doc/src/sgml/monitoring.sgml                 |    6 +-
 src/backend/postmaster/autovacuum.c          |   12 +-
 src/backend/postmaster/pgstat.c              | 5661 ++++++++++++--------------
 src/backend/postmaster/postmaster.c          |   85 +-
 src/backend/storage/ipc/ipci.c               |    2 +
 src/backend/storage/lmgr/lwlock.c            |    1 +
 src/backend/tcop/postgres.c                  |   27 +-
 src/backend/utils/init/globals.c             |    1 +
 src/backend/utils/init/postinit.c            |   11 +
 src/bin/pg_basebackup/t/010_pg_basebackup.pl |    4 +-
 src/include/miscadmin.h                      |    1 +
 src/include/pgstat.h                         |  441 +-
 src/include/storage/lwlock.h                 |    1 +
 src/include/utils/timeout.h                  |    1 +
 14 files changed, 2637 insertions(+), 3617 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 828e9084dd..ea6aad4d1e 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -53,7 +53,6 @@ postgres  15554  0.0  0.0  57536  1184 ?        Ss   18:02   0:00 postgres: back
 postgres  15555  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: checkpointer
 postgres  15556  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: walwriter
 postgres  15557  0.0  0.0  58504  2244 ?        Ss   18:02   0:00 postgres: autovacuum launcher
-postgres  15558  0.0  0.0  17512  1068 ?        Ss   18:02   0:00 postgres: stats collector
 postgres  15582  0.0  0.0  58772  3080 ?        Ss   18:04   0:00 postgres: joe runbug 127.0.0.1 idle
 postgres  15606  0.0  0.0  58772  3052 ?        Ss   18:07   0:00 postgres: tgl regression [local] SELECT waiting
 postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl regression [local] idle in transaction
@@ -65,9 +64,8 @@ postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl
    master server process.  The command arguments
    shown for it are the same ones used when it was launched.  The next five
    processes are background worker processes automatically launched by the
-   master process.  (The <quote>stats collector</quote> process will not be present
-   if you have set the system not to start the statistics collector; likewise
-   the <quote>autovacuum launcher</quote> process can be disabled.)
+   master process.  (The <quote>autovacuum launcher</quote> process will not
+   be present if you have set the system not to start it.)
    Each of the remaining
    processes is a server process handling one client connection.  Each such
    process sets its command line display in the form
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 073f313337..a222817f55 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -1958,15 +1958,15 @@ do_autovacuum(void)
   ALLOCSET_DEFAULT_SIZES);
  MemoryContextSwitchTo(AutovacMemCxt);
 
+ /* Start a transaction so our commands have one to play into. */
+ StartTransactionCommand();
+
  /*
  * may be NULL if we couldn't find an entry (only happens if we are
  * forcing a vacuum for anti-wrap purposes).
  */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
 
- /* Start a transaction so our commands have one to play into. */
- StartTransactionCommand();
-
  /*
  * Clean up any dead statistics collector entries for this DB. We always
  * want to do this exactly once per DB-processing cycle, even if we find
@@ -2749,12 +2749,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
  if (isshared)
  {
  if (PointerIsValid(shared))
- tabentry = hash_search(shared->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(shared, relid);
  }
  else if (PointerIsValid(dbentry))
- tabentry = hash_search(dbentry->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
 
  return tabentry;
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 043e3ff9d2..c0b20763b0 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1,15 +1,23 @@
 /* ----------
  * pgstat.c
  *
- * All the statistics collector stuff hacked up in one big, ugly file.
+ * Statistics collector facility.
  *
- * TODO: - Separate collector, postmaster and backend stuff
- *  into different files.
+ *  Collects per-table and per-function usage statistics of all backends on
+ *  shared memory. pg_count_*() and friends interfaces stores activity of
+ *  every backend during a transaction. Then pgstat_flush_stat() is called at
+ *  the end of a transaction to flush out the local numbers to shared memory.
  *
- * - Add some automatic call for pgstat vacuuming.
+ *  To avoid congestion on the shared memory, we update shared stats no more
+ *  often than intervals of PGSTAT_STAT_MIN_INTERVAL(500ms). Still it is
+ *  possible that a backend cannot flush all or a part of local numbers
+ *  immediately, we postpone updates and try the next chance after the
+ *  interval of PGSTAT_STAT_RETRY_INTERVAL(100ms), but they are not kept
+ *  longer than PGSTAT_STAT_MAX_INTERVAL(1000ms).
  *
- * - Add a pgstat config column to pg_database, so this
- *  entire thing can be enabled/disabled on a per db basis.
+ *  The first process that uses the stats collector creates the area then loads the
+ *  stored stats file if any, and the last process at shutdown writes the
+ *  shared stats to the file then destroy the area before exit.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -19,18 +27,6 @@
 #include "postgres.h"
 
 #include <unistd.h>
-#include <fcntl.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/socket.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <signal.h>
-#include <time.h>
-#ifdef HAVE_SYS_SELECT_H
-#include <sys/select.h>
-#endif
 
 #include "pgstat.h"
 
@@ -42,66 +38,38 @@
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_proc.h"
-#include "common/ip.h"
 #include "libpq/libpq.h"
-#include "libpq/pqsignal.h"
-#include "mb/pg_wchar.h"
 #include "miscadmin.h"
-#include "pg_trace.h"
 #include "postmaster/autovacuum.h"
-#include "postmaster/fork_process.h"
-#include "postmaster/postmaster.h"
 #include "replication/walsender.h"
-#include "storage/backendid.h"
-#include "storage/dsm.h"
-#include "storage/fd.h"
 #include "storage/ipc.h"
-#include "storage/latch.h"
 #include "storage/lmgr.h"
-#include "storage/pg_shmem.h"
+#include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/sinvaladt.h"
 #include "utils/ascii.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
-#include "utils/ps_status.h"
-#include "utils/rel.h"
+#include "utils/probes.h"
 #include "utils/snapmgr.h"
-#include "utils/timestamp.h"
-
 
 /* ----------
  * Timer definitions.
  * ----------
  */
-#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
- * updates; in milliseconds. */
+#define PGSTAT_STAT_MIN_INTERVAL 500 /* Minimum time between stats data
+ * updates; in milliseconds. */
 
-#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a
- * new file; in milliseconds. */
-
-#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats
- * file update; in milliseconds. */
-
-#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a
- * new file; in milliseconds. */
-
-#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a
- * failed statistics collector; in
- * seconds. */
-
-#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
-#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
-
-/* Minimum receive buffer size for the collector's socket. */
-#define PGSTAT_MIN_RCVBUF (100 * 1024)
+#define PGSTAT_STAT_RETRY_INTERVAL 100 /* Retry interval after
+ * PGSTAT_STAT_MIN_INTERVAL has elapsed; in milliseconds */
 
+#define PGSTAT_STAT_MAX_INTERVAL   1000 /* Maximum time between stats data
+ * updates; in milliseconds. */
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
-#define PGSTAT_DB_HASH_SIZE 16
 #define PGSTAT_TAB_HASH_SIZE 512
 #define PGSTAT_FUNCTION_HASH_SIZE 512
 
@@ -117,6 +85,19 @@
  */
 #define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
 
+/*
+ * Operation mode and return code of pgstat_get_db_entry.
+ */
+#define PGSTAT_SHARED 0
+#define PGSTAT_EXCLUSIVE 1
+#define PGSTAT_NOWAIT 2
+
+typedef enum PgStat_TableLookupResult
+{
+ NOT_FOUND,
+ FOUND,
+ LOCK_FAILED
+} PgStat_TableLookupResult;
 
 /* ----------
  * GUC parameters
@@ -132,31 +113,63 @@ int pgstat_track_activity_query_size = 1024;
  * ----------
  */
 char   *pgstat_stat_directory = NULL;
+
+/* No longer used, but will be removed with GUC */
 char   *pgstat_stat_filename = NULL;
 char   *pgstat_stat_tmpname = NULL;
 
+#define StatsLock (&StatsShmem->StatsMainLock)
+
+/* Shared stats bootstrap information */
+typedef struct StatsShmemStruct
+{
+ LWLock StatsMainLock; /* lock protecting this struct */
+ dsa_handle stats_dsa_handle; /* DSA handle for stats collector */
+ dshash_table_handle db_hash_handle;
+ dsa_pointer global_stats;
+ dsa_pointer archiver_stats;
+ int refcount;
+} StatsShmemStruct;
+
 /*
- * BgWriter global statistics counters (unused in other processes).
- * Stored directly in a stats message structure so it can be sent
- * without needing to copy things around.  We assume this inits to zeroes.
+ * BgWriter global statistics counters. The name is a remnant from the time
+ * when the stats collector was a dedicated process, which used sockets to
+ * send these counters.
  */
-PgStat_MsgBgWriter BgWriterStats;
+PgStat_MsgBgWriter BgWriterStats = {0};
 
-/* ----------
- * Local data
- * ----------
- */
-NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET;
+/* Variables that live for the lifetime of the backend */
+static StatsShmemStruct * StatsShmem = NULL;
+static dsa_area *area = NULL;
+static dshash_table *pgStatDBHash = NULL;
 
-static struct sockaddr_storage pgStatAddr;
 
-static time_t last_pgstat_start_time;
-
-static bool pgStatRunningInCollector = false;
+/* parameter for each type of shared hash */
+static const dshash_parameters dsh_dbparams = {
+ sizeof(Oid),
+ SHARED_DBENT_SIZE,
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_tblparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatTabEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_funcparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatFuncEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
 
 /*
  * Structures in which backends store per-table info that's waiting to be
- * sent to the collector.
+ * written to shared memory.
  *
  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
  * for the life of the backend.  Also, we zero out the t_id fields of the
@@ -191,8 +204,8 @@ typedef struct TabStatHashEntry
 static HTAB *pgStatTabHash = NULL;
 
 /*
- * Backends store per-function info that's waiting to be sent to the collector
- * in this hash table (indexed by function OID).
+ * Backends store per-function info that's waiting to be flushed out to shared
+ * memory in this hash table (indexed by function OID).
  */
 static HTAB *pgStatFunctions = NULL;
 
@@ -202,6 +215,68 @@ static HTAB *pgStatFunctions = NULL;
  */
 static bool have_function_stats = false;
 
+/* common header of snapshot entry in backend snapshot hash */
+typedef struct PgStat_snapshot
+{
+ Oid key;
+ bool negative;
+ void   *body; /* end of header part: to keep alignment */
+} PgStat_snapshot;
+
+/* context struct for snapshot_statentry */
+typedef struct pgstat_snapshot_param
+{
+ char   *hash_name; /* name of the snapshot hash */
+ int hash_entsize; /* element size of hash entry */
+ dshash_table_handle dsh_handle; /* dsh handle to attach */
+ const dshash_parameters *dsh_params;/* dshash params */
+ HTAB  **hash; /* points to variable to hold hash */
+ dshash_table  **dshash; /* ditto for dshash */
+} pgstat_snapshot_param;
+
+/*
+ * Backends store various database-wide info that's waiting to be flushed out
+ * to shared memory in these variables.
+ *
+ * checksum_failures is the exception in that it is cluster-wide.
+ */
+typedef struct BackendDBStats
+{
+ int n_conflict_tablespace;
+ int n_conflict_lock;
+ int n_conflict_snapshot;
+ int n_conflict_bufferpin;
+ int n_conflict_startup_deadlock;
+ int n_deadlocks;
+ size_t n_tmpfiles;
+ size_t tmpfilesize;
+ HTAB *checksum_failures;
+} BackendDBStats;
+
+/* Hash entry struct for checksum_failures above */
+typedef struct ChecksumFailureEnt
+{
+ Oid dboid;
+ int count;
+} ChecksumFailureEnt;
+
+static BackendDBStats BeDBStats = {0};
+
+/* macros to check BeDBStats at once */
+#define HAVE_PENDING_CONFLICTS() \
+ (BeDBStats.n_conflict_tablespace > 0 || \
+ BeDBStats.n_conflict_lock > 0 || \
+ BeDBStats.n_conflict_bufferpin > 0 || \
+ BeDBStats.n_conflict_startup_deadlock > 0)
+
+#define HAVE_PENDING_DBSTATS() \
+ (HAVE_PENDING_CONFLICTS() || \
+ BeDBStats.n_deadlocks > 0 || \
+ BeDBStats.n_tmpfiles > 0 || \
+ /* no need to check tmpfilesize */ \
+ BeDBStats.checksum_failures != NULL)
+
+
 /*
  * Tuple insertion/deletion counts for an open transaction can't be propagated
  * into PgStat_TableStatus counters until we know if it is going to commit
@@ -237,11 +312,11 @@ typedef struct TwoPhasePgStatRecord
  bool t_truncated; /* was the relation truncated? */
 } TwoPhasePgStatRecord;
 
-/*
- * Info about current "snapshot" of stats file
- */
+/* Variables for backend status snapshot */
 static MemoryContext pgStatLocalContext = NULL;
-static HTAB *pgStatDBHash = NULL;
+static MemoryContext pgStatSnapshotContext = NULL;
+static HTAB *pgStatLocalHash = NULL;
+static bool clear_snapshot = false;
 
 /* Status for backends including auxiliary */
 static LocalPgBackendStatus *localBackendStatusTable = NULL;
@@ -250,23 +325,35 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL;
 static int localNumBackends = 0;
 
 /*
- * Cluster wide statistics, kept in the stats collector.
- * Contains statistics that are not collected per database
- * or per table.
+ * Struct for context for pgstat_flush_* functions
+ *
+ * To avoid repeated attach/detach of the same dshash, a dshash once attached
+ * is stored in this structure and passed around across multiple calls and
+ * functions. "generation" here means the value returned by pin_hashes().
  */
-static PgStat_ArchiverStats archiverStats;
-static PgStat_GlobalStats globalStats;
+typedef struct pgstat_flush_stat_context
+{
+ int shgeneration; /* "generation" of shdb_tabhash below */
+ PgStat_StatDBEntry *shdbentry; /* dbentry for shared tables (oid = 0) */
+ dshash_table *shdb_tabhash; /* tabentry dshash of shared tables */
+
+ int mygeneration; /* "generation" of mydb_tabhash below */
+ PgStat_StatDBEntry *mydbentry; /* dbentry for my database */
+ dshash_table *mydb_tabhash; /* tabentry dshash of my database */
+} pgstat_flush_stat_context;
 
 /*
- * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
- * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
- * will write both that DB's data and the shared stats.
+ * Cluster wide statistics.
+ *
+ * Contains statistics that are collected neither on a per-database nor on a
+ * per-table basis.  shared_* points to shared memory and snapshot_* are
+ * backend-local snapshots. Their validity is indicated by
+ * global_snapshot_is_valid.
  */
-static List *pending_write_requests = NIL;
-
-/* Signal handler flags */
-static volatile bool need_exit = false;
-static volatile bool got_SIGHUP = false;
+static bool global_snapshot_is_valid = false;
+static PgStat_ArchiverStats *shared_archiverStats;
+static PgStat_ArchiverStats snapshot_archiverStats;
+static PgStat_GlobalStats *shared_globalStats;
+static PgStat_GlobalStats snapshot_globalStats;
 
 /*
  * Total time charged to functions so far in the current backend.
@@ -280,35 +367,41 @@ static instr_time total_func_time;
  * Local function forward declarations
  * ----------
  */
-#ifdef EXEC_BACKEND
-static pid_t pgstat_forkexec(void);
-#endif
 
-NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-static void pgstat_exit(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
-static void pgstat_sighup_handler(SIGNAL_ARGS);
-
-static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
+static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, int op,
+ PgStat_TableLookupResult *status);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table,
  Oid tableoid, bool create);
-static void pgstat_write_statsfiles(bool permanent, bool allDbs);
-static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
-static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
-static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
-static void backend_read_statsfile(void);
+static void pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
+static void pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
 static void pgstat_read_current_status(void);
-
-static bool pgstat_write_statsfile_needed(void);
-static bool pgstat_db_requested(Oid databaseid);
-
-static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
-static void pgstat_send_funcstats(void);
+static bool pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry);
+static bool pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_update_tabentry(dshash_table *tabhash,
+   PgStat_TableStatus *stat, bool nowait);
+static void pgstat_update_dbentry(PgStat_StatDBEntry *dbentry,
+  PgStat_TableStatus *stat);
 static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid);
 
+static void pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab);
 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
 
 static void pgstat_setup_memcxt(void);
+static void pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry);
+static HTAB *create_tabstat_hash(void);
+static PgStat_SubXactStatus *get_tabstat_stack_level(int nest_level);
+static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level);
+static PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid);
+static void pgstat_snapshot_global_stats(void);
 
 static const char *pgstat_get_wait_activity(WaitEventActivity w);
 static const char *pgstat_get_wait_client(WaitEventClient w);
@@ -316,481 +409,197 @@ static const char *pgstat_get_wait_ipc(WaitEventIPC w);
 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
 static const char *pgstat_get_wait_io(WaitEventIO w);
 
-static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
-static void pgstat_send(void *msg, int len);
-
-static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
-static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
-static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
-static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
-static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
-static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len);
-static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len);
-static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
-static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
-static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
-static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
-static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
-static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
-static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
-static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
-static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
-static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
-static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+/* ------------------------------------------------------------
+ * Local support functions follow
+ * ------------------------------------------------------------
+ */
+static int pin_hashes(PgStat_StatDBEntry *dbentry);
+static void unpin_hashes(PgStat_StatDBEntry *dbentry, int generation);
+static dshash_table *attach_table_hash(PgStat_StatDBEntry *dbent, int gen);
+static dshash_table *attach_function_hash(PgStat_StatDBEntry *dbent, int gen);
+static void reset_dbentry_counters(PgStat_StatDBEntry *dbentry);
 
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
  */
 
-/* ----------
- * pgstat_init() -
- *
- * Called from postmaster at startup. Create the resources required
- * by the statistics collector process.  If unable to do so, do not
- * fail --- better to let the postmaster start with stats collection
- * disabled.
- * ----------
+/*
+ * StatsShmemSize
+ * Compute space needed for stats collector's shared memory
  */
-void
-pgstat_init(void)
+Size
+StatsShmemSize(void)
 {
- ACCEPT_TYPE_ARG3 alen;
- struct addrinfo *addrs = NULL,
-   *addr,
- hints;
- int ret;
- fd_set rset;
- struct timeval tv;
- char test_byte;
- int sel_res;
- int tries = 0;
-
-#define TESTBYTEVAL ((char) 199)
-
- /*
- * This static assertion verifies that we didn't mess up the calculations
- * involved in selecting maximum payload sizes for our UDP messages.
- * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would
- * be silent performance loss from fragmentation, it seems worth having a
- * compile-time cross-check that we didn't.
- */
- StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE,
- "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE");
-
- /*
- * Create the UDP socket for sending and receiving statistic messages
- */
- hints.ai_flags = AI_PASSIVE;
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_DGRAM;
- hints.ai_protocol = 0;
- hints.ai_addrlen = 0;
- hints.ai_addr = NULL;
- hints.ai_canonname = NULL;
- hints.ai_next = NULL;
- ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
- if (ret || !addrs)
- {
- ereport(LOG,
- (errmsg("could not resolve \"localhost\": %s",
- gai_strerror(ret))));
- goto startup_failed;
- }
-
- /*
- * On some platforms, pg_getaddrinfo_all() may return multiple addresses
- * only one of which will actually work (eg, both IPv6 and IPv4 addresses
- * when kernel will reject IPv6).  Worse, the failure may occur at the
- * bind() or perhaps even connect() stage.  So we must loop through the
- * results till we find a working combination. We will generate LOG
- * messages, but no error, for bogus combinations.
- */
- for (addr = addrs; addr; addr = addr->ai_next)
- {
-#ifdef HAVE_UNIX_SOCKETS
- /* Ignore AF_UNIX sockets, if any are returned. */
- if (addr->ai_family == AF_UNIX)
- continue;
-#endif
-
- if (++tries > 1)
- ereport(LOG,
- (errmsg("trying another address for the statistics collector")));
-
- /*
- * Create the socket.
- */
- if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not create socket for statistics collector: %m")));
- continue;
- }
-
- /*
- * Bind it to a kernel assigned port on localhost and get the assigned
- * port via getsockname().
- */
- if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not bind socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- alen = sizeof(pgStatAddr);
- if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not get address of socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * Connect the socket to its own address.  This saves a few cycles by
- * not having to respecify the target address on every send. This also
- * provides a kernel-level check that only packets from this same
- * address will be received.
- */
- if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not connect socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * Try to send and receive a one-byte test message on the socket. This
- * is to catch situations where the socket can be created but will not
- * actually pass data (for instance, because kernel packet filtering
- * rules prevent it).
- */
- test_byte = TESTBYTEVAL;
-
-retry1:
- if (send(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry1; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not send test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /*
- * There could possibly be a little delay before the message can be
- * received.  We arbitrarily allow up to half a second before deciding
- * it's broken.
- */
- for (;;) /* need a loop to handle EINTR */
- {
- FD_ZERO(&rset);
- FD_SET(pgStatSock, &rset);
-
- tv.tv_sec = 0;
- tv.tv_usec = 500000;
- sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
- if (sel_res >= 0 || errno != EINTR)
- break;
- }
- if (sel_res < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("select() failed in statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
- if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
- {
- /*
- * This is the case we actually think is likely, so take pains to
- * give a specific message for it.
- *
- * errno will not be set meaningfully here, so don't use it.
- */
- ereport(LOG,
- (errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("test message did not get through on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- test_byte++; /* just make sure variable is changed */
-
-retry2:
- if (recv(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry2; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not receive test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
- {
- ereport(LOG,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("incorrect test message transmission on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
-
- /* If we get here, we have a working socket */
- break;
- }
-
- /* Did we find a working address? */
- if (!addr || pgStatSock == PGINVALID_SOCKET)
- goto startup_failed;
-
- /*
- * Set the socket to non-blocking IO.  This ensures that if the collector
- * falls behind, statistics messages will be discarded; backends won't
- * block waiting to send messages to the collector.
- */
- if (!pg_set_noblock(pgStatSock))
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not set statistics collector socket to nonblocking mode: %m")));
- goto startup_failed;
- }
-
- /*
- * Try to ensure that the socket's receive buffer is at least
- * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose
- * data.  Use of UDP protocol means that we are willing to lose data under
- * heavy load, but we don't want it to happen just because of ridiculously
- * small default buffer sizes (such as 8KB on older Windows versions).
- */
- {
- int old_rcvbuf;
- int new_rcvbuf;
- ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf);
-
- if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &old_rcvbuf, &rcvbufsize) < 0)
- {
- elog(LOG, "getsockopt(SO_RCVBUF) failed: %m");
- /* if we can't get existing size, always try to set it */
- old_rcvbuf = 0;
- }
-
- new_rcvbuf = PGSTAT_MIN_RCVBUF;
- if (old_rcvbuf < new_rcvbuf)
- {
- if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0)
- elog(LOG, "setsockopt(SO_RCVBUF) failed: %m");
- }
- }
-
- pg_freeaddrinfo_all(hints.ai_family, addrs);
-
- return;
-
-startup_failed:
- ereport(LOG,
- (errmsg("disabling statistics collector for lack of working socket")));
-
- if (addrs)
- pg_freeaddrinfo_all(hints.ai_family, addrs);
-
- if (pgStatSock != PGINVALID_SOCKET)
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
-
- /*
- * Adjust GUC variables to suppress useless activity, and for debugging
- * purposes (seeing track_counts off is a clue that we failed here). We
- * use PGC_S_OVERRIDE because there is no point in trying to turn it back
- * on from postgresql.conf without a restart.
- */
- SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
+ return sizeof(StatsShmemStruct);
 }
 
 /*
- * subroutine for pgstat_reset_all
+ * StatsShmemInit - initialize during shared-memory creation
+ */
+void
+StatsShmemInit(void)
+{
+ bool found;
+
+ StatsShmem = (StatsShmemStruct *)
+ ShmemInitStruct("Stats area", StatsShmemSize(),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ Assert(!found);
+
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+ }
+
+ LWLockInitialize(StatsLock, LWTRANCHE_STATS);
+}
+
+/* ----------
+ * pgstat_attach_shared_stats() -
+ *
+ * Attach shared or create stats memory.
+ * ---------
  */
 static void
-pgstat_reset_remove_files(const char *directory)
+pgstat_attach_shared_stats(void)
 {
- DIR   *dir;
- struct dirent *entry;
- char fname[MAXPGPATH * 2];
+ MemoryContext oldcontext;
 
- dir = AllocateDir(directory);
- while ((entry = ReadDir(dir, directory)) != NULL)
+ /*
+ * Don't use DSM in the postmaster itself, nor when not tracking counts.
+ */
+ if (!pgstat_track_counts || !IsUnderPostmaster)
+ return;
+
+ pgstat_setup_memcxt();
+
+ if (area)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+
+ if (StatsShmem->refcount > 0)
+ StatsShmem->refcount++;
+ else
  {
- int nchars;
- Oid tmp_oid;
+ /* Need to create shared memory area and load saved stats if any. */
+ Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID);
 
- /*
- * Skip directory entries that don't match the file names we write.
- * See get_dbstat_filename for the database-specific pattern.
- */
- if (strncmp(entry->d_name, "global.", 7) == 0)
- nchars = 7;
- else
- {
- nchars = 0;
- (void) sscanf(entry->d_name, "db_%u.%n",
-  &tmp_oid, &nchars);
- if (nchars <= 0)
- continue;
- /* %u allows leading whitespace, so reject that */
- if (strchr("0123456789", entry->d_name[3]) == NULL)
- continue;
- }
+ /* Initialize shared memory area */
+ area = dsa_create(LWTRANCHE_STATS);
+ pgStatDBHash = dshash_create(area, &dsh_dbparams, 0);
 
- if (strcmp(entry->d_name + nchars, "tmp") != 0 &&
- strcmp(entry->d_name + nchars, "stat") != 0)
- continue;
+ StatsShmem->stats_dsa_handle = dsa_get_handle(area);
+ StatsShmem->global_stats =
+ dsa_allocate0(area, sizeof(PgStat_GlobalStats));
+ StatsShmem->archiver_stats =
+ dsa_allocate0(area, sizeof(PgStat_ArchiverStats));
+ StatsShmem->db_hash_handle = dshash_get_hash_table_handle(pgStatDBHash);
 
- snprintf(fname, sizeof(fname), "%s/%s", directory,
- entry->d_name);
- unlink(fname);
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
+
+ /* Load saved data if any. */
+ pgstat_read_statsfiles();
+
+ StatsShmem->refcount = 1;
  }
- FreeDir(dir);
+
+ LWLockRelease(StatsLock);
+
+ /*
+ * If we're not the first process, attach existing shared stats area
+ * outside StatsLock.
+ */
+ if (!area)
+ {
+ /* Shared area already exists. Just attach it. */
+ area = dsa_attach(StatsShmem->stats_dsa_handle);
+ pgStatDBHash = dshash_attach(area, &dsh_dbparams,
+ StatsShmem->db_hash_handle, 0);
+
+ /* Setup local variables */
+ pgStatLocalHash = NULL;
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ dsa_pin_mapping(area);
+ global_snapshot_is_valid = false;
+}
+
+/* ----------
+ * pgstat_detach_shared_stats() -
+ *
+ * Detach shared stats. Write out to file if we're the last process and
+ * instructed to write file.
+ * ----------
+ */
+static void
+pgstat_detach_shared_stats(bool write_stats)
+{
+ if (!area || !IsUnderPostmaster)
+ return;
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+
+ /* write out the shared stats to file if needed */
+ if (--StatsShmem->refcount < 1)
+ {
+ if (write_stats)
+ pgstat_write_statsfiles();
+
+ /* We're the last process. Invalidate the dsa area handle. */
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+ }
+
+ LWLockRelease(StatsLock);
+
+ /*
+ * Detach the area.  It is automatically destroyed when the last process
+ * detaches from it.
+ */
+ dsa_detach(area);
+
+ area = NULL;
+ pgStatDBHash = NULL;
+ shared_globalStats = NULL;
+ shared_archiverStats = NULL;
+ pgStatLocalHash = NULL;
+ global_snapshot_is_valid = false;
 }
 
 /*
  * pgstat_reset_all() -
  *
- * Remove the stats files.  This is currently used only if WAL
- * recovery is needed after a crash.
+ * Remove the stats file.  This is currently used only if WAL recovery is
+ * needed after a crash.
  */
 void
 pgstat_reset_all(void)
 {
- pgstat_reset_remove_files(pgstat_stat_directory);
- pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
-}
+ /* we must have shared stats attached */
+ Assert (StatsShmem->stats_dsa_handle != DSM_HANDLE_INVALID);
 
-#ifdef EXEC_BACKEND
-
-/*
- * pgstat_forkexec() -
- *
- * Format up the arglist for, then fork and exec, statistics collector process
- */
-static pid_t
-pgstat_forkexec(void)
-{
- char   *av[10];
- int ac = 0;
-
- av[ac++] = "postgres";
- av[ac++] = "--forkcol";
- av[ac++] = NULL; /* filled in by postmaster_forkexec */
-
- av[ac] = NULL;
- Assert(ac < lengthof(av));
-
- return postmaster_forkexec(ac, av);
-}
-#endif /* EXEC_BACKEND */
-
-
-/*
- * pgstat_start() -
- *
- * Called from postmaster at startup or after an existing collector
- * died.  Attempt to fire up a fresh statistics collector.
- *
- * Returns PID of child process, or 0 if fail.
- *
- * Note: if fail, we will be called again from the postmaster main loop.
- */
-int
-pgstat_start(void)
-{
- time_t curtime;
- pid_t pgStatPid;
+ /* Startup must be the only user of shared stats */
+ Assert (StatsShmem->refcount == 1);
 
  /*
- * Check that the socket is there, else pgstat_init failed and we can do
- * nothing useful.
+ * We could directly remove files and recreate the shared memory area. But
+ * we just detach and re-attach for simplicity.
  */
- if (pgStatSock == PGINVALID_SOCKET)
- return 0;
-
- /*
- * Do nothing if too soon since last collector start.  This is a safety
- * valve to protect against continuous respawn attempts if the collector
- * is dying immediately at launch.  Note that since we will be re-called
- * from the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgstat_start_time) <
- (unsigned int) PGSTAT_RESTART_INTERVAL)
- return 0;
- last_pgstat_start_time = curtime;
-
- /*
- * Okay, fork off the collector.
- */
-#ifdef EXEC_BACKEND
- switch ((pgStatPid = pgstat_forkexec()))
-#else
- switch ((pgStatPid = fork_process()))
-#endif
- {
- case -1:
- ereport(LOG,
- (errmsg("could not fork statistics collector: %m")));
- return 0;
-
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
-
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
-
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
-
- PgstatCollectorMain(0, NULL);
- break;
-#endif
-
- default:
- return (int) pgStatPid;
- }
-
- /* shouldn't get here */
- return 0;
-}
-
-void
-allow_immediate_pgstat_restart(void)
-{
- last_pgstat_start_time = 0;
+ pgstat_detach_shared_stats(false); /* Don't write */
+ pgstat_attach_shared_stats();
 }
 
 /* ------------------------------------------------------------
@@ -798,75 +607,293 @@ allow_immediate_pgstat_restart(void)
  *------------------------------------------------------------
  */
 
-
 /* ----------
  * pgstat_report_stat() -
  *
  * Must be called by processes that performs DML: tcop/postgres.c, logical
- * receiver processes, SPI worker, etc. to send the so far collected
- * per-table and function usage statistics to the collector.  Note that this
- * is called only when not within a transaction, so it is fair to use
+ * receiver processes, SPI worker, etc. to apply the so far collected
+ * per-table and function usage statistics to the shared statistics hashes.
+ *
+ *  Updates are applied no more frequently than the interval of
+ *  PGSTAT_STAT_MIN_INTERVAL milliseconds. They are also postponed on lock
+ *  failure if force is false and there's no pending updates longer than
+ *  PGSTAT_STAT_MAX_INTERVAL milliseconds. Postponed updates are retried in
+ *  succeeding calls of this function.
+ *
+ * Returns the time until the next timing when updates are applied in
+ * milliseconds if there are no updates held for more than
+ * PGSTAT_STAT_MIN_INTERVAL milliseconds.
+ *
+ * Note that this is called only out of a transaction, so it is fine to use
  * transaction stop time as an approximation of current time.
- * ----------
+ * ----------
  */
-void
+long
 pgstat_report_stat(bool force)
 {
- /* we assume this inits to all zeroes: */
- static const PgStat_TableCounts all_zeroes;
- static TimestampTz last_report = 0;
-
+ static TimestampTz next_flush = 0;
+ static TimestampTz pending_since = 0;
  TimestampTz now;
- PgStat_MsgTabstat regular_msg;
- PgStat_MsgTabstat shared_msg;
- TabStatusArray *tsa;
- int i;
+ pgstat_flush_stat_context cxt = {0};
+ bool pending_stats = false;
+ long elapsed;
+ long secs;
+ int usecs;
 
  /* Don't expend a clock check if nothing to do */
- if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
- pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
- !have_function_stats)
- return;
+ if (area == NULL ||
+ ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
+ pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
+ !HAVE_PENDING_DBSTATS()  && !have_function_stats))
+ return 0;
+
+ now = GetCurrentTransactionStopTimestamp();
+
+ if (!force)
+ {
+ /*
+ * Don't flush stats unless it's time.  Returns the time to wait in
+ * milliseconds.
+ */
+ if (now < next_flush)
+ {
+ /* Record the oldest pending update if not yet. */
+ if (pending_since == 0)
+ pending_since = now;
+
+ /* now < next_flush here */
+ return (next_flush - now) / 1000;
+ }
+
+ /*
+ * Don't keep pending updates longer than PGSTAT_STAT_MAX_INTERVAL.
+ */
+ if (pending_since > 0)
+ {
+ TimestampDifference(pending_since, now, &secs, &usecs);
+ elapsed = secs * 1000 + usecs /1000;
+
+ if(elapsed > PGSTAT_STAT_MAX_INTERVAL)
+ force = true;
+ }
+ }
+
+ /* Flush out table stats */
+ if (pgStatTabList != NULL && !pgstat_flush_stat(&cxt, !force))
+ pending_stats = true;
+
+ /* Flush out function stats */
+ if (pgStatFunctions != NULL && !pgstat_flush_funcstats(&cxt, !force))
+ pending_stats = true;
+
+ /* Flush out database-wide stats */
+ if (HAVE_PENDING_DBSTATS())
+ {
+ if (!pgstat_flush_dbstats(&cxt, !force))
+ pending_stats = true;
+ }
+
+ /* Unpin dbentry if pinned */
+ if (cxt.mydb_tabhash)
+ {
+ dshash_detach(cxt.mydb_tabhash);
+ unpin_hashes(cxt.mydbentry, cxt.mygeneration);
+ cxt.mydb_tabhash = NULL;
+ cxt.mydbentry = NULL;
+ }
+
+ /* Publish the last flush time */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ if (shared_globalStats->stats_timestamp < now)
+ shared_globalStats->stats_timestamp = now;
+ LWLockRelease(StatsLock);
+
+ /* Record how long we have been keeping pending updates. */
+ if (pending_stats)
+ {
+ /* Preserve the first value */
+ if (pending_since == 0)
+ pending_since = now;
+
+ /*
+ * It's possible that the retry interval is longer than the limit by
+ * PGSTAT_STAT_MAX_INTERVAL. We don't bother that since it's not so
+ * much.
+ */
+ return PGSTAT_STAT_RETRY_INTERVAL;
+ }
+
+ /* Set the next time to update stats */
+ next_flush = now + PGSTAT_STAT_MIN_INTERVAL * 1000;
+ pending_since = 0;
+
+ return 0;
+}
+
+/*
+ * snapshot_statentry() - Common routine for functions
+ * pgstat_fetch_stat_*entry()
+ *
+ *  Returns the pointer to a snapshot of a shared entry for the key or NULL if
+ *  not found. Returned snapshots are stable during the current transaction or
+ *  until pgstat_clear_snapshot() is called.
+ *
+ *  The snapshots are stored in a hash, pointer to which is stored in the
+ *  *HTAB variable pointed by cxt->hash. If not created yet, it is created
+ *  using hash_name, hash_entsize in cxt.
+ *
+ *  cxt->dshash points to dshash_table for dbstat entries. If not yet
+ *  attached, it is attached using cxt->dsh_handle.
+ */
+static void *
+snapshot_statentry(pgstat_snapshot_param *cxt, Oid key)
+{
+ PgStat_snapshot *lentry = NULL;
+ size_t keysize = cxt->dsh_params->key_size;
+ size_t dsh_entrysize = cxt->dsh_params->entry_size;
+ bool found;
 
  /*
- * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
- * msec since we last sent one, or the caller wants to force stats out.
+ * We don't want overly frequent updates of the stats snapshot. Keep it
+ * at least for PGSTAT_STAT_MIN_INTERVAL ms. Don't postpone, just ignore the cue.
  */
- now = GetCurrentTransactionStopTimestamp();
- if (!force &&
- !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
- return;
- last_report = now;
+ if (clear_snapshot)
+ {
+ clear_snapshot = false;
+
+ if (pgStatSnapshotContext &&
+ snapshot_globalStats.stats_timestamp <
+ GetCurrentStatementStartTimestamp() -
+ PGSTAT_STAT_MIN_INTERVAL * 1000)
+ {
+ MemoryContextReset(pgStatSnapshotContext);
+
+ /* Reset variables */
+ global_snapshot_is_valid = false;
+ pgStatSnapshotContext = NULL;
+ pgStatLocalHash = NULL;
+
+ pgstat_setup_memcxt();
+ }
+ }
+
+ /*
+ * Create new hash, with rather arbitrary initial number of entries since
+ * we don't know how this hash will grow.
+ */
+ if (!*cxt->hash)
+ {
+ HASHCTL ctl;
+
+ /*
+ * Create the hash in the stats context
+ *
+ * The entry is prepended by common header part represented by
+ * PgStat_snapshot.
+ */
+
+ ctl.keysize = keysize;
+ ctl.entrysize = offsetof(PgStat_snapshot, body) + cxt->hash_entsize;
+ ctl.hcxt = pgStatSnapshotContext;
+ *cxt->hash = hash_create(cxt->hash_name, 32, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ }
+
+ lentry = hash_search(*cxt->hash, &key, HASH_ENTER, &found);
+
+ /*
+ * Refer shared hash if not found in the local hash. We return up-to-date
+ * entries outside a transaction so do the same even if the snapshot is
+ * found.
+ */
+ if (!found || !IsTransactionState())
+ {
+ void *sentry;
+
+ /* attach shared hash if not given, leave it alone for later use */
+ if (!*cxt->dshash)
+ {
+ MemoryContext oldcxt;
+
+ Assert (cxt->dsh_handle != DSM_HANDLE_INVALID);
+ oldcxt = MemoryContextSwitchTo(pgStatSnapshotContext);
+ *cxt->dshash =
+ dshash_attach(area, cxt->dsh_params, cxt->dsh_handle, NULL);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ sentry = dshash_find(*cxt->dshash, &key, false);
+
+ if (sentry)
+ {
+ /*
+ * In transaction state, it is obvious that we should create local
+ * cache entries for consistency. If we are not, we return an
+ * up-to-date entry. Having said that, we need a local copy since
+ * dshash entry must be released immediately. We share the same
+ * local hash entry for the purpose.
+ */
+ memcpy(&lentry->body, sentry, dsh_entrysize);
+ dshash_release_lock(*cxt->dshash, sentry);
+
+ /* then zero out the local additional space if any */
+ if (dsh_entrysize < cxt->hash_entsize)
+ MemSet((char *)&lentry->body + dsh_entrysize, 0,
+   cxt->hash_entsize - dsh_entrysize);
+ }
+
+ lentry->negative = !sentry;
+ }
+
+ if (lentry->negative)
+ return NULL;
+
+ return &lentry->body;
+}
+
+/*
+ * pgstat_flush_stat: Flushes table stats out to shared statistics.
+ *
+ *  If nowait is true, returns false if required lock was not acquired
+ *  immediately. In that case, unapplied table stats updates are left alone in
+ *  TabStatusArray to wait for the next chance. cxt holds some dshash related
+ *  values that we want to carry around while updating shared stats.
+ *
+ *  Returns true if all stats info are flushed. Caller must detach dshashes
+ *  stored in cxt after use.
+ */
+static bool
+pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait)
+{
+ static const PgStat_TableCounts all_zeroes;
+ TabStatusArray *tsa;
+ HTAB   *new_tsa_hash = NULL;
+ TabStatusArray *dest_tsa = pgStatTabList;
+ int dest_elem = 0;
+ int i;
+
+ /* nothing to do, just return  */
+ if (pgStatTabHash == NULL)
+ return true;
 
  /*
  * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry
- * entries it points to.  (Should we fail partway through the loop below,
- * it's okay to have removed the hashtable already --- the only
- * consequence is we'd get multiple entries for the same table in the
- * pgStatTabList, and that's safe.)
+ * entries it points to.
  */
- if (pgStatTabHash)
- hash_destroy(pgStatTabHash);
+ hash_destroy(pgStatTabHash);
  pgStatTabHash = NULL;
 
  /*
  * Scan through the TabStatusArray struct(s) to find tables that actually
- * have counts, and build messages to send.  We have to separate shared
- * relations from regular ones because the databaseid field in the message
- * header has to depend on that.
+ * have counts, and try flushing it out to shared stats. We may fail on
+ * some entries in the array. Leaving the entries being packed at the
+ * beginning of the array.
  */
- regular_msg.m_databaseid = MyDatabaseId;
- shared_msg.m_databaseid = InvalidOid;
- regular_msg.m_nentries = 0;
- shared_msg.m_nentries = 0;
-
  for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
  {
  for (i = 0; i < tsa->tsa_used; i++)
  {
  PgStat_TableStatus *entry = &tsa->tsa_entries[i];
- PgStat_MsgTabstat *this_msg;
- PgStat_TableEntry *this_ent;
 
  /* Shouldn't have any pending transaction-dependent counts */
  Assert(entry->trans == NULL);
@@ -879,178 +906,352 @@ pgstat_report_stat(bool force)
    sizeof(PgStat_TableCounts)) == 0)
  continue;
 
- /*
- * OK, insert data into the appropriate message, and send if full.
- */
- this_msg = entry->t_shared ? &shared_msg : &regular_msg;
- this_ent = &this_msg->m_entry[this_msg->m_nentries];
- this_ent->t_id = entry->t_id;
- memcpy(&this_ent->t_counts, &entry->t_counts,
-   sizeof(PgStat_TableCounts));
- if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+ /* try to apply the tab stats */
+ if (!pgstat_flush_tabstat(cxt, nowait, entry))
  {
- pgstat_send_tabstat(this_msg);
- this_msg->m_nentries = 0;
+ /*
+ * Failed. Move it to the beginning in TabStatusArray and
+ * leave it.
+ */
+ TabStatHashEntry *hash_entry;
+ bool found;
+
+ if (new_tsa_hash == NULL)
+ new_tsa_hash = create_tabstat_hash();
+
+ /* Create hash entry for this entry */
+ hash_entry = hash_search(new_tsa_hash, &entry->t_id,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /*
+ * Move insertion pointer to the next segment if the segment
+ * is filled up.
+ */
+ if (dest_elem >= TABSTAT_QUANTUM)
+ {
+ Assert(dest_tsa->tsa_next != NULL);
+ dest_tsa = dest_tsa->tsa_next;
+ dest_elem = 0;
+ }
+
+ /*
+ * Pack the entry at the beginning of the array. Do nothing if
+ * no need to be moved.
+ */
+ if (tsa != dest_tsa || i != dest_elem)
+ {
+ PgStat_TableStatus *new_entry;
+ new_entry = &dest_tsa->tsa_entries[dest_elem];
+ *new_entry = *entry;
+
+ /* use new_entry as entry hereafter */
+ entry = new_entry;
+ }
+
+ hash_entry->tsa_entry = entry;
+ dest_elem++;
  }
  }
- /* zero out PgStat_TableStatus structs after use */
- MemSet(tsa->tsa_entries, 0,
-   tsa->tsa_used * sizeof(PgStat_TableStatus));
- tsa->tsa_used = 0;
  }
 
- /*
- * Send partial messages.  Make sure that any pending xact commit/abort
- * gets counted, even if there are no table stats to send.
- */
- if (regular_msg.m_nentries > 0 ||
- pgStatXactCommit > 0 || pgStatXactRollback > 0)
- pgstat_send_tabstat(&regular_msg);
- if (shared_msg.m_nentries > 0)
- pgstat_send_tabstat(&shared_msg);
+ /* zero out unused area of TableStatus */
+ dest_tsa->tsa_used = dest_elem;
+ MemSet(&dest_tsa->tsa_entries[dest_elem], 0,
+   (TABSTAT_QUANTUM - dest_elem) * sizeof(PgStat_TableStatus));
+ while (dest_tsa->tsa_next)
+ {
+ dest_tsa = dest_tsa->tsa_next;
+ MemSet(dest_tsa->tsa_entries, 0,
+   dest_tsa->tsa_used * sizeof(PgStat_TableStatus));
+ dest_tsa->tsa_used = 0;
+ }
 
- /* Now, send function statistics */
- pgstat_send_funcstats();
+ /* and set the new TabStatusArray hash if any */
+ pgStatTabHash = new_tsa_hash;
+
+ /*
+ * We no longer need shared database and table entries, but that for my
+ * database may be used later.
+ */
+ if (cxt->shdb_tabhash)
+ {
+ dshash_detach(cxt->shdb_tabhash);
+ unpin_hashes(cxt->shdbentry, cxt->shgeneration);
+ cxt->shdb_tabhash = NULL;
+ cxt->shdbentry = NULL;
+ }
+
+ return pgStatTabHash == NULL;
 }
 
-/*
- * Subroutine for pgstat_report_stat: finish and send a tabstat message
+/* -------
+ * Subroutines for pgstat_flush_stat.
+ * -------
  */
-static void
-pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+/*
+ * pgstat_flush_tabstat: Flushes a table stats entry.
+ *
+ *  If nowait is true, returns false on lock failure.  Dshashes for table and
+ *  function stats are kept attached in ctx. The caller must detach them after
+ *  use.
+ *
+ *  Returns true if the entry is flushed out.
+ */
+bool
+pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry)
 {
- int n;
- int len;
+ Oid dboid = entry->t_shared ? InvalidOid : MyDatabaseId;
+ int table_mode = PGSTAT_EXCLUSIVE;
+ bool updated = false;
+ dshash_table *tabhash;
+ PgStat_StatDBEntry *dbent;
+ int generation;
 
- /* It's unlikely we'd get here with no socket, but maybe not impossible */
- if (pgStatSock == PGINVALID_SOCKET)
- return;
+ if (nowait)
+ table_mode |= PGSTAT_NOWAIT;
 
- /*
- * Report and reset accumulated xact commit/rollback and I/O timings
- * whenever we send a normal tabstat message
- */
- if (OidIsValid(tsmsg->m_databaseid))
+ /* Attach required table hash if not yet. */
+ if ((entry->t_shared ? cxt->shdb_tabhash : cxt->mydb_tabhash) == NULL)
  {
- tsmsg->m_xact_commit = pgStatXactCommit;
- tsmsg->m_xact_rollback = pgStatXactRollback;
- tsmsg->m_block_read_time = pgStatBlockReadTime;
- tsmsg->m_block_write_time = pgStatBlockWriteTime;
- pgStatXactCommit = 0;
- pgStatXactRollback = 0;
- pgStatBlockReadTime = 0;
- pgStatBlockWriteTime = 0;
+ /*
+ *  Return if we don't have corresponding dbentry. It would've been
+ *  removed.
+ */
+ dbent = pgstat_get_db_entry(dboid, table_mode, NULL);
+ if (!dbent)
+ return false;
+
+ /*
+ * We don't hold lock on the dbentry since it cannot be dropped while
+ * we are working on it.
+ */
+ generation = pin_hashes(dbent);
+ tabhash = attach_table_hash(dbent, generation);
+
+ if (entry->t_shared)
+ {
+ cxt->shgeneration = generation;
+ cxt->shdbentry = dbent;
+ cxt->shdb_tabhash = tabhash;
+ }
+ else
+ {
+ cxt->mygeneration = generation;
+ cxt->mydbentry = dbent;
+ cxt->mydb_tabhash = tabhash;
+
+ /*
+ * We come here once per database. Take the chance to update
+ * database-wide stats
+ */
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
+ dbent->n_xact_commit += pgStatXactCommit;
+ dbent->n_xact_rollback += pgStatXactRollback;
+ dbent->n_block_read_time += pgStatBlockReadTime;
+ dbent->n_block_write_time += pgStatBlockWriteTime;
+ LWLockRelease(&dbent->lock);
+ pgStatXactCommit = 0;
+ pgStatXactRollback = 0;
+ pgStatBlockReadTime = 0;
+ pgStatBlockWriteTime = 0;
+ }
+ }
+ else if (entry->t_shared)
+ {
+ dbent = cxt->shdbentry;
+ tabhash = cxt->shdb_tabhash;
  }
  else
  {
- tsmsg->m_xact_commit = 0;
- tsmsg->m_xact_rollback = 0;
- tsmsg->m_block_read_time = 0;
- tsmsg->m_block_write_time = 0;
+ dbent = cxt->mydbentry;
+ tabhash = cxt->mydb_tabhash;
  }
 
- n = tsmsg->m_nentries;
- len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
- n * sizeof(PgStat_TableEntry);
 
- pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
- pgstat_send(tsmsg, len);
+ /*
+ * Local table stats should be applied to both dbentry and tabentry at
+ * once. Update dbentry only if we could update tabentry.
+ */
+ if (pgstat_update_tabentry(tabhash, entry, nowait))
+ {
+ pgstat_update_dbentry(dbent, entry);
+ updated = true;
+ }
+
+ return updated;
 }
 
 /*
- * Subroutine for pgstat_report_stat: populate and send a function stat message
+ * pgstat_flush_funcstats: Flushes function stats.
+ *
+ *  If nowait is true, returns false on lock failure. Unapplied local hash
+ *  entries are left alone.
+ *
+ *  Returns true if all entries are flushed out.
  */
-static void
-pgstat_send_funcstats(void)
+static bool
+pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait)
 {
  /* we assume this inits to all zeroes: */
  static const PgStat_FunctionCounts all_zeroes;
-
- PgStat_MsgFuncstat msg;
- PgStat_BackendFunctionEntry *entry;
+ dshash_table   *funchash;
  HASH_SEQ_STATUS fstat;
+ PgStat_BackendFunctionEntry *bestat;
 
+ /* nothing to do, just return  */
  if (pgStatFunctions == NULL)
- return;
+ return true;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_nentries = 0;
-
- hash_seq_init(&fstat, pgStatFunctions);
- while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
+ /* get dbentry into cxt if not yet.  */
+ if (cxt->mydbentry == NULL)
  {
- PgStat_FunctionEntry *m_ent;
+ int op = PGSTAT_EXCLUSIVE;
 
- /* Skip it if no counts accumulated since last time */
- if (memcmp(&entry->f_counts, &all_zeroes,
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
+
+ if (cxt->mydbentry == NULL)
+ return false;
+
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ funchash = attach_function_hash(cxt->mydbentry, cxt->mygeneration);
+ if (funchash == NULL)
+ return false;
+
+ have_function_stats = false;
+
+ /*
+ * Scan through the pgStatFunctions to find functions that actually have
+ * counts, and try flushing it out to shared stats.
+ */
+ hash_seq_init(&fstat, pgStatFunctions);
+ while ((bestat = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
+ {
+ bool found;
+ PgStat_StatFuncEntry *funcent = NULL;
+
+ /* Skip it if no counts accumulated for it so far */
+ if (memcmp(&bestat->f_counts, &all_zeroes,
    sizeof(PgStat_FunctionCounts)) == 0)
  continue;
 
- /* need to convert format of time accumulators */
- m_ent = &msg.m_entry[msg.m_nentries];
- m_ent->f_id = entry->f_id;
- m_ent->f_numcalls = entry->f_counts.f_numcalls;
- m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time);
- m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time);
+ funcent = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert_extended(funchash, (void *) &(bestat->f_id),
+   &found, nowait);
 
- if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
+ /*
+ * We couldn't acquire lock on the required entry. Leave the local
+ * entry alone.
+ */
+ if (!funcent)
  {
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
- msg.m_nentries = 0;
+ have_function_stats = true;
+ continue;
  }
 
- /* reset the entry's counts */
- MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
+ /* Initialize if it's new, or add to it. */
+ if (!found)
+ {
+ funcent->functionid = bestat->f_id;
+ funcent->f_numcalls = bestat->f_counts.f_numcalls;
+ funcent->f_total_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ else
+ {
+ funcent->f_numcalls += bestat->f_counts.f_numcalls;
+ funcent->f_total_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ dshash_release_lock(funchash, funcent);
+
+ /* reset used counts */
+ MemSet(&bestat->f_counts, 0, sizeof(PgStat_FunctionCounts));
  }
 
- if (msg.m_nentries > 0)
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
-
- have_function_stats = false;
+ return !have_function_stats;
 }
 
+/*
+ * pgstat_flush_dbstats: Flushes out miscellaneous database stats.
+ *
+ *  If nowait is true, returns with false on lock failure on dbentry.
+ *
+ *  Returns true if all stats are flushed out.
+ */
+static bool
+pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait)
+{
+ /* get dbentry if not yet.  */
+ if (cxt->mydbentry == NULL)
+ {
+ int op = PGSTAT_EXCLUSIVE;
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
+
+ /* return if lock failed. */
+ if (cxt->mydbentry == NULL)
+ return false;
+
+ /* we use this generation of table /function stats in this turn */
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ LWLockAcquire(&cxt->mydbentry->lock, LW_EXCLUSIVE);
+ if (HAVE_PENDING_CONFLICTS())
+ pgstat_flush_recovery_conflict(cxt->mydbentry);
+ if (BeDBStats.n_deadlocks != 0)
+ pgstat_flush_deadlock(cxt->mydbentry);
+ if (BeDBStats.n_tmpfiles != 0)
+ pgstat_flush_tempfile(cxt->mydbentry);
+ if (BeDBStats.checksum_failures != NULL)
+ pgstat_flush_checksum_failure(cxt->mydbentry);
+ LWLockRelease(&cxt->mydbentry->lock);
+
+ return true;
+}
 
 /* ----------
  * pgstat_vacuum_stat() -
  *
- * Will tell the collector about objects he can get rid of.
+ * Remove objects we can get rid of.
  * ----------
  */
 void
 pgstat_vacuum_stat(void)
 {
- HTAB   *htab;
- PgStat_MsgTabpurge msg;
- PgStat_MsgFuncpurge f_msg;
- HASH_SEQ_STATUS hstat;
+ HTAB   *oidtab;
+ dshash_seq_status dshstat;
  PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- PgStat_StatFuncEntry *funcentry;
- int len;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* we don't collect stats under standalone mode */
+ if (!IsUnderPostmaster)
  return;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
-
  /*
  * Read pg_database and make a list of OIDs of all existing databases
  */
- htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
+ oidtab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
 
  /*
- * Search the database hash table for dead databases and tell the
- * collector to drop them.
+ * Search the database hash table for dead databases and drop them
+ * from the hash.
  */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+
+ dshash_seq_init(&dshstat, pgStatDBHash, false, true);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&dshstat)) != NULL)
  {
  Oid dbid = dbentry->databaseid;
 
@@ -1058,137 +1259,43 @@ pgstat_vacuum_stat(void)
 
  /* the DB entry for shared tables (with InvalidOid) is never dropped */
  if (OidIsValid(dbid) &&
- hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
+ hash_search(oidtab, (void *) &dbid, HASH_FIND, NULL) == NULL)
  pgstat_drop_database(dbid);
  }
 
  /* Clean up */
- hash_destroy(htab);
+ hash_destroy(oidtab);
 
  /*
  * Lookup our own database entry; if not found, nothing more to do.
  */
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &MyDatabaseId,
- HASH_FIND, NULL);
- if (dbentry == NULL || dbentry->tables == NULL)
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
  return;
 
  /*
  * Similarly to above, make a list of all known relations in this DB.
  */
- htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
-
- /*
- * Initialize our messages table counter to zero
- */
- msg.m_nentries = 0;
+ oidtab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
 
  /*
  * Check for all tables listed in stats hashtable if they still exist.
+ * Stats cache is useless here so directly search the shared hash.
  */
- hash_seq_init(&hstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid tabid = tabentry->tableid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this table's Oid to the message
- */
- msg.m_tableid[msg.m_nentries++] = tabid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
- {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
-
- msg.m_nentries = 0;
- }
- }
+ pgstat_remove_useless_entries(dbentry->tables, &dsh_tblparams, oidtab);
 
  /*
- * Send the rest
+ * Repeat the above but we needn't bother in the common case where no
+ * function stats are being collected.
  */
- if (msg.m_nentries > 0)
+ if (dbentry->functions != DSM_HANDLE_INVALID)
  {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
+ oidtab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
- }
-
- /* Clean up */
- hash_destroy(htab);
-
- /*
- * Now repeat the above steps for functions.  However, we needn't bother
- * in the common case where no function stats are being collected.
- */
- if (dbentry->functions != NULL &&
- hash_get_num_entries(dbentry->functions) > 0)
- {
- htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
-
- pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
- f_msg.m_databaseid = MyDatabaseId;
- f_msg.m_nentries = 0;
-
- hash_seq_init(&hstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid funcid = funcentry->functionid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this function's Oid to the message
- */
- f_msg.m_functionid[f_msg.m_nentries++] = funcid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
-
- f_msg.m_nentries = 0;
- }
- }
-
- /*
- * Send the rest
- */
- if (f_msg.m_nentries > 0)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
- }
-
- hash_destroy(htab);
+ pgstat_remove_useless_entries(dbentry->functions, &dsh_funcparams,
+  oidtab);
  }
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
 
@@ -1242,66 +1349,99 @@ pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid)
  return htab;
 }
 
+/*
+ * pgstat_remove_useless_entries - Remove useless entries from per
+ * table/function dshashes.
+ *
+ *  Scan the dshash specified by dshhandle removing entries that are not in
+ *  oidtab. oidtab is destroyed before returning.
+ */
+void
+pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab)
+{
+ dshash_table *dshtable;
+ dshash_seq_status dshstat;
+ void *ent;
+
+ dshtable = dshash_attach(area, dshparams, dshhandle, 0);
+ dshash_seq_init(&dshstat, dshtable, false, true);
+
+ while ((ent = dshash_seq_next(&dshstat)) != NULL)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* The first member of the entries must be Oid */
+ if (hash_search(oidtab, ent, HASH_FIND, NULL) != NULL)
+ continue;
+
+ /* Not there, so purge this entry */
+ dshash_delete_entry(dshtable, ent);
+ }
+ dshash_detach(dshtable);
+ hash_destroy(oidtab);
+}
 
 /* ----------
  * pgstat_drop_database() -
  *
- * Tell the collector that we just dropped a database.
- * (If the message gets lost, we will still clean the dead DB eventually
- * via future invocations of pgstat_vacuum_stat().)
+ * Remove entry for the database that we just dropped.
+ *
+ * If some stats are flushed after this, this entry will be re-created but we
+ * will still clean the dead DB eventually via future invocations of
+ * pgstat_vacuum_stat().
  * ----------
  */
 void
 pgstat_drop_database(Oid databaseid)
 {
- PgStat_MsgDropdb msg;
+ PgStat_StatDBEntry *dbentry;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ Assert (OidIsValid(databaseid));
+
+ if (!IsUnderPostmaster || !pgStatDBHash)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
- msg.m_databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
+ /*
+ * Lookup the database in the hashtable with exclusive lock.
+ */
+ dbentry = pgstat_get_db_entry(databaseid, PGSTAT_EXCLUSIVE, NULL);
+
+ /*
+ * If found, remove it.
+ */
+ if (dbentry)
+ {
+ /* LWLock is needed to rewrite */
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* No one is using tables/functions in this dbentry */
+ Assert(dbentry->refcnt == 0);
+
+ /* Remove table/function stats dshash first. */
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ }
+ LWLockRelease(&dbentry->lock);
+
+ dshash_delete_entry(pgStatDBHash, (void *)dbentry);
+ }
 }
 
-
-/* ----------
- * pgstat_drop_relation() -
- *
- * Tell the collector that we just dropped a relation.
- * (If the message gets lost, we will still clean the dead entry eventually
- * via future invocations of pgstat_vacuum_stat().)
- *
- * Currently not used for lack of any good place to call it; we rely
- * entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
- * ----------
- */
-#ifdef NOT_USED
-void
-pgstat_drop_relation(Oid relid)
-{
- PgStat_MsgTabpurge msg;
- int len;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
- msg.m_tableid[0] = relid;
- msg.m_nentries = 1;
-
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
-}
-#endif /* NOT_USED */
-
-
 /* ----------
  * pgstat_reset_counters() -
  *
- * Tell the statistics collector to reset counters for our database.
+ * Reset counters for our database.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1310,20 +1450,32 @@ pgstat_drop_relation(Oid relid)
 void
 pgstat_reset_counters(void)
 {
- PgStat_MsgResetcounter msg;
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ if (!pgStatDBHash)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
+ /*
+ * Lookup the database in the hashtable.  Nothing to do if not there.
+ */
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, &status);
+
+ if (!dbentry)
+ return;
+
+ /* This database is active, safe to release the lock immediately. */
+ dshash_release_lock(pgStatDBHash, dbentry);
+
+ /* Reset database-level stats. */
+ reset_dbentry_counters(dbentry);
+
 }
 
 /* ----------
  * pgstat_reset_shared_counters() -
  *
- * Tell the statistics collector to reset cluster-wide shared counters.
+ * Reset cluster-wide shared counters.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1332,29 +1484,37 @@ pgstat_reset_counters(void)
 void
 pgstat_reset_shared_counters(const char *target)
 {
- PgStat_MsgResetsharedcounter msg;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
+ /* Reset the archiver statistics for the cluster. */
  if (strcmp(target, "archiver") == 0)
- msg.m_resettarget = RESET_ARCHIVER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+ shared_archiverStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ /* Reset the bgwriter statistics for the cluster. */
  else if (strcmp(target, "bgwriter") == 0)
- msg.m_resettarget = RESET_BGWRITER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
+ shared_globalStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
  else
  ereport(ERROR,
  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  errmsg("unrecognized reset target: \"%s\"", target),
  errhint("Target must be \"archiver\" or \"bgwriter\".")));
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
- pgstat_send(&msg, sizeof(msg));
 }
 
 /* ----------
  * pgstat_reset_single_counter() -
  *
- * Tell the statistics collector to reset a single counter.
+ * Reset a single counter.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1363,17 +1523,42 @@ pgstat_reset_shared_counters(const char *target)
 void
 pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 {
- PgStat_MsgResetsinglecounter msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
+
+ if (!dbentry)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER);
- msg.m_databaseid = MyDatabaseId;
- msg.m_resettype = type;
- msg.m_objectid = objoid;
+ /* This database is active, safe to release the lock immediately. */
+ generation = pin_hashes(dbentry);
 
- pgstat_send(&msg, sizeof(msg));
+ /* Set the reset timestamp for the whole database */
+ ts = GetCurrentTimestamp();
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->stat_reset_timestamp = ts;
+ LWLockRelease(&dbentry->lock);
+
+ /* Remove object if it exists, ignore if not */
+ if (type == RESET_TABLE)
+ {
+ dshash_table *t = attach_table_hash(dbentry, generation);
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+
+ if (type == RESET_FUNCTION)
+ {
+ dshash_table *t = attach_function_hash(dbentry, generation);
+ if (t)
+ {
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+ }
+ unpin_hashes(dbentry, generation);
 }
 
 /* ----------
@@ -1387,48 +1572,81 @@ pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 void
 pgstat_report_autovac(Oid dboid)
 {
- PgStat_MsgAutovacStart msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
- msg.m_databaseid = dboid;
- msg.m_start_time = GetCurrentTimestamp();
+ /*
+ * Store the last autovacuum time in the database's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ dshash_release_lock(pgStatDBHash, dbentry);
 
- pgstat_send(&msg, sizeof(msg));
+ ts = GetCurrentTimestamp();
+
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->last_autovac_time = ts;
+ LWLockRelease(&dbentry->lock);
 }
 
 
 /* ---------
  * pgstat_report_vacuum() -
  *
- * Tell the collector about the table we just vacuumed.
+ * Report about the table we just vacuumed.
  * ---------
  */
 void
 pgstat_report_vacuum(Oid tableoid, bool shared,
  PgStat_Counter livetuples, PgStat_Counter deadtuples)
 {
- PgStat_MsgVacuum msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
- msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = tableoid;
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_vacuumtime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = shared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hash table entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+
+ tabentry = pgstat_get_tab_entry(table, tableoid, true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_vacuum_count++;
+ }
+ else
+ {
+ tabentry->vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->vacuum_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_analyze() -
  *
- * Tell the collector about the table we just analyzed.
+ * Report about the table we just analyzed.
  *
  * Caller must provide new live- and dead-tuples estimates, as well as a
  * flag indicating whether to reset the changes_since_analyze counter.
@@ -1439,9 +1657,14 @@ pgstat_report_analyze(Relation rel,
   PgStat_Counter livetuples, PgStat_Counter deadtuples,
   bool resetcounter)
 {
- PgStat_MsgAnalyze msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table   *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
  /*
@@ -1470,78 +1693,153 @@ pgstat_report_analyze(Relation rel,
  deadtuples = Max(deadtuples, 0);
  }
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
- msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = RelationGetRelid(rel);
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_resetcounter = resetcounter;
- msg.m_analyzetime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+ tabentry = pgstat_get_tab_entry(table, RelationGetRelid(rel), true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ /*
+ * If commanded, reset changes_since_analyze to zero.  This forgets any
+ * changes that were committed while the ANALYZE was in progress, but we
+ * have no good way to estimate how many of those there were.
+ */
+ if (resetcounter)
+ tabentry->changes_since_analyze = 0;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_analyze_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_analyze_count++;
+ }
+ else
+ {
+ tabentry->analyze_timestamp = GetCurrentTimestamp();
+ tabentry->analyze_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_recovery_conflict() -
  *
- * Tell the collector about a Hot Standby recovery conflict.
+ * Report a Hot Standby recovery conflict.
  * --------
  */
 void
 pgstat_report_recovery_conflict(int reason)
 {
- PgStat_MsgRecoveryConflict msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_reason = reason;
- pgstat_send(&msg, sizeof(msg));
+ switch (reason)
+ {
+ case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+
+ /*
+ * Since we drop the information about the database as soon as it
+ * replicates, there is no point in counting these conflicts.
+ */
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ BeDBStats.n_conflict_tablespace++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ BeDBStats.n_conflict_lock++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ BeDBStats.n_conflict_snapshot++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ BeDBStats.n_conflict_bufferpin++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ BeDBStats.n_conflict_startup_deadlock++;
+ break;
+ }
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ /* We had a chance to flush immediately */
+ pgstat_flush_recovery_conflict(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
+
+/*
+ * flush recovery conflict stats
+ */
+static void
+pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry)
+{
+ dbentry->n_conflict_tablespace += BeDBStats.n_conflict_tablespace;
+ dbentry->n_conflict_lock += BeDBStats.n_conflict_lock;
+ dbentry->n_conflict_snapshot += BeDBStats.n_conflict_snapshot;
+ dbentry->n_conflict_bufferpin += BeDBStats.n_conflict_bufferpin;
+ dbentry->n_conflict_startup_deadlock += BeDBStats.n_conflict_startup_deadlock;
+
+ BeDBStats.n_conflict_tablespace = 0;
+ BeDBStats.n_conflict_lock = 0;
+ BeDBStats.n_conflict_snapshot = 0;
+ BeDBStats.n_conflict_bufferpin = 0;
+ BeDBStats.n_conflict_startup_deadlock = 0;
 }
 
 /* --------
  * pgstat_report_deadlock() -
  *
- * Tell the collector about a deadlock detected.
+ * Report a deadlock detected.
  * --------
  */
 void
 pgstat_report_deadlock(void)
 {
- PgStat_MsgDeadlock msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
+ BeDBStats.n_deadlocks++;
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
-
-
-/* --------
- * pgstat_report_checksum_failures_in_db() -
- *
- * Tell the collector about one or more checksum failures.
- * --------
+/*
+ * flush deadlock stats
  */
-void
-pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+static void
+pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry)
 {
- PgStat_MsgChecksumFailure msg;
-
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
- return;
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
- msg.m_databaseid = dboid;
- msg.m_failurecount = failurecount;
- msg.m_failure_time = GetCurrentTimestamp();
-
- pgstat_send(&msg, sizeof(msg));
+ dbentry->n_deadlocks += BeDBStats.n_deadlocks;
+ BeDBStats.n_deadlocks = 0;
 }
 
 /* --------
@@ -1559,60 +1857,153 @@ pgstat_report_checksum_failure(void)
 /* --------
  * pgstat_report_tempfile() -
  *
- * Tell the collector about a temporary file.
+ * Report a temporary file.
  * --------
  */
 void
 pgstat_report_tempfile(size_t filesize)
 {
- PgStat_MsgTempFile msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE);
- msg.m_databaseid = MyDatabaseId;
- msg.m_filesize = filesize;
- pgstat_send(&msg, sizeof(msg));
-}
+ if (filesize > 0) /* Is there a case where filesize is really 0? */
+ {
+ BeDBStats.tmpfilesize += filesize; /* needs check overflow */
+ BeDBStats.n_tmpfiles++;
+ }
 
-
-/* ----------
- * pgstat_ping() -
- *
- * Send some junk data to the collector to increase traffic.
- * ----------
- */
-void
-pgstat_ping(void)
-{
- PgStat_MsgDummy msg;
-
- if (pgStatSock == PGINVALID_SOCKET)
+ if (BeDBStats.n_tmpfiles == 0)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
- pgstat_send(&msg, sizeof(msg));
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ /* We had a chance to flush immediately */
+ pgstat_flush_tempfile(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
-/* ----------
- * pgstat_send_inquiry() -
- *
- * Notify collector that we need fresh data.
- * ----------
+/*
+ * flush temporary file stats
  */
 static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
+pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry)
 {
- PgStat_MsgInquiry msg;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
- msg.clock_time = clock_time;
- msg.cutoff_time = cutoff_time;
- msg.databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
+ dbentry->n_temp_bytes += BeDBStats.tmpfilesize;
+ dbentry->n_temp_files += BeDBStats.n_tmpfiles;
+ BeDBStats.tmpfilesize = 0;
+ BeDBStats.n_tmpfiles = 0;
 }
 
+/* --------
+ * pgstat_report_checksum_failures_in_db(dboid, failure_count) -
+ *
+ * Report one or more checksum failures in the given database.
+ * --------
+ */
+void
+pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+{
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
+ ChecksumFailureEnt   *failent = NULL;
+
+ /* return if we are not collecting stats */
+ if (!area)
+ return;
+
+ if (BeDBStats.checksum_failures != NULL)
+ {
+ failent = hash_search(BeDBStats.checksum_failures, &dboid,
+  HASH_FIND, NULL);
+ if (failent)
+ failurecount += failent->count;
+ }
+
+ if (failurecount == 0)
+ return;
+
+ dbentry = pgstat_get_db_entry(dboid,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ {
+ if (!failent)
+ {
+ if (!BeDBStats.checksum_failures)
+ {
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(ChecksumFailureEnt);
+ BeDBStats.checksum_failures =
+ hash_create("pgstat checksum failure count hash",
+ 32, &ctl, HASH_ELEM | HASH_BLOBS);
+ }
+
+ failent = hash_search(BeDBStats.checksum_failures,
+  &dboid, HASH_ENTER, NULL);
+ }
+
+ failent->count = failurecount;
+ return;
+ }
+
+ /* We have a chance to flush immediately; drop the deferred count */
+ dbentry->n_checksum_failures += failurecount;
+ if (failent)
+ hash_search(BeDBStats.checksum_failures, &dboid, HASH_REMOVE, NULL);
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
+
+/*
+ * flush checksum failure counts for all databases
+ */
+static void
+pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry)
+{
+ HASH_SEQ_STATUS stat;
+ ChecksumFailureEnt *ent;
+ bool release_dbent;
+
+ if (BeDBStats.checksum_failures == NULL)
+ return;
+
+ hash_seq_init(&stat, BeDBStats.checksum_failures);
+ while ((ent = (ChecksumFailureEnt *) hash_seq_search(&stat)) != NULL)
+ {
+ release_dbent = false;
+
+ if (dbentry->databaseid != ent->dboid)
+ {
+ dbentry = pgstat_get_db_entry(ent->dboid,
+  PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
+ continue;
+
+ release_dbent = true;
+ }
+
+ dbentry->n_checksum_failures += ent->count;
+
+ if (release_dbent)
+ dshash_release_lock(pgStatDBHash, dbentry);
+ }
+
+ hash_destroy(BeDBStats.checksum_failures);
+ BeDBStats.checksum_failures = NULL;
+}
 
 /*
  * Initialize function call usage data.
@@ -1764,7 +2155,8 @@ pgstat_initstats(Relation rel)
  return;
  }
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  {
  /* We're not counting at all */
  rel->pgstat_info = NULL;
@@ -1783,6 +2175,24 @@ pgstat_initstats(Relation rel)
  rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
 }
 
+/*
+ * create_tabstat_hash - create local hash as transactional storage
+ */
+static HTAB *
+create_tabstat_hash(void)
+{
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(ctl));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(TabStatHashEntry);
+
+ return hash_create("pgstat TabStatusArray lookup hash table",
+   TABSTAT_QUANTUM,
+   &ctl,
+   HASH_ELEM | HASH_BLOBS);
+}
+
 /*
  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
  */
@@ -1798,18 +2208,7 @@ get_tabstat_entry(Oid rel_id, bool isshared)
  * Create hash table if we don't have it already.
  */
  if (pgStatTabHash == NULL)
- {
- HASHCTL ctl;
-
- memset(&ctl, 0, sizeof(ctl));
- ctl.keysize = sizeof(Oid);
- ctl.entrysize = sizeof(TabStatHashEntry);
-
- pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
- TABSTAT_QUANTUM,
- &ctl,
- HASH_ELEM | HASH_BLOBS);
- }
+ pgStatTabHash = create_tabstat_hash();
 
  /*
  * Find an entry or create a new one.
@@ -2422,30 +2821,33 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info,
 /* ----------
  * pgstat_fetch_stat_dbentry() -
  *
- * Support function for the SQL-callable pgstat* functions. Returns
- * the collected statistics for one database or NULL. NULL doesn't mean
- * that the database doesn't exist, it is just not yet known by the
- * collector, so the caller is better off to report ZERO instead.
- * ----------
+ * Find database stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
  */
 PgStat_StatDBEntry *
 pgstat_fetch_stat_dbentry(Oid dbid)
 {
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "local database stats hash",
+ .hash_entsize = sizeof(PgStat_StatDBEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,   /* already attached */
+ .dsh_params = &dsh_dbparams,
+ .hash = &pgStatLocalHash,
+ .dshash = &pgStatDBHash
+ };
 
- /*
- * Lookup the requested database; return NULL if not found
- */
- return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-  (void *) &dbid,
-  HASH_FIND, NULL);
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* If not done for this transaction, take a snapshot of global stats */
+ pgstat_snapshot_global_stats();
+
+ /* the caller has no business with snapshot-local members */
+ return (PgStat_StatDBEntry *) snapshot_statentry(&param, dbid);
 }
 
-
 /* ----------
  * pgstat_fetch_stat_tabentry() -
  *
@@ -2458,51 +2860,66 @@ pgstat_fetch_stat_dbentry(Oid dbid)
 PgStat_StatTabEntry *
 pgstat_fetch_stat_tabentry(Oid relid)
 {
- Oid dbid;
  PgStat_StatDBEntry *dbentry;
  PgStat_StatTabEntry *tabentry;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
+ /* Lookup our database, then look in its table hash table. */
+ dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
+ if (dbentry == NULL)
+ return NULL;
 
- /*
- * Lookup our database, then look in its table hash table.
- */
- dbid = MyDatabaseId;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  /*
  * If we didn't find it, maybe it's a shared table.
  */
- dbid = InvalidOid;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ dbentry = pgstat_fetch_stat_dbentry(InvalidOid);
+ if (dbentry == NULL)
+ return NULL;
+
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  return NULL;
 }
 
+/* ----------
+ * pgstat_fetch_stat_tabentry_extended() -
+ *
+ * Find table stats entry on backends. The returned entries are cached until
+ * transaction end or pgstat_clear_snapshot() is called.
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid reloid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "table stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatTabEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_tblparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->tables;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_tables;
+ param.dshash = &dbent->dshash_tables;
+
+ return (PgStat_StatTabEntry *)
+ snapshot_statentry(&param, reloid);
+}
+
 
 /* ----------
  * pgstat_fetch_stat_funcentry() -
@@ -2517,21 +2934,90 @@ pgstat_fetch_stat_funcentry(Oid func_id)
  PgStat_StatDBEntry *dbentry;
  PgStat_StatFuncEntry *funcentry = NULL;
 
- /* load the stats file if needed */
- backend_read_statsfile();
-
- /* Lookup our database, then find the requested function.  */
+ /* Lookup our database, then find the requested function */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
- if (dbentry != NULL && dbentry->functions != NULL)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &func_id,
- HASH_FIND, NULL);
- }
+ if (dbentry == NULL)
+ return NULL;
+
+ funcentry = pgstat_fetch_stat_funcentry_extended(dbentry, func_id);
 
  return funcentry;
 }
 
+/* ----------
+ * pgstat_fetch_stat_funcentry_extended() -
+ *
+ * Find function stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
+ *
+ *  dbent is of type (PgStat_StatDBEntry *) but its body must be a
+ *  PgStat_StatDBEntry returned from pgstat_fetch_stat_dbentry().
+ */
+static PgStat_StatFuncEntry *
+pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "function stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatFuncEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_funcparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ return NULL;
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->functions;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_functions;
+ param.dshash = &dbent->dshash_functions;
+
+ return (PgStat_StatFuncEntry *)
+ snapshot_statentry(&param, funcid);
+}
+
+/*
+ * pgstat_snapshot_global_stats() -
+ *
+ * Makes a snapshot of global stats if not done yet.  They will be kept until
+ * subsequent call of pgstat_clear_snapshot() or the end of the current
+ * memory context (typically TopTransactionContext).
+ */
+static void
+pgstat_snapshot_global_stats(void)
+{
+ MemoryContext oldcontext;
+
+ pgstat_attach_shared_stats();
+
+ /* Nothing to do if already done */
+ if (global_snapshot_is_valid)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(pgStatSnapshotContext);
+
+ LWLockAcquire(StatsLock, LW_SHARED);
+ memcpy(&snapshot_globalStats, shared_globalStats,
+   sizeof(PgStat_GlobalStats));
+
+ memcpy(&snapshot_archiverStats, shared_archiverStats,
+   sizeof(PgStat_ArchiverStats));
+ LWLockRelease(StatsLock);
+
+ global_snapshot_is_valid = true;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return;
+}
 
 /* ----------
  * pgstat_fetch_stat_beentry() -
@@ -2603,9 +3089,10 @@ pgstat_fetch_stat_numbackends(void)
 PgStat_ArchiverStats *
 pgstat_fetch_stat_archiver(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &archiverStats;
+ return &snapshot_archiverStats;
 }
 
 
@@ -2620,9 +3107,10 @@ pgstat_fetch_stat_archiver(void)
 PgStat_GlobalStats *
 pgstat_fetch_global(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &globalStats;
+ return &snapshot_globalStats;
 }
 
 
@@ -2836,8 +3324,8 @@ pgstat_initialize(void)
  MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
  }
 
- /* Set up a process-exit hook to clean up */
- on_shmem_exit(pgstat_beshutdown_hook, 0);
+ /* needs to be called before dsm shutdown */
+ before_shmem_exit(pgstat_beshutdown_hook, 0);
 }
 
 /* ----------
@@ -2935,7 +3423,7 @@ pgstat_bestart(void)
  lbeentry.st_backendType = B_STARTUP;
  break;
  case ArchiverProcess:
- beentry->st_backendType = B_ARCHIVER;
+ lbeentry.st_backendType = B_ARCHIVER;
  break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
@@ -3071,6 +3559,10 @@ pgstat_bestart(void)
  /* Update app name to current GUC setting */
  if (application_name)
  pgstat_report_appname(application_name);
+
+
+ /* attach shared database stats area */
+ pgstat_attach_shared_stats();
 }
 
 /*
@@ -3106,6 +3598,8 @@ pgstat_beshutdown_hook(int code, Datum arg)
  beentry->st_procpid = 0; /* mark invalid */
 
  PGSTAT_END_WRITE_ACTIVITY(beentry);
+
+ pgstat_detach_shared_stats(true);
 }
 
 
@@ -3366,7 +3860,8 @@ pgstat_read_current_status(void)
 #endif
  int i;
 
- Assert(!pgStatRunningInCollector);
+ Assert(IsUnderPostmaster);
+
  if (localBackendStatusTable)
  return; /* already done */
 
@@ -3661,9 +4156,6 @@ pgstat_get_wait_activity(WaitEventActivity w)
  case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
  event_name = "LogicalLauncherMain";
  break;
- case WAIT_EVENT_PGSTAT_MAIN:
- event_name = "PgStatMain";
- break;
  case WAIT_EVENT_RECOVERY_WAL_ALL:
  event_name = "RecoveryWalAll";
  break;
@@ -4323,75 +4815,43 @@ pgstat_get_backend_desc(BackendType backendType)
  * ------------------------------------------------------------
  */
 
-
-/* ----------
- * pgstat_setheader() -
- *
- * Set common header fields in a statistics message
- * ----------
- */
-static void
-pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
-{
- hdr->m_type = mtype;
-}
-
-
-/* ----------
- * pgstat_send() -
- *
- * Send out one statistics message to the collector
- * ----------
- */
-static void
-pgstat_send(void *msg, int len)
-{
- int rc;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
- ((PgStat_MsgHdr *) msg)->m_size = len;
-
- /* We'll retry after EINTR, but ignore all other failures */
- do
- {
- rc = send(pgStatSock, msg, len, 0);
- } while (rc < 0 && errno == EINTR);
-
-#ifdef USE_ASSERT_CHECKING
- /* In debug builds, log send failures ... */
- if (rc < 0)
- elog(LOG, "could not send to statistics collector: %m");
-#endif
-}
-
 /* ----------
  * pgstat_send_archiver() -
  *
- * Tell the collector about the WAL file that we successfully
- * archived or failed to archive.
+ * Report archiver statistics
  * ----------
  */
 void
 pgstat_send_archiver(const char *xlog, bool failed)
 {
- PgStat_MsgArchiver msg;
+ TimestampTz now = GetCurrentTimestamp();
 
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER);
- msg.m_failed = failed;
- StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog));
- msg.m_timestamp = GetCurrentTimestamp();
- pgstat_send(&msg, sizeof(msg));
+ if (failed)
+ {
+ /* Failed archival attempt */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->failed_count;
+ memcpy(shared_archiverStats->last_failed_wal, xlog,
+   sizeof(shared_archiverStats->last_failed_wal));
+ shared_archiverStats->last_failed_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ else
+ {
+ /* Successful archival operation */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->archived_count;
+ memcpy(shared_archiverStats->last_archived_wal, xlog,
+   sizeof(shared_archiverStats->last_archived_wal));
+ shared_archiverStats->last_archived_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
 }
 
 /* ----------
  * pgstat_send_bgwriter() -
  *
- * Send bgwriter statistics to the collector
+ * Report bgwriter statistics
  * ----------
  */
 void
@@ -4400,6 +4860,8 @@ pgstat_send_bgwriter(void)
  /* We assume this initializes to zeroes */
  static const PgStat_MsgBgWriter all_zeroes;
 
+ PgStat_MsgBgWriter *s = &BgWriterStats;
+
  /*
  * This function can be called even if nothing at all has happened. In
  * this case, avoid sending a completely empty message to the stats
@@ -4408,11 +4870,18 @@ pgstat_send_bgwriter(void)
  if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
  return;
 
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
- pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ shared_globalStats->timed_checkpoints += s->m_timed_checkpoints;
+ shared_globalStats->requested_checkpoints += s->m_requested_checkpoints;
+ shared_globalStats->checkpoint_write_time += s->m_checkpoint_write_time;
+ shared_globalStats->checkpoint_sync_time += s->m_checkpoint_sync_time;
+ shared_globalStats->buf_written_checkpoints += s->m_buf_written_checkpoints;
+ shared_globalStats->buf_written_clean += s->m_buf_written_clean;
+ shared_globalStats->maxwritten_clean += s->m_maxwritten_clean;
+ shared_globalStats->buf_written_backend += s->m_buf_written_backend;
+ shared_globalStats->buf_fsync_backend += s->m_buf_fsync_backend;
+ shared_globalStats->buf_alloc += s->m_buf_alloc;
+ LWLockRelease(StatsLock);
 
  /*
  * Clear out the statistics buffer, so it can be re-used.
@@ -4421,305 +4890,164 @@ pgstat_send_bgwriter(void)
 }
 
 
-/* ----------
- * PgstatCollectorMain() -
+/*
+ * Pin and Unpin dbentry.
  *
- * Start up the statistics collector process.  This is the body of the
- * postmaster child process.
- *
- * The argc/argv parameters are valid only in EXEC_BACKEND case.
- * ----------
+ * To reduce memory usage, and for speed, counters are reset by recreating the
+ * dshash instead of removing entries one-by-one while holding the whole-dshash
+ * lock. On the other hand a dshash cannot be destroyed until all referrers
+ * have gone. As a result, other backends could be kept waiting for the counter
+ * reset for a long time. We isolate the hashes under destruction as another
+ * "generation", meaning they are no longer used but cannot be removed yet.
+ *
+ * When we start accessing hashes on a dbentry, call pin_hashes() to acquire
+ * the current "generation". unpin_hashes() removes the older generation's
+ * hashes once all referrers have gone.
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+static int
+pin_hashes(PgStat_StatDBEntry *dbentry)
 {
- int len;
- PgStat_Msg msg;
- int wr;
+ int generation;
 
- /*
- * Ignore all signals usually bound to some action in the postmaster,
- * except SIGHUP and SIGQUIT.  Note we don't need a SIGUSR1 handler to
- * support latch operations, because we only use a local latch.
- */
- pqsignal(SIGHUP, pgstat_sighup_handler);
- pqsignal(SIGINT, SIG_IGN);
- pqsignal(SIGTERM, SIG_IGN);
- pqsignal(SIGQUIT, pgstat_exit);
- pqsignal(SIGALRM, SIG_IGN);
- pqsignal(SIGPIPE, SIG_IGN);
- pqsignal(SIGUSR1, SIG_IGN);
- pqsignal(SIGUSR2, SIG_IGN);
- /* Reset some signals that are accepted by postmaster but not here */
- pqsignal(SIGCHLD, SIG_DFL);
- PG_SETMASK(&UnBlockSig);
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->refcnt++;
+ generation = dbentry->generation;
+ LWLockRelease(&dbentry->lock);
 
- /*
- * Identify myself via ps
- */
- init_ps_display("stats collector", "", "", "");
+ dshash_release_lock(pgStatDBHash, dbentry);
 
- /*
- * Read in existing stats files or initialize the stats to zero.
- */
- pgStatRunningInCollector = true;
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
+ return generation;
+}
 
- /*
- * Loop to process messages until we get SIGQUIT or detect ungraceful
- * death of our parent postmaster.
- *
- * For performance reasons, we don't want to do ResetLatch/WaitLatch after
- * every message; instead, do that only after a recv() fails to obtain a
- * message.  (This effectively means that if backends are sending us stuff
- * like mad, we won't notice postmaster death until things slack off a
- * bit; which seems fine.) To do that, we have an inner loop that
- * iterates as long as recv() succeeds.  We do recognize got_SIGHUP inside
- * the inner loop, which means that such interrupts will get serviced but
- * the latch won't get cleared until next time there is a break in the
- * action.
- */
- for (;;)
+/*
+ * Unpin hashes in dbentry. If the given generation is isolated, destroy it
+ * after all referrers have gone. Otherwise just decrease the refcount and return.
+ */
+static void
+unpin_hashes(PgStat_StatDBEntry *dbentry, int generation)
+{
+ dshash_table *tables;
+ dshash_table *funcs = NULL;
+
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* using current generation, just decrease refcount */
+ if (dbentry->generation == generation)
  {
- /* Clear any already-pending wakeups */
- ResetLatch(MyLatch);
-
- /*
- * Quit if we get SIGQUIT from the postmaster.
- */
- if (need_exit)
- break;
-
- /*
- * Inner loop iterates as long as we keep getting messages, or until
- * need_exit becomes set.
- */
- while (!need_exit)
- {
- /*
- * Reload configuration if we got SIGHUP from the postmaster.
- */
- if (got_SIGHUP)
- {
- got_SIGHUP = false;
- ProcessConfigFile(PGC_SIGHUP);
- }
-
- /*
- * Write the stats file(s) if a new request has arrived that is
- * not satisfied by existing file(s).
- */
- if (pgstat_write_statsfile_needed())
- pgstat_write_statsfiles(false, false);
-
- /*
- * Try to receive and process a message.  This will not block,
- * since the socket is set to non-blocking mode.
- *
- * XXX On Windows, we have to force pgwin32_recv to cooperate,
- * despite the previous use of pg_set_noblock() on the socket.
- * This is extremely broken and should be fixed someday.
- */
-#ifdef WIN32
- pgwin32_noblock = 1;
-#endif
-
- len = recv(pgStatSock, (char *) &msg,
-   sizeof(PgStat_Msg), 0);
-
-#ifdef WIN32
- pgwin32_noblock = 0;
-#endif
-
- if (len < 0)
- {
- if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
- break; /* out of inner loop */
- ereport(ERROR,
- (errcode_for_socket_access(),
- errmsg("could not read statistics message: %m")));
- }
-
- /*
- * We ignore messages that are smaller than our common header
- */
- if (len < sizeof(PgStat_MsgHdr))
- continue;
-
- /*
- * The received length must match the length in the header
- */
- if (msg.msg_hdr.m_size != len)
- continue;
-
- /*
- * O.K. - we accept this message.  Process it.
- */
- switch (msg.msg_hdr.m_type)
- {
- case PGSTAT_MTYPE_DUMMY:
- break;
-
- case PGSTAT_MTYPE_INQUIRY:
- pgstat_recv_inquiry(&msg.msg_inquiry, len);
- break;
-
- case PGSTAT_MTYPE_TABSTAT:
- pgstat_recv_tabstat(&msg.msg_tabstat, len);
- break;
-
- case PGSTAT_MTYPE_TABPURGE:
- pgstat_recv_tabpurge(&msg.msg_tabpurge, len);
- break;
-
- case PGSTAT_MTYPE_DROPDB:
- pgstat_recv_dropdb(&msg.msg_dropdb, len);
- break;
-
- case PGSTAT_MTYPE_RESETCOUNTER:
- pgstat_recv_resetcounter(&msg.msg_resetcounter, len);
- break;
-
- case PGSTAT_MTYPE_RESETSHAREDCOUNTER:
- pgstat_recv_resetsharedcounter(
-   &msg.msg_resetsharedcounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_RESETSINGLECOUNTER:
- pgstat_recv_resetsinglecounter(
-   &msg.msg_resetsinglecounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_AUTOVAC_START:
- pgstat_recv_autovac(&msg.msg_autovacuum_start, len);
- break;
-
- case PGSTAT_MTYPE_VACUUM:
- pgstat_recv_vacuum(&msg.msg_vacuum, len);
- break;
-
- case PGSTAT_MTYPE_ANALYZE:
- pgstat_recv_analyze(&msg.msg_analyze, len);
- break;
-
- case PGSTAT_MTYPE_ARCHIVER:
- pgstat_recv_archiver(&msg.msg_archiver, len);
- break;
-
- case PGSTAT_MTYPE_BGWRITER:
- pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
- break;
-
- case PGSTAT_MTYPE_FUNCSTAT:
- pgstat_recv_funcstat(&msg.msg_funcstat, len);
- break;
-
- case PGSTAT_MTYPE_FUNCPURGE:
- pgstat_recv_funcpurge(&msg.msg_funcpurge, len);
- break;
-
- case PGSTAT_MTYPE_RECOVERYCONFLICT:
- pgstat_recv_recoveryconflict(
- &msg.msg_recoveryconflict,
- len);
- break;
-
- case PGSTAT_MTYPE_DEADLOCK:
- pgstat_recv_deadlock(&msg.msg_deadlock, len);
- break;
-
- case PGSTAT_MTYPE_TEMPFILE:
- pgstat_recv_tempfile(&msg.msg_tempfile, len);
- break;
-
- case PGSTAT_MTYPE_CHECKSUMFAILURE:
- pgstat_recv_checksum_failure(
- &msg.msg_checksumfailure,
- len);
- break;
-
- default:
- break;
- }
- } /* end of inner message-processing loop */
-
- /* Sleep until there's something to do */
-#ifndef WIN32
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-   pgStatSock, -1L,
-   WAIT_EVENT_PGSTAT_MAIN);
-#else
-
- /*
- * Windows, at least in its Windows Server 2003 R2 incarnation,
- * sometimes loses FD_READ events.  Waking up and retrying the recv()
- * fixes that, so don't sleep indefinitely.  This is a crock of the
- * first water, but until somebody wants to debug exactly what's
- * happening there, this is the best we can do.  The two-second
- * timeout matches our pre-9.2 behavior, and needs to be short enough
- * to not provoke "using stale statistics" complaints from
- * backend_read_statsfile.
- */
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-   pgStatSock,
-   2 * 1000L /* msec */ ,
-   WAIT_EVENT_PGSTAT_MAIN);
-#endif
-
- /*
- * Emergency bailout if postmaster has died.  This is to avoid the
- * necessity for manual cleanup of all postmaster children.
- */
- if (wr & WL_POSTMASTER_DEATH)
- break;
- } /* end of outer loop */
+ dbentry->refcnt--;
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
 
  /*
- * Save the final stats to reuse at next startup.
+ * It is isolated, waiting for all referrers to end.
  */
- pgstat_write_statsfiles(true, true);
+ Assert(dbentry->generation == generation + 1);
 
- exit(0);
+ if (--dbentry->prev_refcnt > 0)
+ {
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
+
+ /* no referrer remains, remove the hashes */
+ tables = dshash_attach(area, &dsh_tblparams, dbentry->prev_tables, 0);
+ if (dbentry->prev_functions != DSM_HANDLE_INVALID)
+ funcs = dshash_attach(area, &dsh_funcparams,
+  dbentry->prev_functions, 0);
+
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
+
+ /* release the entry immediately */
+ LWLockRelease(&dbentry->lock);
+
+ dshash_destroy(tables);
+ if (funcs)
+ dshash_destroy(funcs);
+
+ return;
 }
 
-
-/* SIGQUIT signal handler for collector process */
-static void
-pgstat_exit(SIGNAL_ARGS)
+/*
+ * Attach and return the specified generation of the table hash of the given
+ * dbentry. (The LWLockAcquire below blocks, so this never returns NULL.)
+ */
+static dshash_table *
+attach_table_hash(PgStat_StatDBEntry *dbent, int gen)
 {
- int save_errno = errno;
+ dshash_table *ret;
 
- need_exit = true;
- SetLatch(MyLatch);
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
 
- errno = save_errno;
+ if (dbent->generation == gen)
+ ret = dshash_attach(area, &dsh_tblparams, dbent->tables, 0);
+ else
+ {
+ Assert(dbent->generation == gen + 1);
+ Assert(dbent->prev_tables != DSM_HANDLE_INVALID);
+ ret = dshash_attach(area, &dsh_tblparams, dbent->prev_tables, 0);
+ }
+ LWLockRelease(&dbent->lock);
+
+ return ret;
 }
 
-/* SIGHUP handler for collector process */
-static void
-pgstat_sighup_handler(SIGNAL_ARGS)
+/* attach and return the specified generation of function hash */
+static dshash_table *
+attach_function_hash(PgStat_StatDBEntry *dbent, int gen)
 {
- int save_errno = errno;
+ dshash_table *ret = NULL;
 
- got_SIGHUP = true;
- SetLatch(MyLatch);
 
- errno = save_errno;
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
+
+ if (dbent->generation == gen)
+ {
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ {
+ dshash_table *funchash =
+ dshash_create(area, &dsh_funcparams, 0);
+ dbent->functions = dshash_get_hash_table_handle(funchash);
+
+ ret = funchash;
+ }
+ else
+ ret = dshash_attach(area, &dsh_funcparams, dbent->functions, 0);
+ }
+ /* don't bother creating useless hash */
+
+ LWLockRelease(&dbent->lock);
+
+ return ret;
+}
+
+static void
+init_dbentry(PgStat_StatDBEntry *dbentry)
+{
+ LWLockInitialize(&dbentry->lock, LWTRANCHE_STATS);
+ dbentry->generation = 0;
+ dbentry->refcnt = 0;
+ dbentry->prev_refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
 }
 
 /*
  * Subroutine to clear stats in a database entry
  *
- * Tables and functions hashes are initialized to empty.
+ * Reset all counters in the dbentry. Tables and functions dshashes are
+ * destroyed.  If any backend is pinning this dbentry, the current dshashes
+ * are stashed away to the previous "generation" to wait until all accessors
+ * are gone. If the previous generation is already occupied, the current
+ * dshashes are so fresh that they don't need to be cleared.
  */
 static void
 reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 {
- HASHCTL hash_ctl;
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
 
  dbentry->n_xact_commit = 0;
  dbentry->n_xact_rollback = 0;
@@ -4744,72 +5072,865 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
  dbentry->n_block_read_time = 0;
  dbentry->n_block_write_time = 0;
 
+ if (dbentry->refcnt == 0)
+ {
+ /*
+ * No one is referring to the current hash. It's very costly to remove
+ * entries in a dshash individually, so just destroy the whole thing.
+ * If someone pinned this entry just after, pin_hashes() returns the
+ * current generation and the attach will happen after the LWLock
+ * below is released.
+ */
+ dshash_table *tbl;
+
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ dbentry->tables = DSM_HANDLE_INVALID;
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ dbentry->functions = DSM_HANDLE_INVALID;
+ }
+ }
+ else if (dbentry->prev_refcnt == 0)
+ {
+ /*
+ * Someone is still referring to the current hash and previous slot is
+ * vacant. Stash out the current hash to the previous slot.
+ */
+ dbentry->prev_refcnt = dbentry->refcnt;
+ dbentry->prev_tables = dbentry->tables;
+ dbentry->prev_functions = dbentry->functions;
+ dbentry->refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->generation++;
+ }
+ else
+ {
+ Assert(dbentry->prev_refcnt > 0 && dbentry->refcnt > 0);
+ /*
+ * If we get here, we just have got another reset request and the old
+ * hashes are waiting for all referrers to be released. It must be
+ * quite a short time so we can just ignore this request.
+ *
+ * As a side effect, the resetter can see non-zero values before
+ * anyone updates them, but that is indistinguishable from someone
+ * having updated them just before the read.
+ */
+ }
+
+ /* Create new table hash if not exists */
+ if (dbentry->tables == DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
+
+ /* Create new function hash if not exists and needed. */
+ if (dbentry->functions == DSM_HANDLE_INVALID &&
+ pgstat_track_functions != TRACK_FUNC_OFF)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_funcparams, 0);
+ dbentry->functions = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
+
  dbentry->stat_reset_timestamp = GetCurrentTimestamp();
- dbentry->stats_timestamp = 0;
 
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS);
+ LWLockRelease(&dbentry->lock);
 }
 
 /*
- * Lookup the hash table entry for the specified database. If no hash
- * table entry exists, initialize it, if the create parameter is true.
- * Else, return NULL.
+ * Create the filename for a DB stat file; filename is an output parameter
+ * that points to a character buffer of length len.
  */
-static PgStat_StatDBEntry *
-pgstat_get_db_entry(Oid databaseid, bool create)
+static void
+get_dbstat_filename(bool tempname, Oid databaseid, char *filename, int len)
 {
- PgStat_StatDBEntry *result;
- bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
+ int printed;
 
- /* Lookup or create the hash table entry for this database */
- result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- &databaseid,
- action, &found);
+ /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
+ printed = snprintf(filename, len, "%s/db_%u.%s",
+   PGSTAT_STAT_PERMANENT_DIRECTORY,
+   databaseid,
+   tempname ? "tmp" : "stat");
+ if (printed >= len)
+ elog(ERROR, "overlength pgstat path");
+}
 
- if (!create && !found)
- return NULL;
+/* ----------
+ * pgstat_write_statsfiles() -
+ * Write the global statistics file, as well as DB files.
+ * ----------
+ */
+void
+pgstat_write_statsfiles(void)
+{
+ dshash_seq_status hstat;
+ PgStat_StatDBEntry *dbentry;
+ FILE   *fpout;
+ int32 format_id;
+ const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+ int rc;
+
+ /* stats is not initialized yet. just return. */
+ if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID)
+ return;
+
+ elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
  /*
- * If not found, initialize the new one.  This creates empty hash tables
- * for tables and functions, too.
+ * Open the statistics temp file to write out the current values.
  */
+ fpout = AllocateFile(tmpfile, PG_BINARY_W);
+ if (fpout == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary statistics file \"%s\": %m",
+ tmpfile)));
+ return;
+ }
+
+ /*
+ * Set the timestamp of the stats file.
+ */
+ shared_globalStats->stats_timestamp = GetCurrentTimestamp();
+
+ /*
+ * Write the file header --- currently just a format ID.
+ */
+ format_id = PGSTAT_FILE_FORMAT_ID;
+ rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Write global stats struct
+ */
+ rc = fwrite(shared_globalStats, sizeof(*shared_globalStats), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Write archiver stats struct
+ */
+ rc = fwrite(shared_archiverStats, sizeof(*shared_archiverStats), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Walk through the database table.
+ */
+ dshash_seq_init(&hstat, pgStatDBHash, false, false);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
+ {
+ /*
+ * Write out the table and function stats for this DB into the
+ * appropriate per-DB stat file, if required.
+ */
+ /* Make DB's timestamp consistent with the global stats */
+ dbentry->stats_timestamp = shared_globalStats->stats_timestamp;
+
+ pgstat_write_pgStatDBHashfile(dbentry);
+
+ /*
+ * Write out the DB entry. We don't write the tables or functions
+ * pointers, since they're of no use to any other process.
+ */
+ fputc('D', fpout);
+ rc = fwrite(dbentry,
+ offsetof(PgStat_StatDBEntry, generation), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+
+ /*
+ * No more output to be done. Close the temp file and replace the old
+ * pgstat.stat with it.  The ferror() check replaces testing for error
+ * after each individual fputc or fwrite above.
+ */
+ fputc('E', fpout);
+
+ if (ferror(fpout))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write temporary statistics file \"%s\": %m",
+ tmpfile)));
+ FreeFile(fpout);
+ unlink(tmpfile);
+ }
+ else if (FreeFile(fpout) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not close temporary statistics file \"%s\": %m",
+ tmpfile)));
+ unlink(tmpfile);
+ }
+ else if (rename(tmpfile, statfile) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+ tmpfile, statfile)));
+ unlink(tmpfile);
+ }
+}
+
+/* ----------
+ * pgstat_write_pgStatDBHashfile() -
+ * Write the stat file for a single database.
+ * ----------
+ */
+static void
+pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
+{
+ dshash_seq_status tstat;
+ dshash_seq_status fstat;
+ PgStat_StatTabEntry *tabentry;
+ PgStat_StatFuncEntry *funcentry;
+ FILE   *fpout;
+ int32 format_id;
+ Oid dbid = dbentry->databaseid;
+ int rc;
+ char tmpfile[MAXPGPATH];
+ char statfile[MAXPGPATH];
+ dshash_table *tbl;
+
+ get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH);
+ get_dbstat_filename(false, dbid, statfile, MAXPGPATH);
+
+ elog(DEBUG2, "writing stats file \"%s\"", statfile);
+
+ /*
+ * Open the statistics temp file to write out the current values.
+ */
+ fpout = AllocateFile(tmpfile, PG_BINARY_W);
+ if (fpout == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary statistics file \"%s\": %m",
+ tmpfile)));
+ return;
+ }
+
+ /*
+ * Write the file header --- currently just a format ID.
+ */
+ format_id = PGSTAT_FILE_FORMAT_ID;
+ rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+
+ /*
+ * Walk through the database's access stats per table.
+ */
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_seq_init(&tstat, tbl, false, false);
+ while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL)
+ {
+ fputc('T', fpout);
+ rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+ dshash_detach(tbl);
+
+ /*
+ * Walk through the database's function stats table.
+ */
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_seq_init(&fstat, tbl, false, false);
+ while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL)
+ {
+ fputc('F', fpout);
+ rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+ dshash_detach(tbl);
+ }
+
+ /*
+ * No more output to be done. Close the temp file and replace the old
+ * pgstat.stat with it.  The ferror() check replaces testing for error
+ * after each individual fputc or fwrite above.
+ */
+ fputc('E', fpout);
+
+ if (ferror(fpout))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write temporary statistics file \"%s\": %m",
+ tmpfile)));
+ FreeFile(fpout);
+ unlink(tmpfile);
+ }
+ else if (FreeFile(fpout) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not close temporary statistics file \"%s\": %m",
+ tmpfile)));
+ unlink(tmpfile);
+ }
+ else if (rename(tmpfile, statfile) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+ tmpfile, statfile)));
+ unlink(tmpfile);
+ }
+}
+
+/* ----------
+ * pgstat_read_statsfiles() -
+ *
+ * Reads in existing statistics collector files into the shared stats hash.
+ *
+ * ----------
+ */
+void
+pgstat_read_statsfiles(void)
+{
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatDBEntry dbbuf;
+ FILE   *fpin;
+ int32 format_id;
+ bool found;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+
+ /* shouldn't be called from postmaster  */
+ Assert(IsUnderPostmaster);
+
+ elog(DEBUG2, "reading stats file \"%s\"", statfile);
+
+ /*
+ * Set the current timestamp (will be kept only in case we can't load an
+ * existing statsfile).
+ */
+ shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
+ shared_archiverStats->stat_reset_timestamp =
+ shared_globalStats->stat_reset_timestamp;
+
+ /*
+ * Try to open the stats file. If it doesn't exist, the backends simply
+ * return zero for anything and the collector simply starts from scratch
+ * with empty counters.
+ *
+ * ENOENT is a possibility if the stats collector is not running or has
+ * not yet written the stats file the first time.  Any other failure
+ * condition is suspicious.
+ */
+ if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+ {
+ if (errno != ENOENT)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open statistics file \"%s\": %m",
+ statfile)));
+ return;
+ }
+
+ /*
+ * Verify it's of the expected format.
+ */
+ if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
+ format_id != PGSTAT_FILE_FORMAT_ID)
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ goto done;
+ }
+
+ /*
+ * Read global stats struct
+ */
+ if (fread(shared_globalStats, 1, sizeof(*shared_globalStats), fpin) !=
+ sizeof(*shared_globalStats))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
+ goto done;
+ }
+
+ /*
+ * Read archiver stats struct
+ */
+ if (fread(shared_archiverStats, 1, sizeof(*shared_archiverStats), fpin) !=
+ sizeof(*shared_archiverStats))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+ goto done;
+ }
+
+ /*
+ * We found an existing collector stats file. Read it and put all the
+ * hashtable entries into place.
+ */
+ for (;;)
+ {
+ switch (fgetc(fpin))
+ {
+ /*
+ * 'D' A PgStat_StatDBEntry struct describing a database
+ * follows.
+ */
+ case 'D':
+ if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, generation),
+  fpin) != offsetof(PgStat_StatDBEntry, generation))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ /*
+ * Add to the DB hash
+ */
+ dbentry = (PgStat_StatDBEntry *)
+ dshash_find_or_insert(pgStatDBHash, (void *) &dbbuf.databaseid,
+  &found);
+
+ /* don't allow duplicate dbentries */
+ if (found)
+ {
+ dshash_release_lock(pgStatDBHash, dbentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ /* initialize the new shared entry */
+ init_dbentry(dbentry);
+
+ memcpy(dbentry, &dbbuf,
+   offsetof(PgStat_StatDBEntry, generation));
+
+ /* Read the data from the database-specific file. */
+ pgstat_read_pgStatDBHashfile(dbentry);
+ dshash_release_lock(pgStatDBHash, dbentry);
+ break;
+
+ case 'E':
+ goto done;
+
+ default:
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+ }
+
+done:
+ FreeFile(fpin);
+
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
+
+ return;
+}
+
+
+/* ----------
+ * pgstat_read_pgStatDBHashfile() -
+ *
+ * Reads in the at-rest statistics file and creates shared statistics
+ * tables. The file is removed after reading.
+ * ----------
+ */
+static void
+pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
+{
+ PgStat_StatTabEntry *tabentry;
+ PgStat_StatTabEntry tabbuf;
+ PgStat_StatFuncEntry funcbuf;
+ PgStat_StatFuncEntry *funcentry;
+ dshash_table *tabhash = NULL;
+ dshash_table *funchash = NULL;
+ FILE   *fpin;
+ int32 format_id;
+ bool found;
+ char statfile[MAXPGPATH];
+
+ get_dbstat_filename(false, dbentry->databaseid, statfile, MAXPGPATH);
+
+ /*
+ * Try to open the stats file. If it doesn't exist, the backends simply
+ * return zero for anything and the collector simply starts from scratch
+ * with empty counters.
+ *
+ * ENOENT is a possibility if the stats collector is not running or has
+ * not yet written the stats file the first time.  Any other failure
+ * condition is suspicious.
+ */
+ if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+ {
+ if (errno != ENOENT)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open statistics file \"%s\": %m",
+ statfile)));
+ return;
+ }
+
+ /*
+ * Verify it's of the expected format.
+ */
+ if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
+ format_id != PGSTAT_FILE_FORMAT_ID)
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"", statfile)));
+ goto done;
+ }
+
+ /*
+ * We found an existing statistics file. Read it and put all the hashtable
+ * entries into place.
+ */
+ for (;;)
+ {
+ switch (fgetc(fpin))
+ {
+ /*
+ * 'T' A PgStat_StatTabEntry follows.
+ */
+ case 'T':
+ if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
+  fpin) != sizeof(PgStat_StatTabEntry))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ if (tabhash == NULL)
+ {
+ tabhash = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables =
+ dshash_get_hash_table_handle(tabhash);
+ }
+
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(tabhash,
+  (void *) &tabbuf.tableid, &found);
+
+ /* don't allow duplicate entries */
+ if (found)
+ {
+ dshash_release_lock(tabhash, tabentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+ dshash_release_lock(tabhash, tabentry);
+ break;
+
+ /*
+ * 'F' A PgStat_StatFuncEntry follows.
+ */
+ case 'F':
+ if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
+  fpin) != sizeof(PgStat_StatFuncEntry))
+ {
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ if (funchash == NULL)
+ {
+ funchash = dshash_create(area, &dsh_funcparams, 0);
+ dbentry->functions =
+ dshash_get_hash_table_handle(funchash);
+ }
+
+ funcentry = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert(funchash,
+  (void *) &funcbuf.functionid, &found);
+
+ if (found)
+ {
+ dshash_release_lock(funchash, funcentry);
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+
+ memcpy(funcentry, &funcbuf, sizeof(funcbuf));
+ dshash_release_lock(funchash, funcentry);
+ break;
+
+ /*
+ * 'E' The EOF marker of a complete stats file.
+ */
+ case 'E':
+ goto done;
+
+ default:
+ ereport(LOG,
+ (errmsg("corrupted statistics file \"%s\"",
+ statfile)));
+ goto done;
+ }
+ }
+
+done:
+ if (tabhash)
+ dshash_detach(tabhash);
+ if (funchash)
+ dshash_detach(funchash);
+
+ FreeFile(fpin);
+
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
+}
+
+/* ----------
+ * pgstat_setup_memcxt() -
+ *
+ * Create pgStatLocalContext and pgStatSnapshotContext, if not already done.
+ * ----------
+ */
+static void
+pgstat_setup_memcxt(void)
+{
+ if (!pgStatLocalContext)
+ pgStatLocalContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Backend statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
+
+ if (!pgStatSnapshotContext)
+ pgStatSnapshotContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Database statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
+}
+
+/* ----------
+ * pgstat_clear_snapshot() -
+ *
+ * Discard any data collected in the current transaction.  Any subsequent
+ * request will cause new snapshots to be read.
+ *
+ * This is also invoked during transaction commit or abort to discard
+ * the no-longer-wanted snapshot.
+ * ----------
+ */
+void
+pgstat_clear_snapshot(void)
+{
+ /* Release memory, if any was allocated */
+ if (pgStatLocalContext)
+ {
+ MemoryContextDelete(pgStatLocalContext);
+
+ /* Reset variables */
+ pgStatLocalContext = NULL;
+ localBackendStatusTable = NULL;
+ localNumBackends = 0;
+ }
+
+ if (pgStatSnapshotContext)
+ clear_snapshot = true;
+}
+
+static bool
+pgstat_update_tabentry(dshash_table *tabhash, PgStat_TableStatus *stat,
+   bool nowait)
+{
+ PgStat_StatTabEntry *tabentry;
+ bool found;
+
+ if (tabhash == NULL)
+ return false;
+
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert_extended(tabhash, (void *) &(stat->t_id),
+   &found, nowait);
+
+ /* failed to acquire lock */
+ if (tabentry == NULL)
+ return false;
+
  if (!found)
- reset_dbentry_counters(result);
+ {
+ /*
+ * If it's a new table entry, initialize counters to the values we
+ * just got.
+ */
+ tabentry->numscans = stat->t_counts.t_numscans;
+ tabentry->tuples_returned = stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched = stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted = stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated = stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted = stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated = stat->t_counts.t_tuples_hot_updated;
+ tabentry->n_live_tuples = stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples = stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze = stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched = stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit = stat->t_counts.t_blocks_hit;
+
+ tabentry->vacuum_timestamp = 0;
+ tabentry->vacuum_count = 0;
+ tabentry->autovac_vacuum_timestamp = 0;
+ tabentry->autovac_vacuum_count = 0;
+ tabentry->analyze_timestamp = 0;
+ tabentry->analyze_count = 0;
+ tabentry->autovac_analyze_timestamp = 0;
+ tabentry->autovac_analyze_count = 0;
+ }
+ else
+ {
+ /*
+ * Otherwise add the values to the existing entry.
+ */
+ tabentry->numscans += stat->t_counts.t_numscans;
+ tabentry->tuples_returned += stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched += stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted += stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated += stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted += stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated += stat->t_counts.t_tuples_hot_updated;
+ /* If table was truncated, first reset the live/dead counters */
+ if (stat->t_counts.t_truncated)
+ {
+ tabentry->n_live_tuples = 0;
+ tabentry->n_dead_tuples = 0;
+ }
+ tabentry->n_live_tuples += stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples += stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze += stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched += stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit += stat->t_counts.t_blocks_hit;
+ }
+
+ /* Clamp n_live_tuples in case of negative delta_live_tuples */
+ tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+ /* Likewise for n_dead_tuples */
+ tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+
+ dshash_release_lock(tabhash, tabentry);
+
+ return true;
+}
+
+static void
+pgstat_update_dbentry(PgStat_StatDBEntry *dbentry, PgStat_TableStatus *stat)
+{
+ /*
+ * Add per-table stats to the per-database entry, too.
+ */
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->n_tuples_returned += stat->t_counts.t_tuples_returned;
+ dbentry->n_tuples_fetched += stat->t_counts.t_tuples_fetched;
+ dbentry->n_tuples_inserted += stat->t_counts.t_tuples_inserted;
+ dbentry->n_tuples_updated += stat->t_counts.t_tuples_updated;
+ dbentry->n_tuples_deleted += stat->t_counts.t_tuples_deleted;
+ dbentry->n_blocks_fetched += stat->t_counts.t_blocks_fetched;
+ dbentry->n_blocks_hit += stat->t_counts.t_blocks_hit;
+ LWLockRelease(&dbentry->lock);
+}
+
+/*
+ * Lookup shared stats hash table for the specified database. Returns NULL
+ * when PGSTAT_NOWAIT and required lock cannot be acquired.
+ */
+static PgStat_StatDBEntry *
+pgstat_get_db_entry(Oid databaseid, int op, PgStat_TableLookupResult *status)
+{
+ PgStat_StatDBEntry *result;
+ bool nowait = ((op & PGSTAT_NOWAIT) != 0);
+ bool lock_acquired = true;
+ bool found = true;
+
+ if (!IsUnderPostmaster || !pgStatDBHash)
+ return NULL;
+
+ /* Lookup or create the hash table entry for this database */
+ if (op & PGSTAT_EXCLUSIVE)
+ {
+ result = (PgStat_StatDBEntry *)
+ dshash_find_or_insert_extended(pgStatDBHash, &databaseid,
+   &found, nowait);
+ if (result == NULL)
+ lock_acquired = false;
+ else if (!found)
+ {
+ /*
+ * If not found, initialize the new one.  This creates empty hash
+ * tables hash, too.
+ */
+ init_dbentry(result);
+ reset_dbentry_counters(result);
+ }
+ }
+ else
+ {
+ result = (PgStat_StatDBEntry *)
+ dshash_find_extended(pgStatDBHash, &databaseid, true, nowait,
+ nowait ? &lock_acquired : NULL);
+ if (result == NULL)
+ found = false;
+ }
+
+ /* Set return status if requested */
+ if (status)
+ {
+ if (!lock_acquired)
+ {
+ Assert(nowait);
+ *status = LOCK_FAILED;
+ }
+ else if (!found)
+ *status = NOT_FOUND;
+ else
+ *status = FOUND;
+ }
 
  return result;
 }
 
-
 /*
  * Lookup the hash table entry for the specified table. If no hash
  * table entry exists, initialize it, if the create parameter is true.
  * Else, return NULL.
  */
 static PgStat_StatTabEntry *
-pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create)
 {
  PgStat_StatTabEntry *result;
  bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
 
  /* Lookup or create the hash table entry for this table */
- result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
- &tableoid,
- action, &found);
+ if (create)
+ result = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(table, &tableoid, &found);
+ else
+ result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false);
 
  if (!create && !found)
  return NULL;
@@ -4842,1702 +5963,6 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
  return result;
 }
 
-
-/* ----------
- * pgstat_write_statsfiles() -
- * Write the global statistics file, as well as requested DB files.
- *
- * 'permanent' specifies writing to the permanent files not temporary ones.
- * When true (happens only when the collector is shutting down), also remove
- * the temporary files so that backends starting up under a new postmaster
- * can't read old data before the new collector is ready.
- *
- * When 'allDbs' is false, only the requested databases (listed in
- * pending_write_requests) will be written; otherwise, all databases
- * will be written.
- * ----------
- */
-static void
-pgstat_write_statsfiles(bool permanent, bool allDbs)
-{
- HASH_SEQ_STATUS hstat;
- PgStat_StatDBEntry *dbentry;
- FILE   *fpout;
- int32 format_id;
- const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
- int rc;
-
- elog(DEBUG2, "writing stats file \"%s\"", statfile);
-
- /*
- * Open the statistics temp file to write out the current values.
- */
- fpout = AllocateFile(tmpfile, PG_BINARY_W);
- if (fpout == NULL)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not open temporary statistics file \"%s\": %m",
- tmpfile)));
- return;
- }
-
- /*
- * Set the timestamp of the stats file.
- */
- globalStats.stats_timestamp = GetCurrentTimestamp();
-
- /*
- * Write the file header --- currently just a format ID.
- */
- format_id = PGSTAT_FILE_FORMAT_ID;
- rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Write global stats struct
- */
- rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Write archiver stats struct
- */
- rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Walk through the database table.
- */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
- {
- /*
- * Write out the table and function stats for this DB into the
- * appropriate per-DB stat file, if required.
- */
- if (allDbs || pgstat_db_requested(dbentry->databaseid))
- {
- /* Make DB's timestamp consistent with the global stats */
- dbentry->stats_timestamp = globalStats.stats_timestamp;
-
- pgstat_write_db_statsfile(dbentry, permanent);
- }
-
- /*
- * Write out the DB entry. We don't write the tables or functions
- * pointers, since they're of no use to any other process.
- */
- fputc('D', fpout);
- rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * No more output to be done. Close the temp file and replace the old
- * pgstat.stat with it.  The ferror() check replaces testing for error
- * after each individual fputc or fwrite above.
- */
- fputc('E', fpout);
-
- if (ferror(fpout))
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not write temporary statistics file \"%s\": %m",
- tmpfile)));
- FreeFile(fpout);
- unlink(tmpfile);
- }
- else if (FreeFile(fpout) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not close temporary statistics file \"%s\": %m",
- tmpfile)));
- unlink(tmpfile);
- }
- else if (rename(tmpfile, statfile) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
- tmpfile, statfile)));
- unlink(tmpfile);
- }
-
- if (permanent)
- unlink(pgstat_stat_filename);
-
- /*
- * Now throw away the list of requests.  Note that requests sent after we
- * started the write are still waiting on the network socket.
- */
- list_free(pending_write_requests);
- pending_write_requests = NIL;
-}
-
-/*
- * return the filename for a DB stat file; filename is the output buffer,
- * of length len.
- */
-static void
-get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
- char *filename, int len)
-{
- int printed;
-
- /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
- printed = snprintf(filename, len, "%s/db_%u.%s",
-   permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
-   pgstat_stat_directory,
-   databaseid,
-   tempname ? "tmp" : "stat");
- if (printed >= len)
- elog(ERROR, "overlength pgstat path");
-}
-
-/* ----------
- * pgstat_write_db_statsfile() -
- * Write the stat file for a single database.
- *
- * If writing to the permanent file (happens when the collector is
- * shutting down only), remove the temporary file so that backends
- * starting up under a new postmaster can't read the old data before
- * the new collector is ready.
- * ----------
- */
-static void
-pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
-{
- HASH_SEQ_STATUS tstat;
- HASH_SEQ_STATUS fstat;
- PgStat_StatTabEntry *tabentry;
- PgStat_StatFuncEntry *funcentry;
- FILE   *fpout;
- int32 format_id;
- Oid dbid = dbentry->databaseid;
- int rc;
- char tmpfile[MAXPGPATH];
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
- get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "writing stats file \"%s\"", statfile);
-
- /*
- * Open the statistics temp file to write out the current values.
- */
- fpout = AllocateFile(tmpfile, PG_BINARY_W);
- if (fpout == NULL)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not open temporary statistics file \"%s\": %m",
- tmpfile)));
- return;
- }
-
- /*
- * Write the file header --- currently just a format ID.
- */
- format_id = PGSTAT_FILE_FORMAT_ID;
- rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
-
- /*
- * Walk through the database's access stats per table.
- */
- hash_seq_init(&tstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
- {
- fputc('T', fpout);
- rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * Walk through the database's function stats table.
- */
- hash_seq_init(&fstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
- {
- fputc('F', fpout);
- rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
- }
-
- /*
- * No more output to be done. Close the temp file and replace the old
- * pgstat.stat with it.  The ferror() check replaces testing for error
- * after each individual fputc or fwrite above.
- */
- fputc('E', fpout);
-
- if (ferror(fpout))
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not write temporary statistics file \"%s\": %m",
- tmpfile)));
- FreeFile(fpout);
- unlink(tmpfile);
- }
- else if (FreeFile(fpout) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not close temporary statistics file \"%s\": %m",
- tmpfile)));
- unlink(tmpfile);
- }
- else if (rename(tmpfile, statfile) < 0)
- {
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
- tmpfile, statfile)));
- unlink(tmpfile);
- }
-
- if (permanent)
- {
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
- unlink(statfile);
- }
-}
-
-/* ----------
- * pgstat_read_statsfiles() -
- *
- * Reads in some existing statistics collector files and returns the
- * databases hash table that is the top level of the data.
- *
- * If 'onlydb' is not InvalidOid, it means we only want data for that DB
- * plus the shared catalogs ("DB 0").  We'll still populate the DB hash
- * table for all databases, but we don't bother even creating table/function
- * hash tables for other databases.
- *
- * 'permanent' specifies reading from the permanent files not temporary ones.
- * When true (happens only when the collector is starting up), remove the
- * files after reading; the in-memory status is now authoritative, and the
- * files would be out of date in case somebody else reads them.
- *
- * If a 'deep' read is requested, table/function stats are read, otherwise
- * the table/function hash tables remain empty.
- * ----------
- */
-static HTAB *
-pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatDBEntry dbbuf;
- HASHCTL hash_ctl;
- HTAB   *dbhash;
- FILE   *fpin;
- int32 format_id;
- bool found;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
- /*
- * The tables will live in pgStatLocalContext.
- */
- pgstat_setup_memcxt();
-
- /*
- * Create the DB hashtable
- */
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- /*
- * Clear out global and archiver statistics so they start from zero in
- * case we can't load an existing statsfile.
- */
- memset(&globalStats, 0, sizeof(globalStats));
- memset(&archiverStats, 0, sizeof(archiverStats));
-
- /*
- * Set the current timestamp (will be kept only in case we can't load an
- * existing statsfile).
- */
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
-
- /*
- * Try to open the stats file. If it doesn't exist, the backends simply
- * return zero for anything and the collector simply starts from scratch
- * with empty counters.
- *
- * ENOENT is a possibility if the stats collector is not running or has
- * not yet written the stats file the first time.  Any other failure
- * condition is suspicious.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return dbhash;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- goto done;
- }
-
- /*
- * Read global stats struct
- */
- if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&globalStats, 0, sizeof(globalStats));
- goto done;
- }
-
- /*
- * In the collector, disregard the timestamp we read from the permanent
- * stats file; we should be willing to write a temp stats file immediately
- * upon the first request from any backend.  This only matters if the old
- * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
- * an unusual scenario.
- */
- if (pgStatRunningInCollector)
- globalStats.stats_timestamp = 0;
-
- /*
- * Read archiver stats struct
- */
- if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&archiverStats, 0, sizeof(archiverStats));
- goto done;
- }
-
- /*
- * We found an existing collector stats file. Read it and put all the
- * hashtable entries into place.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'D' A PgStat_StatDBEntry struct describing a database
- * follows.
- */
- case 'D':
- if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Add to the DB hash
- */
- dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
- (void *) &dbbuf.databaseid,
- HASH_ENTER,
- &found);
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * In the collector, disregard the timestamp we read from the
- * permanent stats file; we should be willing to write a temp
- * stats file immediately upon the first request from any
- * backend.
- */
- if (pgStatRunningInCollector)
- dbentry->stats_timestamp = 0;
-
- /*
- * Don't create tables/functions hashtables for uninteresting
- * databases.
- */
- if (onlydb != InvalidOid)
- {
- if (dbbuf.databaseid != onlydb &&
- dbbuf.databaseid != InvalidOid)
- break;
- }
-
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- /*
- * If requested, read the data from the database-specific
- * file.  Otherwise we just leave the hashtables empty.
- */
- if (deep)
- pgstat_read_db_statsfile(dbentry->databaseid,
- dbentry->tables,
- dbentry->functions,
- permanent);
-
- break;
-
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
-
- /* If requested to read the permanent file, also get rid of it. */
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
-
- return dbhash;
-}
-
-
-/* ----------
- * pgstat_read_db_statsfile() -
- *
- * Reads in the existing statistics collector file for the given database,
- * filling the passed-in tables and functions hash tables.
- *
- * As in pgstat_read_statsfiles, if the permanent file is requested, it is
- * removed after reading.
- *
- * Note: this code has the ability to skip storing per-table or per-function
- * data, if NULL is passed for the corresponding hashtable.  That's not used
- * at the moment though.
- * ----------
- */
-static void
-pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
- bool permanent)
-{
- PgStat_StatTabEntry *tabentry;
- PgStat_StatTabEntry tabbuf;
- PgStat_StatFuncEntry funcbuf;
- PgStat_StatFuncEntry *funcentry;
- FILE   *fpin;
- int32 format_id;
- bool found;
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
-
- /*
- * Try to open the stats file. If it doesn't exist, the backends simply
- * return zero for anything and the collector simply starts from scratch
- * with empty counters.
- *
- * ENOENT is a possibility if the stats collector is not running or has
- * not yet written the stats file the first time.  Any other failure
- * condition is suspicious.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- goto done;
- }
-
- /*
- * We found an existing collector stats file. Read it and put all the
- * hashtable entries into place.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'T' A PgStat_StatTabEntry follows.
- */
- case 'T':
- if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
-  fpin) != sizeof(PgStat_StatTabEntry))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Skip if table data not wanted.
- */
- if (tabhash == NULL)
- break;
-
- tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-   (void *) &tabbuf.tableid,
-   HASH_ENTER, &found);
-
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(tabentry, &tabbuf, sizeof(tabbuf));
- break;
-
- /*
- * 'F' A PgStat_StatFuncEntry follows.
- */
- case 'F':
- if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
-  fpin) != sizeof(PgStat_StatFuncEntry))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * Skip if function data not wanted.
- */
- if (funchash == NULL)
- break;
-
- funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
- (void *) &funcbuf.functionid,
- HASH_ENTER, &found);
-
- if (found)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- memcpy(funcentry, &funcbuf, sizeof(funcbuf));
- break;
-
- /*
- * 'E' The EOF marker of a complete stats file.
- */
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
-
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
-}
-
-/* ----------
- * pgstat_read_db_statsfile_timestamp() -
- *
- * Attempt to determine the timestamp of the last db statfile write.
- * Returns true if successful; the timestamp is stored in *ts.
- *
- * This needs to be careful about handling databases for which no stats file
- * exists, such as databases without a stat entry or those not yet written:
- *
- * - if there's a database entry in the global file, return the corresponding
- * stats_timestamp value.
- *
- * - if there's no db stat entry (e.g. for a new or inactive database),
- * there's no stats_timestamp value, but also nothing to write so we return
- * the timestamp of the global statfile.
- * ----------
- */
-static bool
-pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
-   TimestampTz *ts)
-{
- PgStat_StatDBEntry dbentry;
- PgStat_GlobalStats myGlobalStats;
- PgStat_ArchiverStats myArchiverStats;
- FILE   *fpin;
- int32 format_id;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
- /*
- * Try to open the stats file.  As above, anything but ENOENT is worthy of
- * complaining about.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return false;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read global stats struct
- */
- if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-  fpin) != sizeof(myGlobalStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read archiver stats struct
- */
- if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-  fpin) != sizeof(myArchiverStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /* By default, we're going to return the timestamp of the global file. */
- *ts = myGlobalStats.stats_timestamp;
-
- /*
- * We found an existing collector stats file.  Read it and look for a
- * record for the requested database.  If found, use its timestamp.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'D' A PgStat_StatDBEntry struct describing a database
- * follows.
- */
- case 'D':
- if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * If this is the DB we're looking for, save its timestamp and
- * we're done.
- */
- if (dbentry.databaseid == databaseid)
- {
- *ts = dbentry.stats_timestamp;
- goto done;
- }
-
- break;
-
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
- FreeFile(fpin);
- return true;
-}
-
-/*
- * If not already done, read the statistics collector stats file into
- * some hash tables.  The results will be kept until pgstat_clear_snapshot()
- * is called (typically, at end of transaction).
- */
-static void
-backend_read_statsfile(void)
-{
- TimestampTz min_ts = 0;
- TimestampTz ref_ts = 0;
- Oid inquiry_db;
- int count;
-
- /* already read it? */
- if (pgStatDBHash)
- return;
- Assert(!pgStatRunningInCollector);
-
- /*
- * In a normal backend, we check staleness of the data for our own DB, and
- * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
- * check staleness of the shared-catalog data, and send InvalidOid in
- * inquiry messages so as not to force writing unnecessary data.
- */
- if (IsAutoVacuumLauncherProcess())
- inquiry_db = InvalidOid;
- else
- inquiry_db = MyDatabaseId;
-
- /*
- * Loop until fresh enough stats file is available or we ran out of time.
- * The stats inquiry message is sent repeatedly in case collector drops
- * it; but not every single time, as that just swamps the collector.
- */
- for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
- {
- bool ok;
- TimestampTz file_ts = 0;
- TimestampTz cur_ts;
-
- CHECK_FOR_INTERRUPTS();
-
- ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
-
- cur_ts = GetCurrentTimestamp();
- /* Calculate min acceptable timestamp, if we didn't already */
- if (count == 0 || cur_ts < ref_ts)
- {
- /*
- * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
- * msec before now.  This indirectly ensures that the collector
- * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
- * an autovacuum worker, however, we want a lower delay to avoid
- * using stale data, so we use PGSTAT_RETRY_DELAY (since the
- * number of workers is low, this shouldn't be a problem).
- *
- * We don't recompute min_ts after sleeping, except in the
- * unlikely case that cur_ts went backwards.  So we might end up
- * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In
- * practice that shouldn't happen, though, as long as the sleep
- * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
- * tell the collector that our cutoff time is less than what we'd
- * actually accept.
- */
- ref_ts = cur_ts;
- if (IsAutoVacuumWorkerProcess())
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_RETRY_DELAY);
- else
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_STAT_INTERVAL);
- }
-
- /*
- * If the file timestamp is actually newer than cur_ts, we must have
- * had a clock glitch (system time went backwards) or there is clock
- * skew between our processor and the stats collector's processor.
- * Accept the file, but send an inquiry message anyway to make
- * pgstat_recv_inquiry do a sanity check on the collector's time.
- */
- if (ok && file_ts > cur_ts)
- {
- /*
- * A small amount of clock skew between processors isn't terribly
- * surprising, but a large difference is worth logging.  We
- * arbitrarily define "large" as 1000 msec.
- */
- if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
- {
- char   *filetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- filetime = pstrdup(timestamptz_to_str(file_ts));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG, "stats collector's time %s is later than backend local time %s",
- filetime, mytime);
- pfree(filetime);
- pfree(mytime);
- }
-
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
- break;
- }
-
- /* Normal acceptance case: file is not older than cutoff time */
- if (ok && file_ts >= min_ts)
- break;
-
- /* Not there or too old, so kick the collector and wait a bit */
- if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-
- pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
- }
-
- if (count >= PGSTAT_POLL_LOOP_COUNT)
- ereport(LOG,
- (errmsg("using stale statistics instead of current ones "
- "because stats collector is not responding")));
-
- /*
- * Autovacuum launcher wants stats about all databases, but a shallow read
- * is sufficient.  Regular backends want a deep read for just the tables
- * they can see (MyDatabaseId + shared catalogs).
- */
- if (IsAutoVacuumLauncherProcess())
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
- else
- pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
-}
-
-
-/* ----------
- * pgstat_setup_memcxt() -
- *
- * Create pgStatLocalContext, if not already done.
- * ----------
- */
-static void
-pgstat_setup_memcxt(void)
-{
- if (!pgStatLocalContext)
- pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
-   "Statistics snapshot",
-   ALLOCSET_SMALL_SIZES);
-}
-
-
-/* ----------
- * pgstat_clear_snapshot() -
- *
- * Discard any data collected in the current transaction.  Any subsequent
- * request will cause new snapshots to be read.
- *
- * This is also invoked during transaction commit or abort to discard
- * the no-longer-wanted snapshot.
- * ----------
- */
-void
-pgstat_clear_snapshot(void)
-{
- /* Release memory, if any was allocated */
- if (pgStatLocalContext)
- MemoryContextDelete(pgStatLocalContext);
-
- /* Reset variables */
- pgStatLocalContext = NULL;
- pgStatDBHash = NULL;
- localBackendStatusTable = NULL;
- localNumBackends = 0;
-}
-
-
-/* ----------
- * pgstat_recv_inquiry() -
- *
- * Process stat inquiry requests.
- * ----------
- */
-static void
-pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
-
- /*
- * If there's already a write request for this DB, there's nothing to do.
- *
- * Note that if a request is found, we return early and skip the below
- * check for clock skew.  This is okay, since the only way for a DB
- * request to be present in the list is that we have been here since the
- * last write round.  It seems sufficient to check for clock skew once per
- * write round.
- */
- if (list_member_oid(pending_write_requests, msg->databaseid))
- return;
-
- /*
- * Check to see if we last wrote this database at a time >= the requested
- * cutoff time.  If so, this is a stale request that was generated before
- * we updated the DB file, and we don't need to do so again.
- *
- * If the requestor's local clock time is older than stats_timestamp, we
- * should suspect a clock glitch, ie system time going backwards; though
- * the more likely explanation is just delayed message receipt.  It is
- * worth expending a GetCurrentTimestamp call to be sure, since a large
- * retreat in the system clock reading could otherwise cause us to neglect
- * to update the stats file for a long time.
- */
- dbentry = pgstat_get_db_entry(msg->databaseid, false);
- if (dbentry == NULL)
- {
- /*
- * We have no data for this DB.  Enter a write request anyway so that
- * the global stats will get updated.  This is needed to prevent
- * backend_read_statsfile from waiting for data that we cannot supply,
- * in the case of a new DB that nobody has yet reported any stats for.
- * See the behavior of pgstat_read_db_statsfile_timestamp.
- */
- }
- else if (msg->clock_time < dbentry->stats_timestamp)
- {
- TimestampTz cur_ts = GetCurrentTimestamp();
-
- if (cur_ts < dbentry->stats_timestamp)
- {
- /*
- * Sure enough, time went backwards.  Force a new stats file write
- * to get back in sync; but first, log a complaint.
- */
- char   *writetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG,
- "stats_timestamp %s is later than collector's time %s for database %u",
- writetime, mytime, dbentry->databaseid);
- pfree(writetime);
- pfree(mytime);
- }
- else
- {
- /*
- * Nope, it's just an old request.  Assuming msg's clock_time is
- * >= its cutoff_time, it must be stale, so we can ignore it.
- */
- return;
- }
- }
- else if (msg->cutoff_time <= dbentry->stats_timestamp)
- {
- /* Stale request, ignore it */
- return;
- }
-
- /*
- * We need to write this DB, so create a request.
- */
- pending_write_requests = lappend_oid(pending_write_requests,
- msg->databaseid);
-}
-
-
-/* ----------
- * pgstat_recv_tabstat() -
- *
- * Count what the backend has done.
- * ----------
- */
-static void
-pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- /*
- * Update database-wide stats.
- */
- dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
- dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
- dbentry->n_block_read_time += msg->m_block_read_time;
- dbentry->n_block_write_time += msg->m_block_write_time;
-
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
-
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &(tabmsg->t_id),
-   HASH_ENTER, &found);
-
- if (!found)
- {
- /*
- * If it's a new table entry, initialize counters to the values we
- * just got.
- */
- tabentry->numscans = tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated;
- tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit;
-
- tabentry->vacuum_timestamp = 0;
- tabentry->vacuum_count = 0;
- tabentry->autovac_vacuum_timestamp = 0;
- tabentry->autovac_vacuum_count = 0;
- tabentry->analyze_timestamp = 0;
- tabentry->analyze_count = 0;
- tabentry->autovac_analyze_timestamp = 0;
- tabentry->autovac_analyze_count = 0;
- }
- else
- {
- /*
- * Otherwise add the values to the existing entry.
- */
- tabentry->numscans += tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated;
- /* If table was truncated, first reset the live/dead counters */
- if (tabmsg->t_counts.t_truncated)
- {
- tabentry->n_live_tuples = 0;
- tabentry->n_dead_tuples = 0;
- }
- tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit;
- }
-
- /* Clamp n_live_tuples in case of negative delta_live_tuples */
- tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
- /* Likewise for n_dead_tuples */
- tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
-
- /*
- * Add per-table stats to the per-database entry, too.
- */
- dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned;
- dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated;
- dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
- }
-}
-
-
-/* ----------
- * pgstat_recv_tabpurge() -
- *
- * Arrange for dead table removal.
- * ----------
- */
-static void
-pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- int i;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->tables)
- return;
-
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->tables,
-   (void *) &(msg->m_tableid[i]),
-   HASH_REMOVE, NULL);
- }
-}
-
-
-/* ----------
- * pgstat_recv_dropdb() -
- *
- * Arrange for dead database removal
- * ----------
- */
-static void
-pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
-{
- Oid dbid = msg->m_databaseid;
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.
- */
- dbentry = pgstat_get_db_entry(dbid, false);
-
- /*
- * If found, remove it (along with the db statfile).
- */
- if (dbentry)
- {
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing stats file \"%s\"", statfile);
- unlink(statfile);
-
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- if (hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_REMOVE, NULL) == NULL)
- ereport(ERROR,
- (errmsg("database hash table corrupted during cleanup --- abort")));
- }
-}
-
-
-/* ----------
- * pgstat_recv_resetcounter() -
- *
- * Reset the statistics for the specified database.
- * ----------
- */
-static void
-pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.  Nothing to do if not there.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /*
- * We simply throw away all the database's table entries by recreating a
- * new hash table for them.
- */
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * Reset database-level stats, too.  This creates empty hash tables for
- * tables and functions.
- */
- reset_dbentry_counters(dbentry);
-}
-
-/* ----------
- * pgstat_recv_resetsharedcounter() -
- *
- * Reset some shared statistics of the cluster.
- * ----------
- */
-static void
-pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
-{
- if (msg->m_resettarget == RESET_BGWRITER)
- {
- /* Reset the global background writer statistics for the cluster. */
- memset(&globalStats, 0, sizeof(globalStats));
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
- else if (msg->m_resettarget == RESET_ARCHIVER)
- {
- /* Reset the archiver statistics for the cluster. */
- memset(&archiverStats, 0, sizeof(archiverStats));
- archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
-
- /*
- * Presumably the sender of this message validated the target, don't
- * complain here if it's not valid
- */
-}
-
-/* ----------
- * pgstat_recv_resetsinglecounter() -
- *
- * Reset a statistics for a single object
- * ----------
- */
-static void
-pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /* Set the reset timestamp for the whole database */
- dbentry->stat_reset_timestamp = GetCurrentTimestamp();
-
- /* Remove object if it exists, ignore it if not */
- if (msg->m_resettype == RESET_TABLE)
- (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
- else if (msg->m_resettype == RESET_FUNCTION)
- (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
-}
-
-/* ----------
- * pgstat_recv_autovac() -
- *
- * Process an autovacuum signalling message.
- * ----------
- */
-static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Store the last autovacuum time in the database's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->last_autovac_time = msg->m_start_time;
-}
-
-/* ----------
- * pgstat_recv_vacuum() -
- *
- * Process a VACUUM message.
- * ----------
- */
-static void
-pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- if (msg->m_autovacuum)
- {
- tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
- tabentry->autovac_vacuum_count++;
- }
- else
- {
- tabentry->vacuum_timestamp = msg->m_vacuumtime;
- tabentry->vacuum_count++;
- }
-}
-
-/* ----------
- * pgstat_recv_analyze() -
- *
- * Process an ANALYZE message.
- * ----------
- */
-static void
-pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- /*
- * If commanded, reset changes_since_analyze to zero.  This forgets any
- * changes that were committed while the ANALYZE was in progress, but we
- * have no good way to estimate how many of those there were.
- */
- if (msg->m_resetcounter)
- tabentry->changes_since_analyze = 0;
-
- if (msg->m_autovacuum)
- {
- tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
- tabentry->autovac_analyze_count++;
- }
- else
- {
- tabentry->analyze_timestamp = msg->m_analyzetime;
- tabentry->analyze_count++;
- }
-}
-
-
-/* ----------
- * pgstat_recv_archiver() -
- *
- * Process a ARCHIVER message.
- * ----------
- */
-static void
-pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
-{
- if (msg->m_failed)
- {
- /* Failed archival attempt */
- ++archiverStats.failed_count;
- memcpy(archiverStats.last_failed_wal, msg->m_xlog,
-   sizeof(archiverStats.last_failed_wal));
- archiverStats.last_failed_timestamp = msg->m_timestamp;
- }
- else
- {
- /* Successful archival operation */
- ++archiverStats.archived_count;
- memcpy(archiverStats.last_archived_wal, msg->m_xlog,
-   sizeof(archiverStats.last_archived_wal));
- archiverStats.last_archived_timestamp = msg->m_timestamp;
- }
-}
-
-/* ----------
- * pgstat_recv_bgwriter() -
- *
- * Process a BGWRITER message.
- * ----------
- */
-static void
-pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
-{
- globalStats.timed_checkpoints += msg->m_timed_checkpoints;
- globalStats.requested_checkpoints += msg->m_requested_checkpoints;
- globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
- globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
- globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
- globalStats.buf_written_clean += msg->m_buf_written_clean;
- globalStats.maxwritten_clean += msg->m_maxwritten_clean;
- globalStats.buf_written_backend += msg->m_buf_written_backend;
- globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
- globalStats.buf_alloc += msg->m_buf_alloc;
-}
-
-/* ----------
- * pgstat_recv_recoveryconflict() -
- *
- * Process a RECOVERYCONFLICT message.
- * ----------
- */
-static void
-pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- switch (msg->m_reason)
- {
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
-
- /*
- * Since we drop the information about the database as soon as it
- * replicates, there is no point in counting these conflicts.
- */
- break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
- dbentry->n_conflict_tablespace++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
- dbentry->n_conflict_lock++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
- dbentry->n_conflict_snapshot++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
- dbentry->n_conflict_bufferpin++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
- dbentry->n_conflict_startup_deadlock++;
- break;
- }
-}
-
-/* ----------
- * pgstat_recv_deadlock() -
- *
- * Process a DEADLOCK message.
- * ----------
- */
-static void
-pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_deadlocks++;
-}
-
-/* ----------
- * pgstat_recv_checksum_failure() -
- *
- * Process a CHECKSUMFAILURE message.
- * ----------
- */
-static void
-pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_checksum_failures += msg->m_failurecount;
- dbentry->last_checksum_failure = msg->m_failure_time;
-}
-
-/* ----------
- * pgstat_recv_tempfile() -
- *
- * Process a TEMPFILE message.
- * ----------
- */
-static void
-pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_temp_bytes += msg->m_filesize;
- dbentry->n_temp_files += 1;
-}
-
-/* ----------
- * pgstat_recv_funcstat() -
- *
- * Count what the backend has done.
- * ----------
- */
-static void
-pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
-{
- PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
- PgStat_StatDBEntry *dbentry;
- PgStat_StatFuncEntry *funcentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++, funcmsg++)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &(funcmsg->f_id),
- HASH_ENTER, &found);
-
- if (!found)
- {
- /*
- * If it's a new function entry, initialize counters to the values
- * we just got.
- */
- funcentry->f_numcalls = funcmsg->f_numcalls;
- funcentry->f_total_time = funcmsg->f_total_time;
- funcentry->f_self_time = funcmsg->f_self_time;
- }
- else
- {
- /*
- * Otherwise add the values to the existing entry.
- */
- funcentry->f_numcalls += funcmsg->f_numcalls;
- funcentry->f_total_time += funcmsg->f_total_time;
- funcentry->f_self_time += funcmsg->f_self_time;
- }
- }
-}
-
-/* ----------
- * pgstat_recv_funcpurge() -
- *
- * Arrange for dead function removal.
- * ----------
- */
-static void
-pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- int i;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->functions)
- return;
-
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->functions,
-   (void *) &(msg->m_functionid[i]),
-   HASH_REMOVE, NULL);
- }
-}
-
-/* ----------
- * pgstat_write_statsfile_needed() -
- *
- * Do we need to write out any stats files?
- * ----------
- */
-static bool
-pgstat_write_statsfile_needed(void)
-{
- if (pending_write_requests != NIL)
- return true;
-
- /* Everything was written recently */
- return false;
-}
-
-/* ----------
- * pgstat_db_requested() -
- *
- * Checks whether stats for a particular DB need to be written to a file.
- * ----------
- */
-static bool
-pgstat_db_requested(Oid databaseid)
-{
- /*
- * If any requests are outstanding at all, we should write the stats for
- * shared catalogs (the "database" with OID 0).  This ensures that
- * backends will see up-to-date stats for shared catalogs, even though
- * they send inquiry messages mentioning only their own DB.
- */
- if (databaseid == InvalidOid && pending_write_requests != NIL)
- return true;
-
- /* Search to see if there's an open request to write this database. */
- if (list_member_oid(pending_write_requests, databaseid))
- return true;
-
- return false;
-}
-
 /*
  * Convert a potentially unsafely truncated activity string (see
  * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 27a9e45074..d4a590fa5a 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -255,7 +255,6 @@ static pid_t StartupPID = 0,
  WalReceiverPID = 0,
  AutoVacPID = 0,
  PgArchPID = 0,
- PgStatPID = 0,
  SysLoggerPID = 0;
 
 /* Startup process's status */
@@ -503,7 +502,6 @@ typedef struct
  PGPROC   *AuxiliaryProcs;
  PGPROC   *PreparedXactProcs;
  PMSignalData *PMSignalState;
- InheritableSocket pgStatSock;
  pid_t PostmasterPid;
  TimestampTz PgStartTime;
  TimestampTz PgReloadTime;
@@ -1317,12 +1315,6 @@ PostmasterMain(int argc, char *argv[])
  */
  RemovePgTempFiles();
 
- /*
- * Initialize stats collection subsystem (this does NOT start the
- * collector process!)
- */
- pgstat_init();
-
  /*
  * Initialize the autovacuum subsystem (again, no process start yet)
  */
@@ -1771,11 +1763,6 @@ ServerLoop(void)
  start_autovac_launcher = false; /* signal processed */
  }
 
- /* If we have lost the stats collector, try to start a new one */
- if (PgStatPID == 0 &&
- (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
- PgStatPID = pgstat_start();
-
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
  PgArchPID = StartArchiver();
@@ -2660,8 +2647,6 @@ SIGHUP_handler(SIGNAL_ARGS)
  signal_child(PgArchPID, SIGHUP);
  if (SysLoggerPID != 0)
  signal_child(SysLoggerPID, SIGHUP);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGHUP);
 
  /* Reload authentication config files too */
  if (!load_hba())
@@ -3008,8 +2993,6 @@ reaper(SIGNAL_ARGS)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
  PgArchPID = StartArchiver();
- if (PgStatPID == 0)
- PgStatPID = pgstat_start();
 
  /* workers may be scheduled to start now */
  maybe_start_bgworkers();
@@ -3076,13 +3059,6 @@ reaper(SIGNAL_ARGS)
  SignalChildren(SIGUSR2);
 
  pmState = PM_SHUTDOWN_2;
-
- /*
- * We can also shut down the stats collector now; there's
- * nothing left for it to do.
- */
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  else
  {
@@ -3157,22 +3133,6 @@ reaper(SIGNAL_ARGS)
  continue;
  }
 
- /*
- * Was it the statistics collector?  If so, just try to start a new
- * one; no need to force reset of the rest of the system.  (If fail,
- * we'll try again in future cycles of the main loop.)
- */
- if (pid == PgStatPID)
- {
- PgStatPID = 0;
- if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("statistics collector process"),
- pid, exitstatus);
- if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
- PgStatPID = pgstat_start();
- continue;
- }
-
  /* Was it the system logger?  If so, try to start a new one */
  if (pid == SysLoggerPID)
  {
@@ -3631,22 +3591,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(PgArchPID, SIGQUIT);
  }
 
- /*
- * Force a power-cycle of the pgstat process too.  (This isn't absolutely
- * necessary, but it seems like a good idea for robustness, and it
- * simplifies the state-machine logic in the case where a shutdown request
- * arrives during crash processing.)
- */
- if (PgStatPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- "SIGQUIT",
- (int) PgStatPID)));
- signal_child(PgStatPID, SIGQUIT);
- allow_immediate_pgstat_restart();
- }
-
  /* We do NOT restart the syslogger */
 
  if (Shutdown != ImmediateShutdown)
@@ -3842,8 +3786,6 @@ PostmasterStateMachine(void)
  SignalChildren(SIGQUIT);
  if (PgArchPID != 0)
  signal_child(PgArchPID, SIGQUIT);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  }
  }
@@ -3878,8 +3820,7 @@ PostmasterStateMachine(void)
  * normal state transition leading up to PM_WAIT_DEAD_END, or during
  * FatalError processing.
  */
- if (dlist_is_empty(&BackendList) &&
- PgArchPID == 0 && PgStatPID == 0)
+ if (dlist_is_empty(&BackendList) && PgArchPID == 0)
  {
  /* These other guys should be dead already */
  Assert(StartupPID == 0);
@@ -4080,8 +4021,6 @@ TerminateChildren(int signal)
  signal_child(AutoVacPID, signal);
  if (PgArchPID != 0)
  signal_child(PgArchPID, signal);
- if (PgStatPID != 0)
- signal_child(PgStatPID, signal);
 }
 
 /*
@@ -5054,18 +4993,6 @@ SubPostmasterMain(int argc, char *argv[])
 
  StartBackgroundWorker();
  }
- if (strcmp(argv[1], "--forkarch") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgArchiverMain(argc, argv); /* does not return */
- }
- if (strcmp(argv[1], "--forkcol") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgstatCollectorMain(argc, argv); /* does not return */
- }
  if (strcmp(argv[1], "--forklog") == 0)
  {
  /* Do not want to attach to shared memory */
@@ -5178,12 +5105,6 @@ sigusr1_handler(SIGNAL_ARGS)
  if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
  pmState == PM_RECOVERY && Shutdown == NoShutdown)
  {
- /*
- * Likewise, start other special children as needed.
- */
- Assert(PgStatPID == 0);
- PgStatPID = pgstat_start();
-
  ereport(LOG,
  (errmsg("database system is ready to accept read only connections")));
 
@@ -6072,7 +5993,6 @@ extern slock_t *ShmemLock;
 extern slock_t *ProcStructLock;
 extern PGPROC *AuxiliaryProcs;
 extern PMSignalData *PMSignalState;
-extern pgsocket pgStatSock;
 extern pg_time_t first_syslogger_file_time;
 
 #ifndef WIN32
@@ -6128,8 +6048,6 @@ save_backend_variables(BackendParameters *param, Port *port,
  param->AuxiliaryProcs = AuxiliaryProcs;
  param->PreparedXactProcs = PreparedXactProcs;
  param->PMSignalState = PMSignalState;
- if (!write_inheritable_socket(&param->pgStatSock, pgStatSock, childPid))
- return false;
 
  param->PostmasterPid = PostmasterPid;
  param->PgStartTime = PgStartTime;
@@ -6364,7 +6282,6 @@ restore_backend_variables(BackendParameters *param, Port *port)
  AuxiliaryProcs = param->AuxiliaryProcs;
  PreparedXactProcs = param->PreparedXactProcs;
  PMSignalState = param->PMSignalState;
- read_inheritable_socket(&pgStatSock, &param->pgStatSock);
 
  PostmasterPid = param->PostmasterPid;
  PgStartTime = param->PgStartTime;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 885370698f..cfb3b91b11 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -147,6 +147,7 @@ CreateSharedMemoryAndSemaphores(void)
  size = add_size(size, BTreeShmemSize());
  size = add_size(size, SyncScanShmemSize());
  size = add_size(size, AsyncShmemSize());
+ size = add_size(size, StatsShmemSize());
 #ifdef EXEC_BACKEND
  size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -263,6 +264,7 @@ CreateSharedMemoryAndSemaphores(void)
  BTreeShmemInit();
  SyncScanShmemInit();
  AsyncShmemInit();
+ StatsShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index fb0bf44264..b423aaaf02 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -522,6 +522,7 @@ RegisterLWLockTranches(void)
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join");
  LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact");
+ LWLockRegisterTranche(LWTRANCHE_STATS, "activity stats");
 
  /* Register named tranches. */
  for (i = 0; i < NamedLWLockTrancheRequests; i++)
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index e8d8e6f828..bec27c3034 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3159,6 +3159,12 @@ ProcessInterrupts(void)
 
  if (ParallelMessagePending)
  HandleParallelMessages();
+
+ if (IdleStatsUpdateTimeoutPending)
+ {
+ IdleStatsUpdateTimeoutPending = false;
+ pgstat_report_stat(true);
+ }
 }
 
 
@@ -3733,6 +3739,7 @@ PostgresMain(int argc, char *argv[],
  sigjmp_buf local_sigjmp_buf;
  volatile bool send_ready_for_query = true;
  bool disable_idle_in_transaction_timeout = false;
+ bool disable_idle_stats_update_timeout = false;
 
  /* Initialize startup process environment if necessary. */
  if (!IsUnderPostmaster)
@@ -4173,9 +4180,17 @@ PostgresMain(int argc, char *argv[],
  }
  else
  {
- ProcessCompletedNotifies();
- pgstat_report_stat(false);
+ long stats_timeout;
 
+ ProcessCompletedNotifies();
+
+ stats_timeout = pgstat_report_stat(false);
+ if (stats_timeout > 0)
+ {
+ disable_idle_stats_update_timeout = true;
+ enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT,
+ stats_timeout);
+ }
  set_ps_display("idle", false);
  pgstat_report_activity(STATE_IDLE, NULL);
  }
@@ -4210,7 +4225,7 @@ PostgresMain(int argc, char *argv[],
  DoingCommandRead = false;
 
  /*
- * (5) turn off the idle-in-transaction timeout
+ * (5) turn off the idle-in-transaction timeout and stats update timeout
  */
  if (disable_idle_in_transaction_timeout)
  {
@@ -4218,6 +4233,12 @@ PostgresMain(int argc, char *argv[],
  disable_idle_in_transaction_timeout = false;
  }
 
+ if (disable_idle_stats_update_timeout)
+ {
+ disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false);
+ disable_idle_stats_update_timeout = false;
+ }
+
  /*
  * (6) check for any other interesting events that happened while we
  * slept.
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 3bf96de256..9c694f20c9 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -32,6 +32,7 @@ volatile sig_atomic_t QueryCancelPending = false;
 volatile sig_atomic_t ProcDiePending = false;
 volatile sig_atomic_t ClientConnectionLost = false;
 volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
+volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false;
 volatile sig_atomic_t ConfigReloadPending = false;
 volatile uint32 InterruptHoldoffCount = 0;
 volatile uint32 QueryCancelHoldoffCount = 0;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 29c5ec7b58..66c6a2b1e8 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -74,6 +74,7 @@ static void ShutdownPostgres(int code, Datum arg);
 static void StatementTimeoutHandler(void);
 static void LockTimeoutHandler(void);
 static void IdleInTransactionSessionTimeoutHandler(void);
+static void IdleStatsUpdateTimeoutHandler(void);
 static bool ThereIsAtLeastOneRole(void);
 static void process_startup_options(Port *port, bool am_superuser);
 static void process_settings(Oid databaseid, Oid roleid);
@@ -631,6 +632,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
  RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
  RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
  IdleInTransactionSessionTimeoutHandler);
+ RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT,
+ IdleStatsUpdateTimeoutHandler);
  }
 
  /*
@@ -1241,6 +1244,14 @@ IdleInTransactionSessionTimeoutHandler(void)
  SetLatch(MyLatch);
 }
 
+static void
+IdleStatsUpdateTimeoutHandler(void)
+{
+ IdleStatsUpdateTimeoutPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
 /*
  * Returns true if at least one role is defined in this database cluster.
  */
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b7d36b65dd..13be46c172 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 106;
+use Test::More tests => 105;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -123,7 +123,7 @@ is_deeply(
 
 # Contents of these directories should not be copied.
 foreach my $dirname (
- qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans)
+ qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans)
   )
 {
  is_deeply(
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1f4db67f3f..43250c3885 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -82,6 +82,7 @@ extern PGDLLIMPORT volatile sig_atomic_t InterruptPending;
 extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending;
 extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
 extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
+extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending;
 extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
 
 extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 65713abc2b..c9fbcead3f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1,7 +1,7 @@
 /* ----------
  * pgstat.h
  *
- * Definitions for the PostgreSQL statistics collector daemon.
+ * Definitions for the PostgreSQL statistics collector facility.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -13,10 +13,11 @@
 
 #include "datatype/timestamp.h"
 #include "libpq/pqcomm.h"
-#include "port/atomics.h"
+#include "lib/dshash.h"
 #include "portability/instr_time.h"
 #include "postmaster/pgarch.h"
 #include "storage/proc.h"
+#include "storage/lwlock.h"
 #include "utils/hsearch.h"
 #include "utils/relcache.h"
 
@@ -40,33 +41,6 @@ typedef enum TrackFunctionsLevel
  TRACK_FUNC_ALL
 } TrackFunctionsLevel;
 
-/* ----------
- * The types of backend -> collector messages
- * ----------
- */
-typedef enum StatMsgType
-{
- PGSTAT_MTYPE_DUMMY,
- PGSTAT_MTYPE_INQUIRY,
- PGSTAT_MTYPE_TABSTAT,
- PGSTAT_MTYPE_TABPURGE,
- PGSTAT_MTYPE_DROPDB,
- PGSTAT_MTYPE_RESETCOUNTER,
- PGSTAT_MTYPE_RESETSHAREDCOUNTER,
- PGSTAT_MTYPE_RESETSINGLECOUNTER,
- PGSTAT_MTYPE_AUTOVAC_START,
- PGSTAT_MTYPE_VACUUM,
- PGSTAT_MTYPE_ANALYZE,
- PGSTAT_MTYPE_ARCHIVER,
- PGSTAT_MTYPE_BGWRITER,
- PGSTAT_MTYPE_FUNCSTAT,
- PGSTAT_MTYPE_FUNCPURGE,
- PGSTAT_MTYPE_RECOVERYCONFLICT,
- PGSTAT_MTYPE_TEMPFILE,
- PGSTAT_MTYPE_DEADLOCK,
- PGSTAT_MTYPE_CHECKSUMFAILURE
-} StatMsgType;
-
 /* ----------
  * The data type used for counters.
  * ----------
@@ -77,9 +51,8 @@ typedef int64 PgStat_Counter;
  * PgStat_TableCounts The actual per-table counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
- * It is a component of PgStat_TableStatus (within-backend state) and
- * PgStat_TableEntry (the transmitted message format).
+ * it against zeroes to detect whether there are any counts to write.
+ * It is a component of PgStat_TableStatus (within-backend state).
  *
  * Note: for a table, tuples_returned is the number of tuples successfully
  * fetched by heap_getnext, while tuples_fetched is the number of tuples
@@ -115,13 +88,6 @@ typedef struct PgStat_TableCounts
  PgStat_Counter t_blocks_hit;
 } PgStat_TableCounts;
 
-/* Possible targets for resetting cluster-wide shared values */
-typedef enum PgStat_Shared_Reset_Target
-{
- RESET_ARCHIVER,
- RESET_BGWRITER
-} PgStat_Shared_Reset_Target;
-
 /* Possible object types for resetting single counters */
 typedef enum PgStat_Single_Reset_Type
 {
@@ -180,236 +146,12 @@ typedef struct PgStat_TableXactStatus
 } PgStat_TableXactStatus;
 
 
-/* ------------------------------------------------------------
- * Message formats follow
- * ------------------------------------------------------------
- */
-
-
 /* ----------
- * PgStat_MsgHdr The common message header
- * ----------
- */
-typedef struct PgStat_MsgHdr
-{
- StatMsgType m_type;
- int m_size;
-} PgStat_MsgHdr;
-
-/* ----------
- * Space available in a message.  This will keep the UDP packets below 1K,
- * which should fit unfragmented into the MTU of the loopback interface.
- * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most
- * platforms, but we're being conservative here.)
- * ----------
- */
-#define PGSTAT_MAX_MSG_SIZE 1000
-#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr))
-
-
-/* ----------
- * PgStat_MsgDummy A dummy message, ignored by the collector
- * ----------
- */
-typedef struct PgStat_MsgDummy
-{
- PgStat_MsgHdr m_hdr;
-} PgStat_MsgDummy;
-
-
-/* ----------
- * PgStat_MsgInquiry Sent by a backend to ask the collector
- * to write the stats file(s).
- *
- * Ordinarily, an inquiry message prompts writing of the global stats file,
- * the stats file for shared catalogs, and the stats file for the specified
- * database.  If databaseid is InvalidOid, only the first two are written.
- *
- * New file(s) will be written only if the existing file has a timestamp
- * older than the specified cutoff_time; this prevents duplicated effort
- * when multiple requests arrive at nearly the same time, assuming that
- * backends send requests with cutoff_times a little bit in the past.
- *
- * clock_time should be the requestor's current local time; the collector
- * uses this to check for the system clock going backward, but it has no
- * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
- * ----------
- */
-
-typedef struct PgStat_MsgInquiry
-{
- PgStat_MsgHdr m_hdr;
- TimestampTz clock_time; /* observed local clock time */
- TimestampTz cutoff_time; /* minimum acceptable file timestamp */
- Oid databaseid; /* requested DB (InvalidOid => shared only) */
-} PgStat_MsgInquiry;
-
-
-/* ----------
- * PgStat_TableEntry Per-table info in a MsgTabstat
- * ----------
- */
-typedef struct PgStat_TableEntry
-{
- Oid t_id;
- PgStat_TableCounts t_counts;
-} PgStat_TableEntry;
-
-/* ----------
- * PgStat_MsgTabstat Sent by the backend to report table
- * and buffer access statistics.
- * ----------
- */
-#define PGSTAT_NUM_TABENTRIES  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \
- / sizeof(PgStat_TableEntry))
-
-typedef struct PgStat_MsgTabstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- int m_xact_commit;
- int m_xact_rollback;
- PgStat_Counter m_block_read_time; /* times in microseconds */
- PgStat_Counter m_block_write_time;
- PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES];
-} PgStat_MsgTabstat;
-
-
-/* ----------
- * PgStat_MsgTabpurge Sent by the backend to tell the collector
- * about dead tables.
- * ----------
- */
-#define PGSTAT_NUM_TABPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgTabpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_tableid[PGSTAT_NUM_TABPURGE];
-} PgStat_MsgTabpurge;
-
-
-/* ----------
- * PgStat_MsgDropdb Sent by the backend to tell the collector
- * about a dropped database
- * ----------
- */
-typedef struct PgStat_MsgDropdb
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDropdb;
-
-
-/* ----------
- * PgStat_MsgResetcounter Sent by the backend to tell the collector
- * to reset counters
- * ----------
- */
-typedef struct PgStat_MsgResetcounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgResetcounter;
-
-/* ----------
- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
- * to reset a shared counter
- * ----------
- */
-typedef struct PgStat_MsgResetsharedcounter
-{
- PgStat_MsgHdr m_hdr;
- PgStat_Shared_Reset_Target m_resettarget;
-} PgStat_MsgResetsharedcounter;
-
-/* ----------
- * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector
- * to reset a single counter
- * ----------
- */
-typedef struct PgStat_MsgResetsinglecounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- PgStat_Single_Reset_Type m_resettype;
- Oid m_objectid;
-} PgStat_MsgResetsinglecounter;
-
-/* ----------
- * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal
- * that a database is going to be processed
- * ----------
- */
-typedef struct PgStat_MsgAutovacStart
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- TimestampTz m_start_time;
-} PgStat_MsgAutovacStart;
-
-
-/* ----------
- * PgStat_MsgVacuum Sent by the backend or autovacuum daemon
- * after VACUUM
- * ----------
- */
-typedef struct PgStat_MsgVacuum
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- TimestampTz m_vacuumtime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgVacuum;
-
-
-/* ----------
- * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon
- * after ANALYZE
- * ----------
- */
-typedef struct PgStat_MsgAnalyze
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- bool m_resetcounter;
- TimestampTz m_analyzetime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgAnalyze;
-
-
-/* ----------
- * PgStat_MsgArchiver Sent by the archiver to update statistics.
- * ----------
- */
-typedef struct PgStat_MsgArchiver
-{
- PgStat_MsgHdr m_hdr;
- bool m_failed; /* Failed attempt */
- char m_xlog[MAX_XFN_CHARS + 1];
- TimestampTz m_timestamp;
-} PgStat_MsgArchiver;
-
-/* ----------
- * PgStat_MsgBgWriter Sent by the bgwriter to update statistics.
+ * PgStat_MsgBgWriter bgwriter statistics
  * ----------
  */
 typedef struct PgStat_MsgBgWriter
 {
- PgStat_MsgHdr m_hdr;
-
  PgStat_Counter m_timed_checkpoints;
  PgStat_Counter m_requested_checkpoints;
  PgStat_Counter m_buf_written_checkpoints;
@@ -422,38 +164,14 @@ typedef struct PgStat_MsgBgWriter
  PgStat_Counter m_checkpoint_sync_time;
 } PgStat_MsgBgWriter;
 
-/* ----------
- * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict
- * ----------
- */
-typedef struct PgStat_MsgRecoveryConflict
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- int m_reason;
-} PgStat_MsgRecoveryConflict;
-
-/* ----------
- * PgStat_MsgTempFile Sent by the backend upon creating a temp file
- * ----------
- */
-typedef struct PgStat_MsgTempFile
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- size_t m_filesize;
-} PgStat_MsgTempFile;
-
 /* ----------
  * PgStat_FunctionCounts The actual per-function counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
+ * it against zeroes to detect whether there are any counts to write.
  *
  * Note that the time counters are in instr_time format here.  We convert to
- * microseconds in PgStat_Counter format when transmitting to the collector.
+ * microseconds in PgStat_Counter format when writing to shared statistics.
  * ----------
  */
 typedef struct PgStat_FunctionCounts
@@ -485,96 +203,8 @@ typedef struct PgStat_FunctionEntry
  PgStat_Counter f_self_time;
 } PgStat_FunctionEntry;
 
-/* ----------
- * PgStat_MsgFuncstat Sent by the backend to report function
- * usage statistics.
- * ----------
- */
-#define PGSTAT_NUM_FUNCENTRIES \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(PgStat_FunctionEntry))
-
-typedef struct PgStat_MsgFuncstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES];
-} PgStat_MsgFuncstat;
-
-/* ----------
- * PgStat_MsgFuncpurge Sent by the backend to tell the collector
- * about dead functions.
- * ----------
- */
-#define PGSTAT_NUM_FUNCPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgFuncpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_functionid[PGSTAT_NUM_FUNCPURGE];
-} PgStat_MsgFuncpurge;
-
-/* ----------
- * PgStat_MsgDeadlock Sent by the backend to tell the collector
- * about a deadlock that occurred.
- * ----------
- */
-typedef struct PgStat_MsgDeadlock
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDeadlock;
-
-/* ----------
- * PgStat_MsgChecksumFailure Sent by the backend to tell the collector
- * about checksum failures noticed.
- * ----------
- */
-typedef struct PgStat_MsgChecksumFailure
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_failurecount;
- TimestampTz m_failure_time;
-} PgStat_MsgChecksumFailure;
-
-
-/* ----------
- * PgStat_Msg Union over all possible messages.
- * ----------
- */
-typedef union PgStat_Msg
-{
- PgStat_MsgHdr msg_hdr;
- PgStat_MsgDummy msg_dummy;
- PgStat_MsgInquiry msg_inquiry;
- PgStat_MsgTabstat msg_tabstat;
- PgStat_MsgTabpurge msg_tabpurge;
- PgStat_MsgDropdb msg_dropdb;
- PgStat_MsgResetcounter msg_resetcounter;
- PgStat_MsgResetsharedcounter msg_resetsharedcounter;
- PgStat_MsgResetsinglecounter msg_resetsinglecounter;
- PgStat_MsgAutovacStart msg_autovacuum_start;
- PgStat_MsgVacuum msg_vacuum;
- PgStat_MsgAnalyze msg_analyze;
- PgStat_MsgArchiver msg_archiver;
- PgStat_MsgBgWriter msg_bgwriter;
- PgStat_MsgFuncstat msg_funcstat;
- PgStat_MsgFuncpurge msg_funcpurge;
- PgStat_MsgRecoveryConflict msg_recoveryconflict;
- PgStat_MsgDeadlock msg_deadlock;
- PgStat_MsgTempFile msg_tempfile;
- PgStat_MsgChecksumFailure msg_checksumfailure;
-} PgStat_Msg;
-
-
 /* ------------------------------------------------------------
- * Statistic collector data structures follow
+ * Statistic collector data structures on file and shared memory follow
  *
  * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these
  * data structures change.
@@ -614,16 +244,29 @@ typedef struct PgStat_StatDBEntry
  PgStat_Counter n_block_write_time;
 
  TimestampTz stat_reset_timestamp;
- TimestampTz stats_timestamp; /* time of db stats file update */
+ TimestampTz stats_timestamp; /* time of db stats update */
 
  /*
- * tables and functions must be last in the struct, because we don't write
- * the pointers out to the stats file.
+ * The followings must be last in the struct, because we don't write them
+ * out to the stats file.
  */
- HTAB   *tables;
- HTAB   *functions;
+ int generation; /* current generation of the below */
+ int refcnt; /* current gen reference count */
+ dshash_table_handle tables; /* current gen tables hash */
+ dshash_table_handle functions; /* current gen functions hash */
+ int prev_refcnt; /* prev gen reference count */
+ dshash_table_handle prev_tables; /* prev gen tables hash */
+ dshash_table_handle prev_functions; /* prev gen functions hash */
+ LWLock lock; /* Lock for the above members */
+
+ /* non-shared members */
+ HTAB *snapshot_tables; /* table entry snapshot */
+ HTAB *snapshot_functions; /* function entry snapshot */
+ dshash_table *dshash_tables; /* attached tables dshash */
+ dshash_table *dshash_functions; /* attached functions dshash */
 } PgStat_StatDBEntry;
 
+#define SHARED_DBENT_SIZE offsetof(PgStat_StatDBEntry, snapshot_tables)
 
 /* ----------
  * PgStat_StatTabEntry The collector's data per table (or index)
@@ -662,7 +305,7 @@ typedef struct PgStat_StatTabEntry
 
 
 /* ----------
- * PgStat_StatFuncEntry The collector's data per function
+ * PgStat_StatFuncEntry per function stats data
  * ----------
  */
 typedef struct PgStat_StatFuncEntry
@@ -677,7 +320,7 @@ typedef struct PgStat_StatFuncEntry
 
 
 /*
- * Archiver statistics kept in the stats collector
+ * Archiver statistics kept in the shared stats
  */
 typedef struct PgStat_ArchiverStats
 {
@@ -693,7 +336,7 @@ typedef struct PgStat_ArchiverStats
 } PgStat_ArchiverStats;
 
 /*
- * Global statistics kept in the stats collector
+ * Global statistics kept in the shared stats
  */
 typedef struct PgStat_GlobalStats
 {
@@ -779,7 +422,6 @@ typedef enum
  WAIT_EVENT_CHECKPOINTER_MAIN,
  WAIT_EVENT_LOGICAL_APPLY_MAIN,
  WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
- WAIT_EVENT_PGSTAT_MAIN,
  WAIT_EVENT_RECOVERY_WAL_ALL,
  WAIT_EVENT_RECOVERY_WAL_STREAM,
  WAIT_EVENT_SYSLOGGER_MAIN,
@@ -1214,6 +856,8 @@ extern bool pgstat_track_counts;
 extern int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
+
+/* No longer used, but will be removed with GUC */
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;
 
@@ -1235,29 +879,26 @@ extern PgStat_Counter pgStatBlockWriteTime;
 extern Size BackendStatusShmemSize(void);
 extern void CreateSharedBackendStatus(void);
 
-extern void pgstat_init(void);
-extern int pgstat_start(void);
+extern Size StatsShmemSize(void);
+extern void StatsShmemInit(void);
+
 extern void pgstat_reset_all(void);
-extern void allow_immediate_pgstat_restart(void);
-
-#ifdef EXEC_BACKEND
-extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
 
+/* File input/output functions  */
+extern void pgstat_read_statsfiles(void);
+extern void pgstat_write_statsfiles(void);
 
 /* ----------
  * Functions called from backends
  * ----------
  */
-extern void pgstat_ping(void);
-
-extern void pgstat_report_stat(bool force);
+extern long pgstat_report_stat(bool force);
 extern void pgstat_vacuum_stat(void);
 extern void pgstat_drop_database(Oid databaseid);
 
 extern void pgstat_clear_snapshot(void);
 extern void pgstat_reset_counters(void);
-extern void pgstat_reset_shared_counters(const char *);
+extern void pgstat_reset_shared_counters(const char *target);
 extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type);
 
 extern void pgstat_report_autovac(Oid dboid);
@@ -1429,11 +1070,13 @@ extern void pgstat_send_bgwriter(void);
  */
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
+extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid relid);
 extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
 extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid);
 extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern void pgstat_clear_snapshot(void);
 
 #endif /* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f627dfedc5..97801f4791 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -220,6 +220,7 @@ typedef enum BuiltinTrancheIds
  LWTRANCHE_TBM,
  LWTRANCHE_PARALLEL_APPEND,
  LWTRANCHE_SXACT,
+ LWTRANCHE_STATS,
  LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
 
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 9244a2a7b7..a9b625211b 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -31,6 +31,7 @@ typedef enum TimeoutId
  STANDBY_TIMEOUT,
  STANDBY_LOCK_TIMEOUT,
  IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
+ IDLE_STATS_UPDATE_TIMEOUT,
  /* First user-definable timeout reason */
  USER_TIMEOUT,
  /* Maximum number of timeout reasons */
--
2.16.3


From eda37e6344f6f848234e4ee79563b629a76737e6 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Tue, 27 Nov 2018 14:42:12 +0900
Subject: [PATCH v23 5/5] Remove the GUC stats_temp_directory

The GUC used to specify the directory to store temporary statistics
files. It is no longer needed by the stats collector but still used by
the programs in bin and contrib, and maybe other extensions. Thus this
patch removes the GUC but some backing variables and macro definitions
are left alone for backward compatibility.
---
 doc/src/sgml/backup.sgml                      |  2 --
 doc/src/sgml/config.sgml                      | 19 -------------
 doc/src/sgml/monitoring.sgml                  |  7 +----
 doc/src/sgml/storage.sgml                     |  3 +-
 src/backend/postmaster/pgstat.c               | 13 ++++-----
 src/backend/replication/basebackup.c          | 13 ++-------
 src/backend/utils/misc/guc.c                  | 41 ---------------------------
 src/backend/utils/misc/postgresql.conf.sample |  1 -
 src/include/pgstat.h                          |  5 +++-
 src/test/perl/PostgresNode.pm                 |  4 ---
 10 files changed, 14 insertions(+), 94 deletions(-)

diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml
index bdc9026c62..2885540362 100644
--- a/doc/src/sgml/backup.sgml
+++ b/doc/src/sgml/backup.sgml
@@ -1146,8 +1146,6 @@ SELECT pg_stop_backup();
     <filename>pg_snapshots/</filename>, <filename>pg_stat_tmp/</filename>,
     and <filename>pg_subtrans/</filename> (but not the directories themselves) can be
     omitted from the backup as they will be initialized on postmaster startup.
-    If <xref linkend="guc-stats-temp-directory"/> is set and is under the data
-    directory then the contents of that directory can also be omitted.
    </para>
 
    <para>
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6612f95f9f..b346809c11 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6818,25 +6818,6 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-stats-temp-directory" xreflabel="stats_temp_directory">
-      <term><varname>stats_temp_directory</varname> (<type>string</type>)
-      <indexterm>
-       <primary><varname>stats_temp_directory</varname> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        Sets the directory to store temporary statistics data in. This can be
-        a path relative to the data directory or an absolute path. The default
-        is <filename>pg_stat_tmp</filename>. Pointing this at a RAM-based
-        file system will decrease physical I/O requirements and can lead to
-        improved performance.
-        This parameter can only be set in the <filename>postgresql.conf</filename>
-        file or on the server command line.
-       </para>
-      </listitem>
-     </varlistentry>
-
      </variablelist>
     </sect2>
 
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index ea6aad4d1e..33ad2b8be8 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -195,12 +195,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
 
   <para>
    The statistics collector transmits the collected information to other
-   <productname>PostgreSQL</productname> processes through temporary files.
-   These files are stored in the directory named by the
-   <xref linkend="guc-stats-temp-directory"/> parameter,
-   <filename>pg_stat_tmp</filename> by default.
-   For better performance, <varname>stats_temp_directory</varname> can be
-   pointed at a RAM-based file system, decreasing physical I/O requirements.
+   <productname>PostgreSQL</productname> processes through shared memory.
    When the server shuts down cleanly, a permanent copy of the statistics
    data is stored in the <filename>pg_stat</filename> subdirectory, so that
    statistics can be retained across server restarts.  When recovery is
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 1c19e863d2..2f04bb68bb 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -122,8 +122,7 @@ Item
 
 <row>
  <entry><filename>pg_stat_tmp</filename></entry>
- <entry>Subdirectory containing temporary files for the statistics
-  subsystem</entry>
+ <entry>Subdirectory containing ephemeral files for extensions</entry>
 </row>
 
 <row>
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index c0b20763b0..6b8025ad13 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -108,15 +108,12 @@ bool pgstat_track_counts = false;
 int pgstat_track_functions = TRACK_FUNC_OFF;
 int pgstat_track_activity_query_size = 1024;
 
-/* ----------
- * Built from GUC parameter
- * ----------
+/*
+ * This used to be a GUC variable and is no longer used in this file, but left
+ * alone just for backward compatibility for extensions, having the default
+ * value.
  */
-char   *pgstat_stat_directory = NULL;
-
-/* No longer used, but will be removed with GUC */
-char   *pgstat_stat_filename = NULL;
-char   *pgstat_stat_tmpname = NULL;
+char   *pgstat_stat_directory = PG_STAT_TMP_DIR;
 
 #define StatsLock (&StatsShmem->StatsMainLock)
 
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index d0f210de8c..39fcf29ff2 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -242,11 +242,8 @@ perform_base_backup(basebackup_options *opt)
  TimeLineID endtli;
  StringInfo labelfile;
  StringInfo tblspc_map_file = NULL;
- int datadirpathlen;
  List   *tablespaces = NIL;
 
- datadirpathlen = strlen(DataDir);
-
  backup_started_in_recovery = RecoveryInProgress();
 
  labelfile = makeStringInfo();
@@ -277,13 +274,9 @@ perform_base_backup(basebackup_options *opt)
  * Calculate the relative path of temporary statistics directory in
  * order to skip the files which are located in that directory later.
  */
- if (is_absolute_path(pgstat_stat_directory) &&
- strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
- else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory);
- else
- statrelpath = pgstat_stat_directory;
+
+ Assert(strchr(PG_STAT_TMP_DIR, '/') == NULL);
+ statrelpath = psprintf("./%s", PG_STAT_TMP_DIR);
 
  /* Add a node for the base directory at the end */
  ti = palloc0(sizeof(tablespaceinfo));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2178e1cf5e..50625421ab 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -194,7 +194,6 @@ static bool check_max_wal_senders(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_effective_io_concurrency(int newval, void *extra);
-static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
 static bool check_cluster_name(char **newval, void **extra, GucSource source);
@@ -4072,17 +4071,6 @@ static struct config_string ConfigureNamesString[] =
  NULL, NULL, NULL
  },
 
- {
- {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR,
- gettext_noop("Writes temporary statistics files to the specified directory."),
- NULL,
- GUC_SUPERUSER_ONLY
- },
- &pgstat_temp_directory,
- PG_STAT_TMP_DIR,
- check_canonical_path, assign_pgstat_temp_directory, NULL
- },
-
  {
  {"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER,
  gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
@@ -11352,35 +11340,6 @@ assign_effective_io_concurrency(int newval, void *extra)
 #endif /* USE_PREFETCH */
 }
 
-static void
-assign_pgstat_temp_directory(const char *newval, void *extra)
-{
- /* check_canonical_path already canonicalized newval for us */
- char   *dname;
- char   *tname;
- char   *fname;
-
- /* directory */
- dname = guc_malloc(ERROR, strlen(newval) + 1); /* runtime dir */
- sprintf(dname, "%s", newval);
-
- /* global stats */
- tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */
- sprintf(tname, "%s/global.tmp", newval);
- fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */
- sprintf(fname, "%s/global.stat", newval);
-
- if (pgstat_stat_directory)
- free(pgstat_stat_directory);
- pgstat_stat_directory = dname;
- if (pgstat_stat_tmpname)
- free(pgstat_stat_tmpname);
- pgstat_stat_tmpname = tname;
- if (pgstat_stat_filename)
- free(pgstat_stat_filename);
- pgstat_stat_filename = fname;
-}
-
 static bool
 check_application_name(char **newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0fc23e3a61..66f539c4bb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -558,7 +558,6 @@
 #track_io_timing = off
 #track_functions = none # none, pl, all
 #track_activity_query_size = 1024 # (change requires restart)
-#stats_temp_directory = 'pg_stat_tmp'
 
 
 # - Monitoring -
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c9fbcead3f..e9e18ed27a 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -30,7 +30,10 @@
 #define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat"
 #define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp"
 
-/* Default directory to store temporary statistics data in */
+/*
+ * This used to be the directory to store temporary statistics data in but is
+ * no longer used. Defined here for backward compatibility.
+ */
 #define PG_STAT_TMP_DIR "pg_stat_tmp"
 
 /* Values for track_functions GUC variable --- order is significant! */
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 270bd6c856..c604c5e90b 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -455,10 +455,6 @@ sub init
  print $conf TestLib::slurp_file($ENV{TEMP_CONFIG})
   if defined $ENV{TEMP_CONFIG};
 
- # XXX Neutralize any stats_temp_directory in TEMP_CONFIG.  Nodes running
- # concurrently must not share a stats_temp_directory.
- print $conf "stats_temp_directory = 'pg_stat_tmp'\n";
-
  if ($params{allows_streaming})
  {
  if ($params{allows_streaming} eq "logical")
--
2.16.3

Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Michael Paquier-2
On Fri, Sep 27, 2019 at 09:46:47AM +0900, Kyotaro Horiguchi wrote:
> Affected by the code movement in 9a86f03b4e. Just
> rebased. Thanks.

This does not apply anymore.  Could you provide a rebase?  I have
moved the patch to next CF, waiting on author.

Thanks,
--
Michael

signature.asc (849 bytes) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: shared-memory based stats collector

Kyotaro Horiguchi-4
At Sun, 1 Dec 2019 11:12:32 +0900, Michael Paquier <[hidden email]> wrote in
> On Fri, Sep 27, 2019 at 09:46:47AM +0900, Kyotaro Horiguchi wrote:
> > Affected by the code movement in 9a86f03b4e. Just
> > rebased. Thanks.
>
> This does not apply anymore.  Could you provide a rebase?  I have
> moved the patch to next CF, waiting on author.

Thanks! Rebased.

# I should design then run a performance test on this..

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

From 761b0c55e88acc90c143d29a7d53dc6bb0495b7b Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Fri, 29 Jun 2018 16:41:04 +0900
Subject: [PATCH v24 1/5] sequential scan for dshash

Add sequential scan feature to dshash.
---
 src/backend/lib/dshash.c | 188 ++++++++++++++++++++++++++++++++++++++-
 src/include/lib/dshash.h |  23 ++++-
 2 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 350f8c0a66..4f0c7ec840 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -112,6 +112,7 @@ struct dshash_table
  size_t size_log2; /* log2(number of buckets) */
  bool find_locked; /* Is any partition lock held by 'find'? */
  bool find_exclusively_locked; /* ... exclusively? */
+ bool seqscan_running;/* now under sequential scan */
 };
 
 /* Given a pointer to an item, find the entry (user data) it holds. */
@@ -127,6 +128,10 @@ struct dshash_table
 #define NUM_SPLITS(size_log2) \
  (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
 
+/* How many buckets are there in a given size? */
+#define NUM_BUCKETS(size_log2) \
+ (((size_t) 1) << (size_log2))
+
 /* How many buckets are there in each partition at a given size? */
 #define BUCKETS_PER_PARTITION(size_log2) \
  (((size_t) 1) << NUM_SPLITS(size_log2))
@@ -153,6 +158,10 @@ struct dshash_table
 #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
  ((partition) << NUM_SPLITS(size_log2))
 
+/* Choose partition based on bucket index. */
+#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
+ ((bucket_idx) >> NUM_SPLITS(size_log2))
+
 /* The head of the active bucket for a given hash value (lvalue). */
 #define BUCKET_FOR_HASH(hash_table, hash) \
  (hash_table->buckets[ \
@@ -228,6 +237,7 @@ dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
 
  /*
  * Set up the initial array of buckets.  Our initial size is the same as
@@ -279,6 +289,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params,
  hash_table->control = dsa_get_address(area, control);
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
+ hash_table->seqscan_running = false;
  Assert(hash_table->control->magic == DSHASH_MAGIC);
 
  /*
@@ -324,7 +335,7 @@ dshash_destroy(dshash_table *hash_table)
  ensure_valid_bucket_pointers(hash_table);
 
  /* Free all the entries. */
- size = ((size_t) 1) << hash_table->size_log2;
+ size = NUM_BUCKETS(hash_table->size_log2);
  for (i = 0; i < size; ++i)
  {
  dsa_pointer item_pointer = hash_table->buckets[i];
@@ -549,9 +560,14 @@ dshash_delete_entry(dshash_table *hash_table, void *entry)
  LW_EXCLUSIVE));
 
  delete_item(hash_table, item);
- hash_table->find_locked = false;
- hash_table->find_exclusively_locked = false;
- LWLockRelease(PARTITION_LOCK(hash_table, partition));
+
+ /* We need to keep partition lock while sequential scan */
+ if (!hash_table->seqscan_running)
+ {
+ hash_table->find_locked = false;
+ hash_table->find_exclusively_locked = false;
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+ }
 }
 
 /*
@@ -568,6 +584,8 @@ dshash_release_lock(dshash_table *hash_table, void *entry)
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index),
  hash_table->find_exclusively_locked
  ? LW_EXCLUSIVE : LW_SHARED));
+ /* lock is under control of sequential scan */
+ Assert(!hash_table->seqscan_running);
 
  hash_table->find_locked = false;
  hash_table->find_exclusively_locked = false;
@@ -592,6 +610,168 @@ dshash_memhash(const void *v, size_t size, void *arg)
  return tag_hash(v, size);
 }
 
+/*
+ * dshash_seq_init/_next/_term
+ *           Sequentially scan through the dshash table and return all the
+ *           elements one by one, return NULL when no more.
+ *
+ * dshash_seq_term should be called if and only if the scan is abandoned
+ * before completion; if dshash_seq_next returns NULL then it has already done
+ * the end-of-scan cleanup.
+ *
+ * On returning element, it is locked as is the case with dshash_find.
+ * However, the caller must not release the lock. The lock is released as
+ * necessary in continued scan.
+ *
+ * As opposed to the equivalent for dynahash, the caller is not supposed to
+ * delete the returned element before continuing the scan.
+ *
+ * If consistent is set for dshash_seq_init, the whole hash table is
+ * non-exclusively locked. Otherwise a part of the hash table is locked in the
+ * same mode (partition lock).
+ */
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive)
+{
+ /* at most one scan is allowed at a time */
+ Assert(!hash_table->seqscan_running);
+
+ status->hash_table = hash_table;
+ status->curbucket = 0;
+ status->nbuckets = 0;
+ status->curitem = NULL;
+ status->pnextitem = InvalidDsaPointer;
+ status->curpartition = -1;
+ status->consistent = consistent;
+ status->exclusive = exclusive;
+ hash_table->seqscan_running = true;
+
+ /*
+ * Protect all partitions from modification if the caller wants a
+ * consistent result.
+ */
+ if (consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, i),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ }
+ ensure_valid_bucket_pointers(hash_table);
+ }
+}
+
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+ dsa_pointer next_item_pointer;
+
+ Assert(status->hash_table->seqscan_running);
+ if (status->curitem == NULL)
+ {
+ int partition;
+
+ Assert (status->curbucket == 0);
+ Assert(!status->hash_table->find_locked);
+
+ /* first shot. grab the first item. */
+ if (!status->consistent)
+ {
+ partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+ LWLockAcquire(PARTITION_LOCK(status->hash_table, partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ status->curpartition = partition;
+
+ /* resize doesn't happen from now until seq scan ends */
+ status->nbuckets =
+ NUM_BUCKETS(status->hash_table->control->size_log2);
+ ensure_valid_bucket_pointers(status->hash_table);
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+ else
+ next_item_pointer = status->pnextitem;
+
+ /* Move to the next bucket if we finished the current bucket */
+ while (!DsaPointerIsValid(next_item_pointer))
+ {
+ if (++status->curbucket >= status->nbuckets)
+ {
+ /* all buckets have been scanned; finish. */
+ dshash_seq_term(status);
+ return NULL;
+ }
+
+ /* Also move the partition lock if needed */
+ if (!status->consistent)
+ {
+ int next_partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+   status->hash_table->size_log2);
+
+ /* Move lock along with partition for the bucket */
+ if (status->curpartition != next_partition)
+ {
+ /*
+ * Take lock on the next partition then release the current,
+ * not in the reverse order. This is required to avoid
+ * resizing from happening during a sequential scan. Locks are
+ * taken in partition order so no deadlock happens with other
+ * seq scans or resizing.
+ */
+ LWLockAcquire(PARTITION_LOCK(status->hash_table,
+ next_partition),
+  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ LWLockRelease(PARTITION_LOCK(status->hash_table,
+ status->curpartition));
+ status->curpartition = next_partition;
+ }
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+
+ status->curitem =
+ dsa_get_address(status->hash_table->area, next_item_pointer);
+ status->hash_table->find_locked = true;
+ status->hash_table->find_exclusively_locked = status->exclusive;
+
+ /*
+ * This item can be deleted by the caller. Store the next item for the
+ * next iteration for the occasion.
+ */
+ status->pnextitem = status->curitem->next;
+
+ return ENTRY_FROM_ITEM(status->curitem);
+}
+
+void
+dshash_seq_term(dshash_seq_status *status)
+{
+ Assert(status->hash_table->seqscan_running);
+ status->hash_table->find_locked = false;
+ status->hash_table->find_exclusively_locked = false;
+ status->hash_table->seqscan_running = false;
+
+ if (status->consistent)
+ {
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, i));
+ }
+ else if (status->curpartition >= 0)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
+}
+
 /*
  * Print debugging information about the internal state of the hash table to
  * stderr.  The caller must hold no partition locks.
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index fa2e28ff3e..79698a6ad6 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -59,6 +59,23 @@ typedef struct dshash_parameters
 struct dshash_table_item;
 typedef struct dshash_table_item dshash_table_item;
 
+/*
+ * Sequential scan state of dshash. The detail is exposed since the storage
+ * size should be known to users but it should be considered as an opaque
+ * type by callers.
+ */
+typedef struct dshash_seq_status
+{
+ dshash_table   *hash_table;
+ int curbucket;
+ int nbuckets;
+ dshash_table_item  *curitem;
+ dsa_pointer pnextitem;
+ int curpartition;
+ bool consistent;
+ bool exclusive;
+} dshash_seq_status;
+
 /* Creating, sharing and destroying from hash tables. */
 extern dshash_table *dshash_create(dsa_area *area,
    const dshash_parameters *params,
@@ -70,7 +87,6 @@ extern dshash_table *dshash_attach(dsa_area *area,
 extern void dshash_detach(dshash_table *hash_table);
 extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table);
 extern void dshash_destroy(dshash_table *hash_table);
-
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
@@ -80,6 +96,11 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 
+/* seq scan support */
+extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool consistent, bool exclusive);
+extern void *dshash_seq_next(dshash_seq_status *status);
+extern void dshash_seq_term(dshash_seq_status *status);
 /* Convenience hash and compare functions wrapping memcmp and tag_hash. */
 extern int dshash_memcmp(const void *a, const void *b, size_t size, void *arg);
 extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg);
--
2.23.0


From 54d24757f2ddac318bb781137972321d819d22a5 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 27 Sep 2018 11:15:19 +0900
Subject: [PATCH v24 2/5] Add conditional lock feature to dshash

Dshash currently waits for lock unconditionally. This commit adds new
interfaces for dshash_find and dshash_find_or_insert. The new
interfaces have an extra parameter "nowait" taht commands not to wait
for lock.
---
 src/backend/lib/dshash.c | 69 ++++++++++++++++++++++++++++++++++++----
 src/include/lib/dshash.h |  6 ++++
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 4f0c7ec840..60a6e3c0bc 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -394,19 +394,48 @@ dshash_get_hash_table_handle(dshash_table *hash_table)
  */
 void *
 dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
+{
+ return dshash_find_extended(hash_table, key, exclusive, false, NULL);
+}
+
+/*
+ * Addition to dshash_find, returns immediately when nowait is true and lock
+ * was not acquired. Lock status is stored in *lock_acquired if non-NULL.
+ */
+void *
+dshash_find_extended(dshash_table *hash_table, const void *key,
+ bool exclusive, bool nowait, bool *lock_acquired)
 {
  dshash_hash hash;
  size_t partition;
  dshash_table_item *item;
 
+ /* passing lock_acquired without nowait is just not sensible */
+ Assert(nowait || !lock_acquired);
+
  hash = hash_key(hash_table, key);
  partition = PARTITION_FOR_HASH(hash);
 
  Assert(hash_table->control->magic == DSHASH_MAGIC);
  Assert(!hash_table->find_locked);
 
- LWLockAcquire(PARTITION_LOCK(hash_table, partition),
-  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED))
+ {
+ if (lock_acquired)
+ *lock_acquired = false;
+ return NULL;
+ }
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+
+ if (lock_acquired)
+ *lock_acquired = true;
+
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -441,6 +470,22 @@ void *
 dshash_find_or_insert(dshash_table *hash_table,
   const void *key,
   bool *found)
+{
+ return dshash_find_or_insert_extended(hash_table, key, found, false);
+}
+
+/*
+ * Addition to dshash_find_or_insert, returns NULL if nowait is true and lock
+ * was not acquired.
+ *
+ * Notes above dshash_find_extended() regarding locking and error handling
+ * equally apply here.
+ */
+void *
+dshash_find_or_insert_extended(dshash_table *hash_table,
+   const void *key,
+   bool *found,
+   bool nowait)
 {
  dshash_hash hash;
  size_t partition_index;
@@ -455,8 +500,16 @@ dshash_find_or_insert(dshash_table *hash_table,
  Assert(!hash_table->find_locked);
 
 restart:
- LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
-  LW_EXCLUSIVE);
+ if (nowait)
+ {
+ if (!LWLockConditionalAcquire(
+ PARTITION_LOCK(hash_table, partition_index),
+ LW_EXCLUSIVE))
+ return NULL;
+ }
+ else
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
+  LW_EXCLUSIVE);
  ensure_valid_bucket_pointers(hash_table);
 
  /* Search the active bucket. */
@@ -626,9 +679,11 @@ dshash_memhash(const void *v, size_t size, void *arg)
  * As opposed to the equivalent for dynahash, the caller is not supposed to
  * delete the returned element before continuing the scan.
  *
- * If consistent is set for dshash_seq_init, the whole hash table is
- * non-exclusively locked. Otherwise a part of the hash table is locked in the
- * same mode (partition lock).
+ * If consistent is set for dshash_seq_init, all the hash table
+ * partitions are locked in the requested mode (as determined by the
+ * exclusive flag), and the locks are held until the end of the scan.
+ * Otherwise the partition locks are acquired and released as needed
+ * during the scan (up to two partitions may be locked at the same time).
  */
 void
 dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 79698a6ad6..67f7d77f71 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -90,8 +90,14 @@ extern void dshash_destroy(dshash_table *hash_table);
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
  const void *key, bool exclusive);
+extern void *dshash_find_extended(dshash_table *hash_table, const void *key,
+  bool exclusive, bool nowait,
+  bool *lock_acquired);
 extern void *dshash_find_or_insert(dshash_table *hash_table,
    const void *key, bool *found);
+extern void *dshash_find_or_insert_extended(dshash_table *hash_table,
+ const void *key, bool *found,
+ bool nowait);
 extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
--
2.23.0


From c4b747064b46f18de7d212a41210952ea27e3c5c Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Wed, 7 Nov 2018 16:53:49 +0900
Subject: [PATCH v24 3/5] Make archiver process an auxiliary process

This is a preliminary patch for shared-memory based stats collector.
Archiver process must be a auxiliary process since it uses shared
memory after stats data wes moved onto shared-memory. Make the process
an auxiliary process in order to make it work.
---
 src/backend/bootstrap/bootstrap.c   |  8 +++
 src/backend/postmaster/pgarch.c     | 98 +++++++----------------------
 src/backend/postmaster/pgstat.c     |  6 ++
 src/backend/postmaster/postmaster.c | 35 ++++++++---
 src/include/miscadmin.h             |  2 +
 src/include/pgstat.h                |  1 +
 src/include/postmaster/pgarch.h     |  4 +-
 7 files changed, 67 insertions(+), 87 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 8ea033610d..6e38f9a3d2 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -329,6 +329,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
  case BgWriterProcess:
  statmsg = pgstat_get_backend_desc(B_BG_WRITER);
  break;
+ case ArchiverProcess:
+ statmsg = pgstat_get_backend_desc(B_ARCHIVER);
+ break;
  case CheckpointerProcess:
  statmsg = pgstat_get_backend_desc(B_CHECKPOINTER);
  break;
@@ -456,6 +459,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
  BackgroundWriterMain();
  proc_exit(1); /* should never return */
 
+ case ArchiverProcess:
+ /* don't set signals, archiver has its own agenda */
+ PgArchiverMain();
+ proc_exit(1); /* should never return */
+
  case CheckpointerProcess:
  /* don't set signals, checkpointer has its own agenda */
  CheckpointerMain();
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index f84f882c4c..4342ebdab4 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -77,7 +77,6 @@
  * Local data
  * ----------
  */
-static time_t last_pgarch_start_time;
 static time_t last_sigterm_time = 0;
 
 /*
@@ -96,7 +95,6 @@ static volatile sig_atomic_t ready_to_stop = false;
 static pid_t pgarch_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
 static void pgarch_exit(SIGNAL_ARGS);
 static void ArchSigHupHandler(SIGNAL_ARGS);
 static void ArchSigTermHandler(SIGNAL_ARGS);
@@ -114,75 +112,6 @@ static void pgarch_archiveDone(char *xlog);
  * ------------------------------------------------------------
  */
 
-/*
- * pgarch_start
- *
- * Called from postmaster at startup or after an existing archiver
- * died.  Attempt to fire up a fresh archiver process.
- *
- * Returns PID of child process, or 0 if fail.
- *
- * Note: if fail, we will be called again from the postmaster main loop.
- */
-int
-pgarch_start(void)
-{
- time_t curtime;
- pid_t pgArchPid;
-
- /*
- * Do nothing if no archiver needed
- */
- if (!XLogArchivingActive())
- return 0;
-
- /*
- * Do nothing if too soon since last archiver start.  This is a safety
- * valve to protect against continuous respawn attempts if the archiver is
- * dying immediately at launch. Note that since we will be re-called from
- * the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgarch_start_time) <
- (unsigned int) PGARCH_RESTART_INTERVAL)
- return 0;
- last_pgarch_start_time = curtime;
-
-#ifdef EXEC_BACKEND
- switch ((pgArchPid = pgarch_forkexec()))
-#else
- switch ((pgArchPid = fork_process()))
-#endif
- {
- case -1:
- ereport(LOG,
- (errmsg("could not fork archiver: %m")));
- return 0;
-
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
-
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
-
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
-
- PgArchiverMain(0, NULL);
- break;
-#endif
-
- default:
- return (int) pgArchPid;
- }
-
- /* shouldn't get here */
- return 0;
-}
-
 /* ------------------------------------------------------------
  * Local functions called by archiver follow
  * ------------------------------------------------------------
@@ -222,8 +151,8 @@ pgarch_forkexec(void)
  * The argc/argv parameters are valid only in EXEC_BACKEND case.  However,
  * since we don't use 'em, it hardly matters...
  */
-NON_EXEC_STATIC void
-PgArchiverMain(int argc, char *argv[])
+void
+PgArchiverMain(void)
 {
  /*
  * Ignore all signals usually bound to some action in the postmaster,
@@ -255,8 +184,27 @@ PgArchiverMain(int argc, char *argv[])
 static void
 pgarch_exit(SIGNAL_ARGS)
 {
- /* SIGQUIT means curl up and die ... */
- exit(1);
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * We DO NOT want to run proc_exit() callbacks -- we're here because
+ * shared memory may be corrupted, so we don't want to try to clean up our
+ * transaction.  Just nail the windows shut and get out of town.  Now that
+ * there's an atexit callback to prevent third-party code from breaking
+ * things by calling exit() directly, we have to reset the callbacks
+ * explicitly to make this work as intended.
+ */
+ on_exit_reset();
+
+ /*
+ * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+ * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+ * backend.  This is necessary precisely because we don't clean up our
+ * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+ * should ensure the postmaster sees this as a crash, too, but no harm in
+ * being doubly sure.)
+ */
+ exit(2);
 }
 
 /* SIGHUP signal handler for archiver process */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index fabcf31de8..8299d2a435 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -2932,6 +2932,9 @@ pgstat_bestart(void)
  case StartupProcess:
  lbeentry.st_backendType = B_STARTUP;
  break;
+ case ArchiverProcess:
+ lbeentry.st_backendType = B_ARCHIVER;
+ break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
  break;
@@ -4275,6 +4278,9 @@ pgstat_get_backend_desc(BackendType backendType)
 
  switch (backendType)
  {
+ case B_ARCHIVER:
+ backendDesc = "archiver";
+ break;
  case B_AUTOVAC_LAUNCHER:
  backendDesc = "autovacuum launcher";
  break;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 9ff2832c00..84fda38249 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -146,7 +146,8 @@
 #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */
 #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */
 #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */
-#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */
+#define BACKEND_TYPE_ARCHIVER 0x0010 /* archiver process */
+#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */
 
 #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER)
 
@@ -539,6 +540,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #endif /* EXEC_BACKEND */
 
 #define StartupDataBase() StartChildProcess(StartupProcess)
+#define StartArchiver() StartChildProcess(ArchiverProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
 #define StartCheckpointer() StartChildProcess(CheckpointerProcess)
 #define StartWalWriter() StartChildProcess(WalWriterProcess)
@@ -1785,7 +1787,7 @@ ServerLoop(void)
 
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /* If we need to signal the autovacuum launcher, do so now */
  if (avlauncher_needs_signal)
@@ -3042,7 +3044,7 @@ reaper(SIGNAL_ARGS)
  if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
  if (PgStatPID == 0)
  PgStatPID = pgstat_start();
 
@@ -3187,10 +3189,8 @@ reaper(SIGNAL_ARGS)
  {
  PgArchPID = 0;
  if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("archiver process"),
- pid, exitstatus);
- if (PgArchStartupAllowed())
- PgArchPID = pgarch_start();
+ HandleChildCrash(pid, exitstatus,
+ _("archiver process"));
  continue;
  }
 
@@ -3438,7 +3438,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, archiver or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -3643,6 +3643,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
  }
 
+ /* Take care of the archiver too */
+ if (pid == PgArchPID)
+ PgArchPID = 0;
+ else if (PgArchPID != 0 && take_action)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) PgArchPID)));
+ signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
  /*
  * Force a power-cycle of the pgarch process too.  (This isn't absolutely
  * necessary, but it seems like a good idea for robustness, and it
@@ -3915,6 +3927,7 @@ PostmasterStateMachine(void)
  Assert(CheckpointerPID == 0);
  Assert(WalWriterPID == 0);
  Assert(AutoVacPID == 0);
+ Assert(PgArchPID == 0);
  /* syslogger is not considered here */
  pmState = PM_NO_CHILDREN;
  }
@@ -5190,7 +5203,7 @@ sigusr1_handler(SIGNAL_ARGS)
  */
  Assert(PgArchPID == 0);
  if (XLogArchivingAlways())
- PgArchPID = pgarch_start();
+ PgArchPID = StartArchiver();
 
  /*
  * If we aren't planning to enter hot standby mode later, treat
@@ -5475,6 +5488,10 @@ StartChildProcess(AuxProcType type)
  ereport(LOG,
  (errmsg("could not fork startup process: %m")));
  break;
+ case ArchiverProcess:
+ ereport(LOG,
+ (errmsg("could not fork archiver process: %m")));
+ break;
  case BgWriterProcess:
  ereport(LOG,
  (errmsg("could not fork background writer process: %m")));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index bc6e03fbc7..1f4db67f3f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -399,6 +399,7 @@ typedef enum
  BootstrapProcess,
  StartupProcess,
  BgWriterProcess,
+ ArchiverProcess,
  CheckpointerProcess,
  WalWriterProcess,
  WalReceiverProcess,
@@ -411,6 +412,7 @@ extern AuxProcType MyAuxProcType;
 #define AmBootstrapProcess() (MyAuxProcType == BootstrapProcess)
 #define AmStartupProcess() (MyAuxProcType == StartupProcess)
 #define AmBackgroundWriterProcess() (MyAuxProcType == BgWriterProcess)
+#define AmArchiverProcess() (MyAuxProcType == ArchiverProcess)
 #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess)
 #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d823d..65713abc2b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -718,6 +718,7 @@ typedef struct PgStat_GlobalStats
  */
 typedef enum BackendType
 {
+ B_ARCHIVER,
  B_AUTOVAC_LAUNCHER,
  B_AUTOVAC_WORKER,
  B_BACKEND,
diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h
index 2474eac26a..88f16863d4 100644
--- a/src/include/postmaster/pgarch.h
+++ b/src/include/postmaster/pgarch.h
@@ -32,8 +32,6 @@
  */
 extern int pgarch_start(void);
 
-#ifdef EXEC_BACKEND
-extern void PgArchiverMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
+extern void PgArchiverMain(void) pg_attribute_noreturn();
 
 #endif /* _PGARCH_H */
--
2.23.0


From ecea667e9180e80f9f860326056d6180696c04bd Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Thu, 21 Feb 2019 12:44:56 +0900
Subject: [PATCH v24 4/5] Shared-memory based stats collector

Previously activity statistics is shared via files on disk. Every
backend sends the numbers to the stats collector process via a socket.
It makes snapshots as a set of files on disk with a certain interval
then every backend reads them as necessary. It worked fine for
comparatively small set of statistics but the set is under the
pressure to growing up and the file size has reached the order of
megabytes. To deal with larger statistics set, this patch let backends
directly share the statistics via shared memory.
---
 doc/src/sgml/monitoring.sgml                 |    6 +-
 src/backend/postmaster/autovacuum.c          |   12 +-
 src/backend/postmaster/pgstat.c              | 4652 ++++++++----------
 src/backend/postmaster/postmaster.c          |   85 +-
 src/backend/storage/ipc/ipci.c               |    2 +
 src/backend/storage/lmgr/lwlock.c            |    1 +
 src/backend/tcop/postgres.c                  |   26 +-
 src/backend/utils/init/globals.c             |    1 +
 src/backend/utils/init/postinit.c            |   11 +
 src/bin/pg_basebackup/t/010_pg_basebackup.pl |    4 +-
 src/include/miscadmin.h                      |    1 +
 src/include/pgstat.h                         |  441 +-
 src/include/storage/lwlock.h                 |    1 +
 src/include/utils/timeout.h                  |    1 +
 14 files changed, 2132 insertions(+), 3112 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index a3c5f86b7e..eb94dec119 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -53,7 +53,6 @@ postgres  15554  0.0  0.0  57536  1184 ?        Ss   18:02   0:00 postgres: back
 postgres  15555  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: checkpointer
 postgres  15556  0.0  0.0  57536   916 ?        Ss   18:02   0:00 postgres: walwriter
 postgres  15557  0.0  0.0  58504  2244 ?        Ss   18:02   0:00 postgres: autovacuum launcher
-postgres  15558  0.0  0.0  17512  1068 ?        Ss   18:02   0:00 postgres: stats collector
 postgres  15582  0.0  0.0  58772  3080 ?        Ss   18:04   0:00 postgres: joe runbug 127.0.0.1 idle
 postgres  15606  0.0  0.0  58772  3052 ?        Ss   18:07   0:00 postgres: tgl regression [local] SELECT waiting
 postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl regression [local] idle in transaction
@@ -65,9 +64,8 @@ postgres  15610  0.0  0.0  58772  3056 ?        Ss   18:07   0:00 postgres: tgl
    master server process.  The command arguments
    shown for it are the same ones used when it was launched.  The next five
    processes are background worker processes automatically launched by the
-   master process.  (The <quote>stats collector</quote> process will not be present
-   if you have set the system not to start the statistics collector; likewise
-   the <quote>autovacuum launcher</quote> process can be disabled.)
+   master process.  (The <quote>autovacuum launcher</quote> process will not
+   be present if you have set the system not to start it.)
    Each of the remaining
    processes is a server process handling one client connection.  Each such
    process sets its command line display in the form
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index c1dd8168ca..e07c7cb311 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -1961,15 +1961,15 @@ do_autovacuum(void)
   ALLOCSET_DEFAULT_SIZES);
  MemoryContextSwitchTo(AutovacMemCxt);
 
+ /* Start a transaction so our commands have one to play into. */
+ StartTransactionCommand();
+
  /*
  * may be NULL if we couldn't find an entry (only happens if we are
  * forcing a vacuum for anti-wrap purposes).
  */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
 
- /* Start a transaction so our commands have one to play into. */
- StartTransactionCommand();
-
  /*
  * Clean up any dead statistics collector entries for this DB. We always
  * want to do this exactly once per DB-processing cycle, even if we find
@@ -2752,12 +2752,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
  if (isshared)
  {
  if (PointerIsValid(shared))
- tabentry = hash_search(shared->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(shared, relid);
  }
  else if (PointerIsValid(dbentry))
- tabentry = hash_search(dbentry->tables, &relid,
-   HASH_FIND, NULL);
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
 
  return tabentry;
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 8299d2a435..bcf8c6f371 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1,15 +1,23 @@
 /* ----------
  * pgstat.c
  *
- * All the statistics collector stuff hacked up in one big, ugly file.
+ * Statistics collector facility.
  *
- * TODO: - Separate collector, postmaster and backend stuff
- *  into different files.
+ *  Collects per-table and per-function usage statistics of all backends on
+ *  shared memory. The pg_count_*() and friends interfaces store activity of
+ *  every backend during a transaction. Then pgstat_flush_stat() is called at
+ *  the end of a transaction to flush out the local numbers to shared memory.
  *
- * - Add some automatic call for pgstat vacuuming.
+ *  To avoid congestion on the shared memory, we update shared stats no more
+ *  often than intervals of PGSTAT_STAT_MIN_INTERVAL(500ms). Still it is
+ *  possible that a backend cannot flush all or a part of local numbers
+ *  immediately, we postpone updates and try the next chance after the
+ *  interval of PGSTAT_STAT_RETRY_INTERVAL(100ms), but they are not kept
+ *  longer than PGSTAT_STAT_MAX_INTERVAL(1000ms).
  *
- * - Add a pgstat config column to pg_database, so this
- *  entire thing can be enabled/disabled on a per db basis.
+ *  The first process that uses the stats collector creates the area and loads
+ *  the stored stats file if any, and the last process at shutdown writes the
+ *  shared stats to the file and then destroys the area before exiting.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -19,18 +27,6 @@
 #include "postgres.h"
 
 #include <unistd.h>
-#include <fcntl.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/socket.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <signal.h>
-#include <time.h>
-#ifdef HAVE_SYS_SELECT_H
-#include <sys/select.h>
-#endif
 
 #include "access/heapam.h"
 #include "access/htup_details.h"
@@ -40,66 +36,39 @@
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_proc.h"
-#include "common/ip.h"
 #include "libpq/libpq.h"
-#include "libpq/pqsignal.h"
-#include "mb/pg_wchar.h"
 #include "miscadmin.h"
-#include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
-#include "postmaster/fork_process.h"
-#include "postmaster/postmaster.h"
 #include "replication/walsender.h"
-#include "storage/backendid.h"
-#include "storage/dsm.h"
-#include "storage/fd.h"
 #include "storage/ipc.h"
-#include "storage/latch.h"
 #include "storage/lmgr.h"
-#include "storage/pg_shmem.h"
+#include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/sinvaladt.h"
 #include "utils/ascii.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
-#include "utils/ps_status.h"
-#include "utils/rel.h"
+#include "utils/probes.h"
 #include "utils/snapmgr.h"
-#include "utils/timestamp.h"
 
 /* ----------
  * Timer definitions.
  * ----------
  */
-#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
- * updates; in milliseconds. */
-
-#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a
- * new file; in milliseconds. */
-
-#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats
- * file update; in milliseconds. */
-
-#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a
- * new file; in milliseconds. */
+#define PGSTAT_STAT_MIN_INTERVAL 500 /* Minimum time between stats data
+ * updates; in milliseconds. */
 
-#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a
- * failed statistics collector; in
- * seconds. */
-
-#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
-#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
-
-/* Minimum receive buffer size for the collector's socket. */
-#define PGSTAT_MIN_RCVBUF (100 * 1024)
+#define PGSTAT_STAT_RETRY_INTERVAL 100 /* Retry interval after
+ * PGSTAT_STAT_MIN_INTERVAL has elapsed */
 
+#define PGSTAT_STAT_MAX_INTERVAL   1000 /* Maximum time between stats data
+ * updates; in milliseconds. */
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
-#define PGSTAT_DB_HASH_SIZE 16
 #define PGSTAT_TAB_HASH_SIZE 512
 #define PGSTAT_FUNCTION_HASH_SIZE 512
 
@@ -115,6 +84,19 @@
  */
 #define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
 
+/*
+ * Operation mode and return code of pgstat_get_db_entry.
+ */
+#define PGSTAT_SHARED 0
+#define PGSTAT_EXCLUSIVE 1
+#define PGSTAT_NOWAIT 2
+
+typedef enum PgStat_TableLookupResult
+{
+ NOT_FOUND,
+ FOUND,
+ LOCK_FAILED
+} PgStat_TableLookupResult;
 
 /* ----------
  * GUC parameters
@@ -130,31 +112,63 @@ int pgstat_track_activity_query_size = 1024;
  * ----------
  */
 char   *pgstat_stat_directory = NULL;
+
+/* No longer used, but will be removed with GUC */
 char   *pgstat_stat_filename = NULL;
 char   *pgstat_stat_tmpname = NULL;
 
-/*
- * BgWriter global statistics counters (unused in other processes).
- * Stored directly in a stats message structure so it can be sent
- * without needing to copy things around.  We assume this inits to zeroes.
- */
-PgStat_MsgBgWriter BgWriterStats;
-
-/* ----------
- * Local data
- * ----------
- */
-NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET;
-
-static struct sockaddr_storage pgStatAddr;
+#define StatsLock (&StatsShmem->StatsMainLock)
 
-static time_t last_pgstat_start_time;
+/* Shared stats bootstrap information */
+typedef struct StatsShmemStruct
+{
+ LWLock StatsMainLock; /* lock protecting this struct */
+ dsa_handle stats_dsa_handle; /* DSA handle for stats collector */
+ dshash_table_handle db_hash_handle;
+ dsa_pointer global_stats;
+ dsa_pointer archiver_stats;
+ int refcount;
+} StatsShmemStruct;
 
-static bool pgStatRunningInCollector = false;
+/*
+ * BgWriter global statistics counters.  The name is a remnant from the time
+ * when the stats collector was a dedicated process and these counters were
+ * sent to it over a socket.
+ */
+PgStat_MsgBgWriter BgWriterStats = {0};
+
+/* Variables that live for the backend lifetime */
+static StatsShmemStruct * StatsShmem = NULL;
+static dsa_area *area = NULL;
+static dshash_table *pgStatDBHash = NULL;
+
+
+/* parameter for each type of shared hash */
+static const dshash_parameters dsh_dbparams = {
+ sizeof(Oid),
+ SHARED_DBENT_SIZE,
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_tblparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatTabEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
+static const dshash_parameters dsh_funcparams = {
+ sizeof(Oid),
+ sizeof(PgStat_StatFuncEntry),
+ dshash_memcmp,
+ dshash_memhash,
+ LWTRANCHE_STATS
+};
 
 /*
  * Structures in which backends store per-table info that's waiting to be
- * sent to the collector.
+ * written to shared memory.
  *
  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
  * for the life of the backend.  Also, we zero out the t_id fields of the
@@ -189,8 +203,8 @@ typedef struct TabStatHashEntry
 static HTAB *pgStatTabHash = NULL;
 
 /*
- * Backends store per-function info that's waiting to be sent to the collector
- * in this hash table (indexed by function OID).
+ * Backends store per-function info that's waiting to be flushed out to shared
+ * memory in this hash table (indexed by function OID).
  */
 static HTAB *pgStatFunctions = NULL;
 
@@ -200,6 +214,68 @@ static HTAB *pgStatFunctions = NULL;
  */
 static bool have_function_stats = false;
 
+/* common header of snapshot entry in backend snapshot hash */
+typedef struct PgStat_snapshot
+{
+ Oid key;
+ bool negative;
+ void   *body; /* end of header part: to keep alignment */
+} PgStat_snapshot;
+
+/* context struct for snapshot_statentry */
+typedef struct pgstat_snapshot_param
+{
+ char   *hash_name; /* name of the snapshot hash */
+ int hash_entsize; /* element size of hash entry */
+ dshash_table_handle dsh_handle; /* dsh handle to attach */
+ const dshash_parameters *dsh_params;/* dshash params */
+ HTAB  **hash; /* points to variable to hold hash */
+ dshash_table  **dshash; /* ditto for dshash */
+} pgstat_snapshot_param;
+
+/*
+ * Backends store various database-wide info that's waiting to be flushed out
+ * to shared memory in these variables.
+ *
+ * checksum_failures is the exception in that it is cluster-wide.
+ */
+typedef struct BackendDBStats
+{
+ int n_conflict_tablespace;
+ int n_conflict_lock;
+ int n_conflict_snapshot;
+ int n_conflict_bufferpin;
+ int n_conflict_startup_deadlock;
+ int n_deadlocks;
+ size_t n_tmpfiles;
+ size_t tmpfilesize;
+ HTAB *checksum_failures;
+} BackendDBStats;
+
+/* Hash entry struct for checksum_failures above */
+typedef struct ChecksumFailureEnt
+{
+ Oid dboid;
+ int count;
+} ChecksumFailureEnt;
+
+static BackendDBStats BeDBStats = {0};
+
+/* macros to check BeDBStats at once */
+#define HAVE_PENDING_CONFLICTS() \
+ (BeDBStats.n_conflict_tablespace > 0 || \
+ BeDBStats.n_conflict_lock > 0 || \
+ BeDBStats.n_conflict_bufferpin > 0 || \
+ BeDBStats.n_conflict_startup_deadlock > 0)
+
+#define HAVE_PENDING_DBSTATS() \
+ (HAVE_PENDING_CONFLICTS() || \
+ BeDBStats.n_deadlocks > 0 || \
+ BeDBStats.n_tmpfiles > 0 || \
+ /* no need to check tmpfilesize */ \
+ BeDBStats.checksum_failures != NULL)
+
+
 /*
  * Tuple insertion/deletion counts for an open transaction can't be propagated
  * into PgStat_TableStatus counters until we know if it is going to commit
@@ -235,11 +311,11 @@ typedef struct TwoPhasePgStatRecord
  bool t_truncated; /* was the relation truncated? */
 } TwoPhasePgStatRecord;
 
-/*
- * Info about current "snapshot" of stats file
- */
+/* Variables for backend status snapshot */
 static MemoryContext pgStatLocalContext = NULL;
-static HTAB *pgStatDBHash = NULL;
+static MemoryContext pgStatSnapshotContext = NULL;
+static HTAB *pgStatLocalHash = NULL;
+static bool clear_snapshot = false;
 
 /* Status for backends including auxiliary */
 static LocalPgBackendStatus *localBackendStatusTable = NULL;
@@ -248,23 +324,35 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL;
 static int localNumBackends = 0;
 
 /*
- * Cluster wide statistics, kept in the stats collector.
- * Contains statistics that are not collected per database
- * or per table.
+ * Struct for context for pgstat_flush_* functions
+ *
+ * To avoid repeated attach/detach of the same dshash, a dshash once attached
+ * is stored in this structure and carried around across multiple calls and
+ * functions.  "generation" here means the value returned by pin_hashes().
  */
-static PgStat_ArchiverStats archiverStats;
-static PgStat_GlobalStats globalStats;
+typedef struct pgstat_flush_stat_context
+{
+ int shgeneration; /* "generation" of shdb_tabhash below */
+ PgStat_StatDBEntry *shdbentry; /* dbentry for shared tables (oid = 0) */
+ dshash_table *shdb_tabhash; /* tabentry dshash of shared tables */
+
+ int mygeneration; /* "generation" of mydb_tabhash below */
+ PgStat_StatDBEntry *mydbentry; /* dbentry for my database */
+ dshash_table *mydb_tabhash; /* tabentry dshash of my database */
+} pgstat_flush_stat_context;
 
 /*
- * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
- * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
- * will write both that DB's data and the shared stats.
+ * Cluster wide statistics.
+ *
+ * Contains statistics that are not collected on a per-database or per-table
+ * basis.  shared_* point into shared memory and snapshot_* are backend-local
+ * snapshots.  Their validity is indicated by global_snapshot_is_valid.
  */
-static List *pending_write_requests = NIL;
-
-/* Signal handler flags */
-static volatile bool need_exit = false;
-static volatile bool got_SIGHUP = false;
+static bool global_snapshot_is_valid = false;
+static PgStat_ArchiverStats *shared_archiverStats;
+static PgStat_ArchiverStats snapshot_archiverStats;
+static PgStat_GlobalStats *shared_globalStats;
+static PgStat_GlobalStats snapshot_globalStats;
 
 /*
  * Total time charged to functions so far in the current backend.
@@ -278,35 +366,41 @@ static instr_time total_func_time;
  * Local function forward declarations
  * ----------
  */
-#ifdef EXEC_BACKEND
-static pid_t pgstat_forkexec(void);
-#endif
 
-NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-static void pgstat_exit(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
-static void pgstat_sighup_handler(SIGNAL_ARGS);
-
-static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
+static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, int op,
+ PgStat_TableLookupResult *status);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table,
  Oid tableoid, bool create);
-static void pgstat_write_statsfiles(bool permanent, bool allDbs);
-static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
-static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
-static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
-static void backend_read_statsfile(void);
+static void pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
+static void pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry);
 static void pgstat_read_current_status(void);
-
-static bool pgstat_write_statsfile_needed(void);
-static bool pgstat_db_requested(Oid databaseid);
-
-static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
-static void pgstat_send_funcstats(void);
+static bool pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry);
+static bool pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait);
+static bool pgstat_update_tabentry(dshash_table *tabhash,
+   PgStat_TableStatus *stat, bool nowait);
+static void pgstat_update_dbentry(PgStat_StatDBEntry *dbentry,
+  PgStat_TableStatus *stat);
 static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid);
 
+static void pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab);
 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
 
 static void pgstat_setup_memcxt(void);
+static void pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry);
+static void pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry);
+static HTAB *create_tabstat_hash(void);
+static PgStat_SubXactStatus *get_tabstat_stack_level(int nest_level);
+static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level);
+static PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid);
+static void pgstat_snapshot_global_stats(void);
 
 static const char *pgstat_get_wait_activity(WaitEventActivity w);
 static const char *pgstat_get_wait_client(WaitEventClient w);
@@ -314,557 +408,491 @@ static const char *pgstat_get_wait_ipc(WaitEventIPC w);
 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
 static const char *pgstat_get_wait_io(WaitEventIO w);
 
-static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
-static void pgstat_send(void *msg, int len);
-
-static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
-static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
-static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
-static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
-static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
-static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len);
-static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len);
-static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
-static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
-static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
-static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
-static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
-static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
-static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
-static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
-static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
-static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
-static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+/* ------------------------------------------------------------
+ * Local support functions follow
+ * ------------------------------------------------------------
+ */
+static int pin_hashes(PgStat_StatDBEntry *dbentry);
+static void unpin_hashes(PgStat_StatDBEntry *dbentry, int generation);
+static dshash_table *attach_table_hash(PgStat_StatDBEntry *dbent, int gen);
+static dshash_table *attach_function_hash(PgStat_StatDBEntry *dbent, int gen);
+static void reset_dbentry_counters(PgStat_StatDBEntry *dbentry);
 
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
  */
 
-/* ----------
- * pgstat_init() -
- *
- * Called from postmaster at startup. Create the resources required
- * by the statistics collector process.  If unable to do so, do not
- * fail --- better to let the postmaster start with stats collection
- * disabled.
- * ----------
+/*
+ * StatsShmemSize
+ * Compute space needed for stats collector's shared memory
+ */
+Size
+StatsShmemSize(void)
+{
+ return sizeof(StatsShmemStruct);
+}
+
+/*
+ * StatsShmemInit - initialize during shared-memory creation
  */
 void
-pgstat_init(void)
+StatsShmemInit(void)
 {
- ACCEPT_TYPE_ARG3 alen;
- struct addrinfo *addrs = NULL,
-   *addr,
- hints;
- int ret;
- fd_set rset;
- struct timeval tv;
- char test_byte;
- int sel_res;
- int tries = 0;
-
-#define TESTBYTEVAL ((char) 199)
+ bool found;
 
- /*
- * This static assertion verifies that we didn't mess up the calculations
- * involved in selecting maximum payload sizes for our UDP messages.
- * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would
- * be silent performance loss from fragmentation, it seems worth having a
- * compile-time cross-check that we didn't.
- */
- StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE,
- "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE");
+ StatsShmem = (StatsShmemStruct *)
+ ShmemInitStruct("Stats area", StatsShmemSize(),
+ &found);
 
- /*
- * Create the UDP socket for sending and receiving statistic messages
- */
- hints.ai_flags = AI_PASSIVE;
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_DGRAM;
- hints.ai_protocol = 0;
- hints.ai_addrlen = 0;
- hints.ai_addr = NULL;
- hints.ai_canonname = NULL;
- hints.ai_next = NULL;
- ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
- if (ret || !addrs)
+ if (!IsUnderPostmaster)
  {
- ereport(LOG,
- (errmsg("could not resolve \"localhost\": %s",
- gai_strerror(ret))));
- goto startup_failed;
- }
+ Assert(!found);
 
- /*
- * On some platforms, pg_getaddrinfo_all() may return multiple addresses
- * only one of which will actually work (eg, both IPv6 and IPv4 addresses
- * when kernel will reject IPv6).  Worse, the failure may occur at the
- * bind() or perhaps even connect() stage.  So we must loop through the
- * results till we find a working combination. We will generate LOG
- * messages, but no error, for bogus combinations.
- */
- for (addr = addrs; addr; addr = addr->ai_next)
- {
-#ifdef HAVE_UNIX_SOCKETS
- /* Ignore AF_UNIX sockets, if any are returned. */
- if (addr->ai_family == AF_UNIX)
- continue;
-#endif
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+ }
 
- if (++tries > 1)
- ereport(LOG,
- (errmsg("trying another address for the statistics collector")));
+ LWLockInitialize(StatsLock, LWTRANCHE_STATS);
+}
 
- /*
- * Create the socket.
- */
- if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not create socket for statistics collector: %m")));
- continue;
- }
+/* ----------
+ * pgstat_attach_shared_stats() -
+ *
+ * Attach shared or create stats memory.
+ * ---------
+ */
+static void
+pgstat_attach_shared_stats(void)
+{
+ MemoryContext oldcontext;
 
- /*
- * Bind it to a kernel assigned port on localhost and get the assigned
- * port via getsockname().
- */
- if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not bind socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ /*
+ * Don't use dsm in the postmaster itself, nor when not tracking counts.
+ */
+ if (!pgstat_track_counts || !IsUnderPostmaster)
+ return;
 
- alen = sizeof(pgStatAddr);
- if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not get address of socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ pgstat_setup_memcxt();
 
- /*
- * Connect the socket to its own address.  This saves a few cycles by
- * not having to respecify the target address on every send. This also
- * provides a kernel-level check that only packets from this same
- * address will be received.
- */
- if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not connect socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ if (area)
+ return;
 
- /*
- * Try to send and receive a one-byte test message on the socket. This
- * is to catch situations where the socket can be created but will not
- * actually pass data (for instance, because kernel packet filtering
- * rules prevent it).
- */
- test_byte = TESTBYTEVAL;
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 
-retry1:
- if (send(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry1; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not send test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
 
- /*
- * There could possibly be a little delay before the message can be
- * received.  We arbitrarily allow up to half a second before deciding
- * it's broken.
- */
- for (;;) /* need a loop to handle EINTR */
- {
- FD_ZERO(&rset);
- FD_SET(pgStatSock, &rset);
+ if (StatsShmem->refcount > 0)
+ StatsShmem->refcount++;
+ else
+ {
+ /* Need to create shared memory area and load saved stats if any. */
+ Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID);
 
- tv.tv_sec = 0;
- tv.tv_usec = 500000;
- sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
- if (sel_res >= 0 || errno != EINTR)
- break;
- }
- if (sel_res < 0)
- {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("select() failed in statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
- if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
- {
- /*
- * This is the case we actually think is likely, so take pains to
- * give a specific message for it.
- *
- * errno will not be set meaningfully here, so don't use it.
- */
- ereport(LOG,
- (errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("test message did not get through on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ /* Initialize shared memory area */
+ area = dsa_create(LWTRANCHE_STATS);
+ pgStatDBHash = dshash_create(area, &dsh_dbparams, 0);
 
- test_byte++; /* just make sure variable is changed */
+ StatsShmem->stats_dsa_handle = dsa_get_handle(area);
+ StatsShmem->global_stats =
+ dsa_allocate0(area, sizeof(PgStat_GlobalStats));
+ StatsShmem->archiver_stats =
+ dsa_allocate0(area, sizeof(PgStat_ArchiverStats));
+ StatsShmem->db_hash_handle = dshash_get_hash_table_handle(pgStatDBHash);
 
-retry2:
- if (recv(pgStatSock, &test_byte, 1, 0) != 1)
- {
- if (errno == EINTR)
- goto retry2; /* if interrupted, just retry */
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not receive test message on socket for statistics collector: %m")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
 
- if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
- {
- ereport(LOG,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("incorrect test message transmission on socket for statistics collector")));
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
- continue;
- }
+ /* Load saved data if any. */
+ pgstat_read_statsfiles();
 
- /* If we get here, we have a working socket */
- break;
+ StatsShmem->refcount = 1;
  }
 
- /* Did we find a working address? */
- if (!addr || pgStatSock == PGINVALID_SOCKET)
- goto startup_failed;
+ LWLockRelease(StatsLock);
 
  /*
- * Set the socket to non-blocking IO.  This ensures that if the collector
- * falls behind, statistics messages will be discarded; backends won't
- * block waiting to send messages to the collector.
+ * If we're not the first process, attach existing shared stats area
+ * outside StatsLock.
  */
- if (!pg_set_noblock(pgStatSock))
+ if (!area)
  {
- ereport(LOG,
- (errcode_for_socket_access(),
- errmsg("could not set statistics collector socket to nonblocking mode: %m")));
- goto startup_failed;
+ /* Shared area already exists. Just attach it. */
+ area = dsa_attach(StatsShmem->stats_dsa_handle);
+ pgStatDBHash = dshash_attach(area, &dsh_dbparams,
+ StatsShmem->db_hash_handle, 0);
+
+ /* Setup local variables */
+ pgStatLocalHash = NULL;
+ shared_globalStats = (PgStat_GlobalStats *)
+ dsa_get_address(area, StatsShmem->global_stats);
+ shared_archiverStats = (PgStat_ArchiverStats *)
+ dsa_get_address(area, StatsShmem->archiver_stats);
  }
 
- /*
- * Try to ensure that the socket's receive buffer is at least
- * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose
- * data.  Use of UDP protocol means that we are willing to lose data under
- * heavy load, but we don't want it to happen just because of ridiculously
- * small default buffer sizes (such as 8KB on older Windows versions).
- */
- {
- int old_rcvbuf;
- int new_rcvbuf;
- ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf);
+ MemoryContextSwitchTo(oldcontext);
 
- if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &old_rcvbuf, &rcvbufsize) < 0)
- {
- elog(LOG, "getsockopt(SO_RCVBUF) failed: %m");
- /* if we can't get existing size, always try to set it */
- old_rcvbuf = 0;
- }
+ dsa_pin_mapping(area);
+ global_snapshot_is_valid = false;
+}
 
- new_rcvbuf = PGSTAT_MIN_RCVBUF;
- if (old_rcvbuf < new_rcvbuf)
- {
- if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
-   (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0)
- elog(LOG, "setsockopt(SO_RCVBUF) failed: %m");
- }
+/* ----------
+ * pgstat_detach_shared_stats() -
+ *
+ * Detach shared stats.  Write them out to file if we're the last process
+ * and were instructed to do so.
+ * ----------
+ */
+static void
+pgstat_detach_shared_stats(bool write_stats)
+{
+ if (!area || !IsUnderPostmaster)
+ return;
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+
+ /* write out the shared stats to file if needed */
+ if (--StatsShmem->refcount < 1)
+ {
+ if (write_stats)
+ pgstat_write_statsfiles();
+
+ /* We're the last process. Invalidate the dsa area handle. */
+ StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
  }
 
- pg_freeaddrinfo_all(hints.ai_family, addrs);
+ LWLockRelease(StatsLock);
 
- return;
+ /*
+ * Detach the area. Automatically destroyed when the last process detached
+ * it.
+ */
+ dsa_detach(area);
 
-startup_failed:
- ereport(LOG,
- (errmsg("disabling statistics collector for lack of working socket")));
+ area = NULL;
+ pgStatDBHash = NULL;
+ shared_globalStats = NULL;
+ shared_archiverStats = NULL;
+ pgStatLocalHash = NULL;
+ global_snapshot_is_valid = false;
+}
 
- if (addrs)
- pg_freeaddrinfo_all(hints.ai_family, addrs);
+/*
+ * pgstat_reset_all() -
+ *
+ * Remove the stats file.  This is currently used only if WAL recovery is
+ * needed after a crash.
+ */
+void
+pgstat_reset_all(void)
+{
+ /* we must have shared stats attached */
+ Assert (StatsShmem->stats_dsa_handle != DSM_HANDLE_INVALID);
 
- if (pgStatSock != PGINVALID_SOCKET)
- closesocket(pgStatSock);
- pgStatSock = PGINVALID_SOCKET;
+ /* Startup must be the only user of shared stats */
+ Assert (StatsShmem->refcount == 1);
 
  /*
- * Adjust GUC variables to suppress useless activity, and for debugging
- * purposes (seeing track_counts off is a clue that we failed here). We
- * use PGC_S_OVERRIDE because there is no point in trying to turn it back
- * on from postgresql.conf without a restart.
+ * We could directly remove files and recreate the shared memory area. But
+ * detach then attach for simplicity.
  */
- SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
+ pgstat_detach_shared_stats(false); /* Don't write */
+ pgstat_attach_shared_stats();
 }
 
-/*
- * subroutine for pgstat_reset_all
+/* ------------------------------------------------------------
+ * Public functions used by backends follow
+ *------------------------------------------------------------
  */
-static void
-pgstat_reset_remove_files(const char *directory)
+
+/* ----------
+ * pgstat_report_stat() -
+ *
+ * Must be called by processes that perform DML: tcop/postgres.c, logical
+ * replication workers, SPI workers, etc. to apply the so-far-collected
+ * per-table and function usage statistics to the shared statistics hashes.
+ *
+ *  Updates are applied no more frequently than the interval of
+ *  PGSTAT_STAT_MIN_INTERVAL milliseconds.  They are also postponed on lock
+ *  failure if force is false and no update has been pending for longer than
+ *  PGSTAT_STAT_MAX_INTERVAL milliseconds.  Postponed updates are retried by
+ *  subsequent calls of this function.
+ *
+ * Returns the time in milliseconds until the next time updates are applied,
+ * if no update has been held back for more than
+ * PGSTAT_STAT_MIN_INTERVAL milliseconds.
+ *
+ * Note that this is called only out of a transaction, so it is fine to use
+ * transaction stop time as an approximation of current time.
+ * ----------
+ */
+long
+pgstat_report_stat(bool force)
 {
- DIR   *dir;
- struct dirent *entry;
- char fname[MAXPGPATH * 2];
+ static TimestampTz next_flush = 0;
+ static TimestampTz pending_since = 0;
+ TimestampTz now;
+ pgstat_flush_stat_context cxt = {0};
+ bool pending_stats = false;
+ long elapsed;
+ long secs;
+ int usecs;
+
+ /* Don't expend a clock check if nothing to do */
+ if (area == NULL ||
+ ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
+ pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
+ !HAVE_PENDING_DBSTATS()  && !have_function_stats))
+ return 0;
+
+ now = GetCurrentTransactionStopTimestamp();
 
- dir = AllocateDir(directory);
- while ((entry = ReadDir(dir, directory)) != NULL)
+ if (!force)
  {
- int nchars;
- Oid tmp_oid;
+ /*
+ * Don't flush stats unless it's the time.  Returns time to wait in
+ * milliseconds.
+ */
+ if (now < next_flush)
+ {
+ /* Record the oldest pending update if not yet. */
+ if (pending_since == 0)
+ pending_since = now;
+
+ /* now < next_flush here */
+ return (next_flush - now) / 1000;
+ }
 
  /*
- * Skip directory entries that don't match the file names we write.
- * See get_dbstat_filename for the database-specific pattern.
+ * Don't keep pending updates longer than PGSTAT_STAT_MAX_INTERVAL.
  */
- if (strncmp(entry->d_name, "global.", 7) == 0)
- nchars = 7;
- else
+ if (pending_since > 0)
  {
- nchars = 0;
- (void) sscanf(entry->d_name, "db_%u.%n",
-  &tmp_oid, &nchars);
- if (nchars <= 0)
- continue;
- /* %u allows leading whitespace, so reject that */
- if (strchr("0123456789", entry->d_name[3]) == NULL)
- continue;
+ TimestampDifference(pending_since, now, &secs, &usecs);
+ elapsed = secs * 1000 + usecs /1000;
+
+ if(elapsed > PGSTAT_STAT_MAX_INTERVAL)
+ force = true;
  }
+ }
 
- if (strcmp(entry->d_name + nchars, "tmp") != 0 &&
- strcmp(entry->d_name + nchars, "stat") != 0)
- continue;
+ /* Flush out table stats */
+ if (pgStatTabList != NULL && !pgstat_flush_stat(&cxt, !force))
+ pending_stats = true;
 
- snprintf(fname, sizeof(fname), "%s/%s", directory,
- entry->d_name);
- unlink(fname);
+ /* Flush out function stats */
+ if (pgStatFunctions != NULL && !pgstat_flush_funcstats(&cxt, !force))
+ pending_stats = true;
+
+ /* Flush out database-wide stats */
+ if (HAVE_PENDING_DBSTATS())
+ {
+ if (!pgstat_flush_dbstats(&cxt, !force))
+ pending_stats = true;
  }
- FreeDir(dir);
-}
 
-/*
- * pgstat_reset_all() -
- *
- * Remove the stats files.  This is currently used only if WAL
- * recovery is needed after a crash.
- */
-void
-pgstat_reset_all(void)
-{
- pgstat_reset_remove_files(pgstat_stat_directory);
- pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
-}
+ /* Unpin dbentry if pinned */
+ if (cxt.mydb_tabhash)
+ {
+ dshash_detach(cxt.mydb_tabhash);
+ unpin_hashes(cxt.mydbentry, cxt.mygeneration);
+ cxt.mydb_tabhash = NULL;
+ cxt.mydbentry = NULL;
+ }
 
-#ifdef EXEC_BACKEND
+ /* Publish the last flush time */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ if (shared_globalStats->stats_timestamp < now)
+ shared_globalStats->stats_timestamp = now;
+ LWLockRelease(StatsLock);
 
-/*
- * pgstat_forkexec() -
- *
- * Format up the arglist for, then fork and exec, statistics collector process
- */
-static pid_t
-pgstat_forkexec(void)
-{
- char   *av[10];
- int ac = 0;
+ /* Record how long we have been keeping pending updates. */
+ if (pending_stats)
+ {
+ /* Preserve the first value */
+ if (pending_since == 0)
+ pending_since = now;
 
- av[ac++] = "postgres";
- av[ac++] = "--forkcol";
- av[ac++] = NULL; /* filled in by postmaster_forkexec */
+ /*
+ * It's possible that the retry interval is longer than the limit by
+ * PGSTAT_STAT_MAX_INTERVAL. We don't bother that since it's not so
+ * much.
+ */
+ return PGSTAT_STAT_RETRY_INTERVAL;
+ }
 
- av[ac] = NULL;
- Assert(ac < lengthof(av));
+ /* Set the next time to update stats */
+ next_flush = now + PGSTAT_STAT_MIN_INTERVAL * 1000;
+ pending_since = 0;
 
- return postmaster_forkexec(ac, av);
+ return 0;
 }
-#endif /* EXEC_BACKEND */
-
 
 /*
- * pgstat_start() -
+ * snapshot_statentry() - Common routine for functions
+ * pgstat_fetch_stat_*entry()
  *
- * Called from postmaster at startup or after an existing collector
- * died.  Attempt to fire up a fresh statistics collector.
+ *  Returns the pointer to a snapshot of a shared entry for the key or NULL if
+ *  not found. Returned snapshots are stable during the current transaction or
+ *  until pgstat_clear_snapshot() is called.
  *
- * Returns PID of child process, or 0 if fail.
+ *  The snapshots are stored in a hash, pointer to which is stored in the
+ *  *HTAB variable pointed by cxt->hash. If not created yet, it is created
+ *  using hash_name, hash_entsize in cxt.
  *
- * Note: if fail, we will be called again from the postmaster main loop.
+ *  cxt->dshash points to dshash_table for dbstat entries. If not yet
+ *  attached, it is attached using cxt->dsh_handle.
  */
-int
-pgstat_start(void)
+static void *
+snapshot_statentry(pgstat_snapshot_param *cxt, Oid key)
 {
- time_t curtime;
- pid_t pgStatPid;
+ PgStat_snapshot *lentry = NULL;
+ size_t keysize = cxt->dsh_params->key_size;
+ size_t dsh_entrysize = cxt->dsh_params->entry_size;
+ bool found;
 
  /*
- * Check that the socket is there, else pgstat_init failed and we can do
- * nothing useful.
+ * We don't want overly frequent updates of the stats snapshot. Keep it
+ * for at least PGSTAT_STAT_MIN_INTERVAL ms; don't postpone, just ignore the cue.
  */
- if (pgStatSock == PGINVALID_SOCKET)
- return 0;
+ if (clear_snapshot)
+ {
+ clear_snapshot = false;
 
- /*
- * Do nothing if too soon since last collector start.  This is a safety
- * valve to protect against continuous respawn attempts if the collector
- * is dying immediately at launch.  Note that since we will be re-called
- * from the postmaster main loop, we will get another chance later.
- */
- curtime = time(NULL);
- if ((unsigned int) (curtime - last_pgstat_start_time) <
- (unsigned int) PGSTAT_RESTART_INTERVAL)
- return 0;
- last_pgstat_start_time = curtime;
+ if (pgStatSnapshotContext &&
+ snapshot_globalStats.stats_timestamp <
+ GetCurrentStatementStartTimestamp() -
+ PGSTAT_STAT_MIN_INTERVAL * 1000)
+ {
+ MemoryContextReset(pgStatSnapshotContext);
+
+ /* Reset variables */
+ global_snapshot_is_valid = false;
+ pgStatSnapshotContext = NULL;
+ pgStatLocalHash = NULL;
+
+ pgstat_setup_memcxt();
+ }
+ }
 
  /*
- * Okay, fork off the collector.
+ * Create new hash, with rather arbitrary initial number of entries since
+ * we don't know how this hash will grow.
  */
-#ifdef EXEC_BACKEND
- switch ((pgStatPid = pgstat_forkexec()))
-#else
- switch ((pgStatPid = fork_process()))
-#endif
+ if (!*cxt->hash)
  {
- case -1:
- ereport(LOG,
- (errmsg("could not fork statistics collector: %m")));
- return 0;
+ HASHCTL ctl;
 
-#ifndef EXEC_BACKEND
- case 0:
- /* in postmaster child ... */
- InitPostmasterChild();
+ /*
+ * Create the hash in the stats context
+ *
+ * The entry is prepended by common header part represented by
+ * PgStat_snapshot.
+ */
 
- /* Close the postmaster's sockets */
- ClosePostmasterPorts(false);
+ ctl.keysize = keysize;
+ ctl.entrysize = offsetof(PgStat_snapshot, body) + cxt->hash_entsize;
+ ctl.hcxt = pgStatSnapshotContext;
+ *cxt->hash = hash_create(cxt->hash_name, 32, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ }
 
- /* Drop our connection to postmaster's shared memory, as well */
- dsm_detach_all();
- PGSharedMemoryDetach();
+ lentry = hash_search(*cxt->hash, &key, HASH_ENTER, &found);
 
- PgstatCollectorMain(0, NULL);
- break;
-#endif
+ /*
+ * Refer to the shared hash if not found in the local hash. We return
+ * entries outside a transaction so do the same even if the snapshot is
+ * found.
+ */
+ if (!found || !IsTransactionState())
+ {
+ void *sentry;
 
- default:
- return (int) pgStatPid;
- }
+ /* attach shared hash if not given, leave it alone for later use */
+ if (!*cxt->dshash)
+ {
+ MemoryContext oldcxt;
 
- /* shouldn't get here */
- return 0;
-}
+ Assert (cxt->dsh_handle != DSM_HANDLE_INVALID);
+ oldcxt = MemoryContextSwitchTo(pgStatSnapshotContext);
+ *cxt->dshash =
+ dshash_attach(area, cxt->dsh_params, cxt->dsh_handle, NULL);
+ MemoryContextSwitchTo(oldcxt);
+ }
 
-void
-allow_immediate_pgstat_restart(void)
-{
- last_pgstat_start_time = 0;
-}
+ sentry = dshash_find(*cxt->dshash, &key, false);
 
-/* ------------------------------------------------------------
- * Public functions used by backends follow
- *------------------------------------------------------------
- */
+ if (sentry)
+ {
+ /*
+ * In transaction state, it is obvious that we should create local
+ * cache entries for consistency. If we are not, we return an
+ * up-to-date entry. Having said that, we need a local copy since
+ * dshash entry must be released immediately. We share the same
+ * local hash entry for the purpose.
+ */
+ memcpy(&lentry->body, sentry, dsh_entrysize);
+ dshash_release_lock(*cxt->dshash, sentry);
 
+ /* then zero out the local additional space if any */
+ if (dsh_entrysize < cxt->hash_entsize)
+ MemSet((char *)&lentry->body + dsh_entrysize, 0,
+   cxt->hash_entsize - dsh_entrysize);
+ }
 
-/* ----------
- * pgstat_report_stat() -
+ lentry->negative = !sentry;
+ }
+
+ if (lentry->negative)
+ return NULL;
+
+ return &lentry->body;
+}
+
+/*
+ * pgstat_flush_stat: Flushes table stats out to shared statistics.
  *
- * Must be called by processes that performs DML: tcop/postgres.c, logical
- * receiver processes, SPI worker, etc. to send the so far collected
- * per-table and function usage statistics to the collector.  Note that this
- * is called only when not within a transaction, so it is fair to use
- * transaction stop time as an approximation of current time.
- * ----------
+ *  If nowait is true, returns false if required lock was not acquired
+ *  immediately. In that case, unapplied table stats updates are left alone in
+ *  TabStatusArray to wait for the next chance. cxt holds some dshash related
+ *  values that we want to carry around while updating shared stats.
+ *
+ *  Returns true if all stats info are flushed. Caller must detach dshashes
+ *  stored in cxt after use.
  */
-void
-pgstat_report_stat(bool force)
+static bool
+pgstat_flush_stat(pgstat_flush_stat_context *cxt, bool nowait)
 {
- /* we assume this inits to all zeroes: */
  static const PgStat_TableCounts all_zeroes;
- static TimestampTz last_report = 0;
-
- TimestampTz now;
- PgStat_MsgTabstat regular_msg;
- PgStat_MsgTabstat shared_msg;
  TabStatusArray *tsa;
- int i;
-
- /* Don't expend a clock check if nothing to do */
- if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
- pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
- !have_function_stats)
- return;
+ HTAB   *new_tsa_hash = NULL;
+ TabStatusArray *dest_tsa = pgStatTabList;
+ int dest_elem = 0;
+ int i;
 
- /*
- * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
- * msec since we last sent one, or the caller wants to force stats out.
- */
- now = GetCurrentTransactionStopTimestamp();
- if (!force &&
- !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
- return;
- last_report = now;
+ /* nothing to do, just return  */
+ if (pgStatTabHash == NULL)
+ return true;
 
  /*
  * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry
- * entries it points to.  (Should we fail partway through the loop below,
- * it's okay to have removed the hashtable already --- the only
- * consequence is we'd get multiple entries for the same table in the
- * pgStatTabList, and that's safe.)
+ * entries it points to.
  */
- if (pgStatTabHash)
- hash_destroy(pgStatTabHash);
+ hash_destroy(pgStatTabHash);
  pgStatTabHash = NULL;
 
  /*
  * Scan through the TabStatusArray struct(s) to find tables that actually
- * have counts, and build messages to send.  We have to separate shared
- * relations from regular ones because the databaseid field in the message
- * header has to depend on that.
+ * have counts, and try flushing it out to shared stats. We may fail on
+ * some entries in the array. Leaving the entries being packed at the
+ * beginning of the array.
  */
- regular_msg.m_databaseid = MyDatabaseId;
- shared_msg.m_databaseid = InvalidOid;
- regular_msg.m_nentries = 0;
- shared_msg.m_nentries = 0;
-
  for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
  {
  for (i = 0; i < tsa->tsa_used; i++)
  {
  PgStat_TableStatus *entry = &tsa->tsa_entries[i];
- PgStat_MsgTabstat *this_msg;
- PgStat_TableEntry *this_ent;
 
  /* Shouldn't have any pending transaction-dependent counts */
  Assert(entry->trans == NULL);
@@ -877,178 +905,352 @@ pgstat_report_stat(bool force)
    sizeof(PgStat_TableCounts)) == 0)
  continue;
 
- /*
- * OK, insert data into the appropriate message, and send if full.
- */
- this_msg = entry->t_shared ? &shared_msg : &regular_msg;
- this_ent = &this_msg->m_entry[this_msg->m_nentries];
- this_ent->t_id = entry->t_id;
- memcpy(&this_ent->t_counts, &entry->t_counts,
-   sizeof(PgStat_TableCounts));
- if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+ /* try to apply the tab stats */
+ if (!pgstat_flush_tabstat(cxt, nowait, entry))
  {
- pgstat_send_tabstat(this_msg);
- this_msg->m_nentries = 0;
+ /*
+ * Failed. Move it to the beginning in TabStatusArray and
+ * leave it.
+ */
+ TabStatHashEntry *hash_entry;
+ bool found;
+
+ if (new_tsa_hash == NULL)
+ new_tsa_hash = create_tabstat_hash();
+
+ /* Create hash entry for this entry */
+ hash_entry = hash_search(new_tsa_hash, &entry->t_id,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /*
+ * Move insertion pointer to the next segment if the segment
+ * is filled up.
+ */
+ if (dest_elem >= TABSTAT_QUANTUM)
+ {
+ Assert(dest_tsa->tsa_next != NULL);
+ dest_tsa = dest_tsa->tsa_next;
+ dest_elem = 0;
+ }
+
+ /*
+ * Pack the entry at the beginning of the array. Do nothing if
+ * no need to be moved.
+ */
+ if (tsa != dest_tsa || i != dest_elem)
+ {
+ PgStat_TableStatus *new_entry;
+ new_entry = &dest_tsa->tsa_entries[dest_elem];
+ *new_entry = *entry;
+
+ /* use new_entry as entry hereafter */
+ entry = new_entry;
+ }
+
+ hash_entry->tsa_entry = entry;
+ dest_elem++;
  }
  }
- /* zero out PgStat_TableStatus structs after use */
- MemSet(tsa->tsa_entries, 0,
-   tsa->tsa_used * sizeof(PgStat_TableStatus));
- tsa->tsa_used = 0;
  }
 
+ /* zero out unused area of TableStatus */
+ dest_tsa->tsa_used = dest_elem;
+ MemSet(&dest_tsa->tsa_entries[dest_elem], 0,
+   (TABSTAT_QUANTUM - dest_elem) * sizeof(PgStat_TableStatus));
+ while (dest_tsa->tsa_next)
+ {
+ dest_tsa = dest_tsa->tsa_next;
+ MemSet(dest_tsa->tsa_entries, 0,
+   dest_tsa->tsa_used * sizeof(PgStat_TableStatus));
+ dest_tsa->tsa_used = 0;
+ }
+
+ /* and set the new TabStatusArray hash if any */
+ pgStatTabHash = new_tsa_hash;
+
  /*
- * Send partial messages.  Make sure that any pending xact commit/abort
- * gets counted, even if there are no table stats to send.
+ * We no longer need shared database and table entries, but the ones
+ * for our own database may be used later.
  */
- if (regular_msg.m_nentries > 0 ||
- pgStatXactCommit > 0 || pgStatXactRollback > 0)
- pgstat_send_tabstat(&regular_msg);
- if (shared_msg.m_nentries > 0)
- pgstat_send_tabstat(&shared_msg);
-
- /* Now, send function statistics */
- pgstat_send_funcstats();
+ if (cxt->shdb_tabhash)
+ {
+ dshash_detach(cxt->shdb_tabhash);
+ unpin_hashes(cxt->shdbentry, cxt->shgeneration);
+ cxt->shdb_tabhash = NULL;
+ cxt->shdbentry = NULL;
+ }
+
+ return pgStatTabHash == NULL;
 }
 
+/* -------
+ * Subroutines for pgstat_flush_stat.
+ * -------
+ */
 /*
- * Subroutine for pgstat_report_stat: finish and send a tabstat message
+ * pgstat_flush_tabstat: Flushes a table stats entry.
+ *
+ *  If nowait is true, returns false on lock failure.  Dshashes for table and
+ *  function stats are kept attached in ctx. The caller must detach them after
+ *  use.
+ *
+ *  Returns true if the entry is flushed out.
  */
-static void
-pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+bool
+pgstat_flush_tabstat(pgstat_flush_stat_context *cxt, bool nowait,
+ PgStat_TableStatus *entry)
 {
- int n;
- int len;
+ Oid dboid = entry->t_shared ? InvalidOid : MyDatabaseId;
+ int table_mode = PGSTAT_EXCLUSIVE;
+ bool updated = false;
+ dshash_table *tabhash;
+ PgStat_StatDBEntry *dbent;
+ int generation;
+
+ if (nowait)
+ table_mode |= PGSTAT_NOWAIT;
+
+ /* Attach required table hash if not yet. */
+ if ((entry->t_shared ? cxt->shdb_tabhash : cxt->mydb_tabhash) == NULL)
+ {
+ /*
+ *  Return if we don't have corresponding dbentry. It would've been
+ *  removed.
+ */
+ dbent = pgstat_get_db_entry(dboid, table_mode, NULL);
+ if (!dbent)
+ return false;
 
- /* It's unlikely we'd get here with no socket, but maybe not impossible */
- if (pgStatSock == PGINVALID_SOCKET)
- return;
+ /*
+ * We don't hold lock on the dbentry since it cannot be dropped while
+ * we are working on it.
+ */
+ generation = pin_hashes(dbent);
+ tabhash = attach_table_hash(dbent, generation);
 
- /*
- * Report and reset accumulated xact commit/rollback and I/O timings
- * whenever we send a normal tabstat message
- */
- if (OidIsValid(tsmsg->m_databaseid))
- {
- tsmsg->m_xact_commit = pgStatXactCommit;
- tsmsg->m_xact_rollback = pgStatXactRollback;
- tsmsg->m_block_read_time = pgStatBlockReadTime;
- tsmsg->m_block_write_time = pgStatBlockWriteTime;
- pgStatXactCommit = 0;
- pgStatXactRollback = 0;
- pgStatBlockReadTime = 0;
- pgStatBlockWriteTime = 0;
+ if (entry->t_shared)
+ {
+ cxt->shgeneration = generation;
+ cxt->shdbentry = dbent;
+ cxt->shdb_tabhash = tabhash;
+ }
+ else
+ {
+ cxt->mygeneration = generation;
+ cxt->mydbentry = dbent;
+ cxt->mydb_tabhash = tabhash;
+
+ /*
+ * We come here once per database. Take the chance to update
+ * database-wide stats
+ */
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
+ dbent->n_xact_commit += pgStatXactCommit;
+ dbent->n_xact_rollback += pgStatXactRollback;
+ dbent->n_block_read_time += pgStatBlockReadTime;
+ dbent->n_block_write_time += pgStatBlockWriteTime;
+ LWLockRelease(&dbent->lock);
+ pgStatXactCommit = 0;
+ pgStatXactRollback = 0;
+ pgStatBlockReadTime = 0;
+ pgStatBlockWriteTime = 0;
+ }
+ }
+ else if (entry->t_shared)
+ {
+ dbent = cxt->shdbentry;
+ tabhash = cxt->shdb_tabhash;
  }
  else
  {
- tsmsg->m_xact_commit = 0;
- tsmsg->m_xact_rollback = 0;
- tsmsg->m_block_read_time = 0;
- tsmsg->m_block_write_time = 0;
+ dbent = cxt->mydbentry;
+ tabhash = cxt->mydb_tabhash;
  }
 
- n = tsmsg->m_nentries;
- len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
- n * sizeof(PgStat_TableEntry);
 
- pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
- pgstat_send(tsmsg, len);
+ /*
+ * Local table stats should be applied to both dbentry and tabentry at
+ * once. Update dbentry only if we could update tabentry.
+ */
+ if (pgstat_update_tabentry(tabhash, entry, nowait))
+ {
+ pgstat_update_dbentry(dbent, entry);
+ updated = true;
+ }
+
+ return updated;
 }
 
 /*
- * Subroutine for pgstat_report_stat: populate and send a function stat message
+ * pgstat_flush_funcstats: Flushes function stats.
+ *
+ *  If nowait is true, returns false on lock failure. Unapplied local hash
+ *  entries are left alone.
+ *
+ *  Returns true if all entries are flushed out.
  */
-static void
-pgstat_send_funcstats(void)
+static bool
+pgstat_flush_funcstats(pgstat_flush_stat_context *cxt, bool nowait)
 {
  /* we assume this inits to all zeroes: */
  static const PgStat_FunctionCounts all_zeroes;
-
- PgStat_MsgFuncstat msg;
- PgStat_BackendFunctionEntry *entry;
+ dshash_table   *funchash;
  HASH_SEQ_STATUS fstat;
+ PgStat_BackendFunctionEntry *bestat;
 
+ /* nothing to do, just return  */
  if (pgStatFunctions == NULL)
- return;
+ return true;
+
+ /* get dbentry into cxt if not yet.  */
+ if (cxt->mydbentry == NULL)
+ {
+ int op = PGSTAT_EXCLUSIVE;
+
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_nentries = 0;
+ if (cxt->mydbentry == NULL)
+ return false;
 
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ funchash = attach_function_hash(cxt->mydbentry, cxt->mygeneration);
+ if (funchash == NULL)
+ return false;
+
+ have_function_stats = false;
+
+ /*
+ * Scan through the pgStatFunctions to find functions that actually have
+ * counts, and try flushing it out to shared stats.
+ */
  hash_seq_init(&fstat, pgStatFunctions);
- while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
+ while ((bestat = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
  {
- PgStat_FunctionEntry *m_ent;
+ bool found;
+ PgStat_StatFuncEntry *funcent = NULL;
 
- /* Skip it if no counts accumulated since last time */
- if (memcmp(&entry->f_counts, &all_zeroes,
+ /* Skip it if no counts accumulated for it so far */
+ if (memcmp(&bestat->f_counts, &all_zeroes,
    sizeof(PgStat_FunctionCounts)) == 0)
  continue;
 
- /* need to convert format of time accumulators */
- m_ent = &msg.m_entry[msg.m_nentries];
- m_ent->f_id = entry->f_id;
- m_ent->f_numcalls = entry->f_counts.f_numcalls;
- m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time);
- m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time);
+ funcent = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert_extended(funchash, (void *) &(bestat->f_id),
+   &found, nowait);
 
- if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
+ /*
+ * We couldn't acquire lock on the required entry. Leave the local
+ * entry alone.
+ */
+ if (!funcent)
  {
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
- msg.m_nentries = 0;
+ have_function_stats = true;
+ continue;
  }
 
- /* reset the entry's counts */
- MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
- }
+ /* Initialize if it's new, or add to it. */
+ if (!found)
+ {
+ funcent->functionid = bestat->f_id;
+ funcent->f_numcalls = bestat->f_counts.f_numcalls;
+ funcent->f_total_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time =
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ else
+ {
+ funcent->f_numcalls += bestat->f_counts.f_numcalls;
+ funcent->f_total_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_total_time);
+ funcent->f_self_time +=
+ INSTR_TIME_GET_MICROSEC(bestat->f_counts.f_self_time);
+ }
+ dshash_release_lock(funchash, funcent);
 
- if (msg.m_nentries > 0)
- pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
- msg.m_nentries * sizeof(PgStat_FunctionEntry));
+ /* reset used counts */
+ MemSet(&bestat->f_counts, 0, sizeof(PgStat_FunctionCounts));
+ }
 
- have_function_stats = false;
+ return !have_function_stats;
 }
 
+/*
+ * pgstat_flush_dbstats: Flushes out miscellaneous database stats.
+ *
+ *  If nowait is true, returns with false on lock failure on dbentry.
+ *
+ *  Returns true if all stats are flushed out.
+ */
+static bool
+pgstat_flush_dbstats(pgstat_flush_stat_context *cxt, bool nowait)
+{
+ /* get dbentry if not yet.  */
+ if (cxt->mydbentry == NULL)
+ {
+ int op = PGSTAT_EXCLUSIVE;
+ if (nowait)
+ op |= PGSTAT_NOWAIT;
+
+ cxt->mydbentry = pgstat_get_db_entry(MyDatabaseId, op, NULL);
+
+ /* return if lock failed. */
+ if (cxt->mydbentry == NULL)
+ return false;
+
+ /* we use this generation of table/function stats in this turn */
+ cxt->mygeneration = pin_hashes(cxt->mydbentry);
+ }
+
+ LWLockAcquire(&cxt->mydbentry->lock, LW_EXCLUSIVE);
+ if (HAVE_PENDING_CONFLICTS())
+ pgstat_flush_recovery_conflict(cxt->mydbentry);
+ if (BeDBStats.n_deadlocks != 0)
+ pgstat_flush_deadlock(cxt->mydbentry);
+ if (BeDBStats.n_tmpfiles != 0)
+ pgstat_flush_tempfile(cxt->mydbentry);
+ if (BeDBStats.checksum_failures != NULL)
+ pgstat_flush_checksum_failure(cxt->mydbentry);
+ LWLockRelease(&cxt->mydbentry->lock);
+
+ return true;
+}
 
 /* ----------
  * pgstat_vacuum_stat() -
  *
- * Will tell the collector about objects he can get rid of.
+ * Remove objects we can get rid of.
  * ----------
  */
 void
 pgstat_vacuum_stat(void)
 {
- HTAB   *htab;
- PgStat_MsgTabpurge msg;
- PgStat_MsgFuncpurge f_msg;
- HASH_SEQ_STATUS hstat;
+ HTAB   *oidtab;
+ dshash_seq_status dshstat;
  PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- PgStat_StatFuncEntry *funcentry;
- int len;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* we don't collect stats under standalone mode */
+ if (!IsUnderPostmaster)
  return;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
-
  /*
  * Read pg_database and make a list of OIDs of all existing databases
  */
- htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
+ oidtab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
 
  /*
- * Search the database hash table for dead databases and tell the
- * collector to drop them.
+ * Search the database hash table for dead databases and drop them
+ * from the hash.
  */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+
+ dshash_seq_init(&dshstat, pgStatDBHash, false, true);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&dshstat)) != NULL)
  {
  Oid dbid = dbentry->databaseid;
 
@@ -1056,137 +1258,43 @@ pgstat_vacuum_stat(void)
 
  /* the DB entry for shared tables (with InvalidOid) is never dropped */
  if (OidIsValid(dbid) &&
- hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
+ hash_search(oidtab, (void *) &dbid, HASH_FIND, NULL) == NULL)
  pgstat_drop_database(dbid);
  }
 
  /* Clean up */
- hash_destroy(htab);
+ hash_destroy(oidtab);
 
  /*
  * Lookup our own database entry; if not found, nothing more to do.
  */
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &MyDatabaseId,
- HASH_FIND, NULL);
- if (dbentry == NULL || dbentry->tables == NULL)
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
  return;
 
  /*
  * Similarly to above, make a list of all known relations in this DB.
  */
- htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
-
- /*
- * Initialize our messages table counter to zero
- */
- msg.m_nentries = 0;
+ oidtab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
 
  /*
  * Check for all tables listed in stats hashtable if they still exist.
+ * Stats cache is useless here so directly search the shared hash.
  */
- hash_seq_init(&hstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid tabid = tabentry->tableid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this table's Oid to the message
- */
- msg.m_tableid[msg.m_nentries++] = tabid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
- {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
-
- msg.m_nentries = 0;
- }
- }
-
- /*
- * Send the rest
- */
- if (msg.m_nentries > 0)
- {
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
- + msg.m_nentries * sizeof(Oid);
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
- }
-
- /* Clean up */
- hash_destroy(htab);
+ pgstat_remove_useless_entries(dbentry->tables, &dsh_tblparams, oidtab);
 
  /*
- * Now repeat the above steps for functions.  However, we needn't bother
- * in the common case where no function stats are being collected.
+ * Repeat the above but we needn't bother in the common case where no
+ * function stats are being collected.
  */
- if (dbentry->functions != NULL &&
- hash_get_num_entries(dbentry->functions) > 0)
+ if (dbentry->functions != DSM_HANDLE_INVALID)
  {
- htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
-
- pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
- f_msg.m_databaseid = MyDatabaseId;
- f_msg.m_nentries = 0;
-
- hash_seq_init(&hstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
- {
- Oid funcid = funcentry->functionid;
-
- CHECK_FOR_INTERRUPTS();
-
- if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
- continue;
-
- /*
- * Not there, so add this function's Oid to the message
- */
- f_msg.m_functionid[f_msg.m_nentries++] = funcid;
-
- /*
- * If the message is full, send it out and reinitialize to empty
- */
- if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
-
- f_msg.m_nentries = 0;
- }
- }
-
- /*
- * Send the rest
- */
- if (f_msg.m_nentries > 0)
- {
- len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
- + f_msg.m_nentries * sizeof(Oid);
-
- pgstat_send(&f_msg, len);
- }
+ oidtab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
 
- hash_destroy(htab);
+ pgstat_remove_useless_entries(dbentry->functions, &dsh_funcparams,
+  oidtab);
  }
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
 
@@ -1240,66 +1348,99 @@ pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid)
  return htab;
 }
 
-
-/* ----------
- * pgstat_drop_database() -
+/*
+ * pgstat_remove_useless_entries - Remove useless entries from per
+ * table/function dshashes.
  *
- * Tell the collector that we just dropped a database.
- * (If the message gets lost, we will still clean the dead DB eventually
- * via future invocations of pgstat_vacuum_stat().)
- * ----------
+ *  Scan the dshash specified by dshhandle removing entries that are not in
+ *  oidtab. oidtab is destroyed before returning.
  */
 void
-pgstat_drop_database(Oid databaseid)
+pgstat_remove_useless_entries(const dshash_table_handle dshhandle,
+  const dshash_parameters *dshparams,
+  HTAB *oidtab)
 {
- PgStat_MsgDropdb msg;
+ dshash_table *dshtable;
+ dshash_seq_status dshstat;
+ void *ent;
 
- if (pgStatSock == PGINVALID_SOCKET)
- return;
+ dshtable = dshash_attach(area, dshparams, dshhandle, 0);
+ dshash_seq_init(&dshstat, dshtable, false, true);
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
- msg.m_databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
-}
+ while ((ent = dshash_seq_next(&dshstat)) != NULL)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* The first member of the entries must be Oid */
+ if (hash_search(oidtab, ent, HASH_FIND, NULL) != NULL)
+ continue;
 
+ /* Not there, so purge this entry */
+ dshash_delete_entry(dshtable, ent);
+ }
+ dshash_detach(dshtable);
+ hash_destroy(oidtab);
+}
 
 /* ----------
- * pgstat_drop_relation() -
+ * pgstat_drop_database() -
  *
- * Tell the collector that we just dropped a relation.
- * (If the message gets lost, we will still clean the dead entry eventually
- * via future invocations of pgstat_vacuum_stat().)
+ * Remove entry for the database that we just dropped.
  *
- * Currently not used for lack of any good place to call it; we rely
- * entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
+ * If some stats are flushed after this, this entry will be re-created but we
+ * will still clean the dead DB eventually via future invocations of
+ * pgstat_vacuum_stat().
  * ----------
  */
-#ifdef NOT_USED
 void
-pgstat_drop_relation(Oid relid)
+pgstat_drop_database(Oid databaseid)
 {
- PgStat_MsgTabpurge msg;
- int len;
+ PgStat_StatDBEntry *dbentry;
+
+ Assert (OidIsValid(databaseid));
 
- if (pgStatSock == PGINVALID_SOCKET)
+ if (!IsUnderPostmaster || !pgStatDBHash)
  return;
 
- msg.m_tableid[0] = relid;
- msg.m_nentries = 1;
+ /*
+ * Lookup the database in the hashtable with exclusive lock.
+ */
+ dbentry = pgstat_get_db_entry(databaseid, PGSTAT_EXCLUSIVE, NULL);
+
+ /*
+ * If found, remove it.
+ */
+ if (dbentry)
+ {
+ /* LWLock is needed to rewrite */
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* No one is using tables/functions in this dbentry */
+ Assert(dbentry->refcnt == 0);
 
- len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
+ /* Remove table/function stats dshash first. */
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl =
+ dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ }
+ LWLockRelease(&dbentry->lock);
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, len);
+ dshash_delete_entry(pgStatDBHash, (void *)dbentry);
+ }
 }
-#endif /* NOT_USED */
-
 
 /* ----------
  * pgstat_reset_counters() -
  *
- * Tell the statistics collector to reset counters for our database.
+ * Reset counters for our database.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1308,20 +1449,32 @@ pgstat_drop_relation(Oid relid)
 void
 pgstat_reset_counters(void)
 {
- PgStat_MsgResetcounter msg;
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ if (!pgStatDBHash)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
+ /*
+ * Lookup the database in the hashtable.  Nothing to do if not there.
+ */
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, &status);
+
+ if (!dbentry)
+ return;
+
+ /* This database is active, safe to release the lock immediately. */
+ dshash_release_lock(pgStatDBHash, dbentry);
+
+ /* Reset database-level stats. */
+ reset_dbentry_counters(dbentry);
+
 }
 
 /* ----------
  * pgstat_reset_shared_counters() -
  *
- * Tell the statistics collector to reset cluster-wide shared counters.
+ * Reset cluster-wide shared counters.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1330,29 +1483,37 @@ pgstat_reset_counters(void)
 void
 pgstat_reset_shared_counters(const char *target)
 {
- PgStat_MsgResetsharedcounter msg;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
+ /* Reset the archiver statistics for the cluster. */
  if (strcmp(target, "archiver") == 0)
- msg.m_resettarget = RESET_ARCHIVER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+ shared_archiverStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ /* Reset the bgwriter statistics for the cluster. */
  else if (strcmp(target, "bgwriter") == 0)
- msg.m_resettarget = RESET_BGWRITER;
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
+ shared_globalStats->stat_reset_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
  else
  ereport(ERROR,
  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  errmsg("unrecognized reset target: \"%s\"", target),
  errhint("Target must be \"archiver\" or \"bgwriter\".")));
-
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
- pgstat_send(&msg, sizeof(msg));
 }
 
 /* ----------
  * pgstat_reset_single_counter() -
  *
- * Tell the statistics collector to reset a single counter.
+ * Reset a single counter.
  *
  * Permission checking for this function is managed through the normal
  * GRANT system.
@@ -1361,18 +1522,43 @@ pgstat_reset_shared_counters(const char *target)
 void
 pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 {
- PgStat_MsgResetsinglecounter msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
+ int generation;
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId, PGSTAT_EXCLUSIVE, NULL);
 
- if (pgStatSock == PGINVALID_SOCKET)
+ if (!dbentry)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER);
- msg.m_databaseid = MyDatabaseId;
- msg.m_resettype = type;
- msg.m_objectid = objoid;
+ /* This database is active, safe to release the lock immediately. */
+ generation = pin_hashes(dbentry);
 
- pgstat_send(&msg, sizeof(msg));
-}
+ /* Set the reset timestamp for the whole database */
+ ts = GetCurrentTimestamp();
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->stat_reset_timestamp = ts;
+ LWLockRelease(&dbentry->lock);
+
+ /* Remove object if it exists, ignore if not */
+ if (type == RESET_TABLE)
+ {
+ dshash_table *t = attach_table_hash(dbentry, generation);
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+
+ if (type == RESET_FUNCTION)
+ {
+ dshash_table *t = attach_function_hash(dbentry, generation);
+ if (t)
+ {
+ dshash_delete_key(t, (void *) &objoid);
+ dshash_detach(t);
+ }
+ }
+ unpin_hashes(dbentry, generation);
+}
 
 /* ----------
  * pgstat_report_autovac() -
@@ -1385,48 +1571,81 @@ pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 void
 pgstat_report_autovac(Oid dboid)
 {
- PgStat_MsgAutovacStart msg;
+ PgStat_StatDBEntry *dbentry;
+ TimestampTz ts;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
- msg.m_databaseid = dboid;
- msg.m_start_time = GetCurrentTimestamp();
+ /*
+ * Store the last autovacuum time in the database's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ dshash_release_lock(pgStatDBHash, dbentry);
+
+ ts = GetCurrentTimestamp();
 
- pgstat_send(&msg, sizeof(msg));
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->last_autovac_time = ts;
+ LWLockRelease(&dbentry->lock);
 }
 
 
 /* ---------
  * pgstat_report_vacuum() -
  *
- * Tell the collector about the table we just vacuumed.
+ * Report about the table we just vacuumed.
  * ---------
  */
 void
 pgstat_report_vacuum(Oid tableoid, bool shared,
  PgStat_Counter livetuples, PgStat_Counter deadtuples)
 {
- PgStat_MsgVacuum msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
- msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = tableoid;
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_vacuumtime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = shared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hash table entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+
+ tabentry = pgstat_get_tab_entry(table, tableoid, true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_vacuum_count++;
+ }
+ else
+ {
+ tabentry->vacuum_timestamp = GetCurrentTimestamp();
+ tabentry->vacuum_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_analyze() -
  *
- * Tell the collector about the table we just analyzed.
+ * Report about the table we just analyzed.
  *
  * Caller must provide new live- and dead-tuples estimates, as well as a
  * flag indicating whether to reset the changes_since_analyze counter.
@@ -1437,9 +1656,14 @@ pgstat_report_analyze(Relation rel,
   PgStat_Counter livetuples, PgStat_Counter deadtuples,
   bool resetcounter)
 {
- PgStat_MsgAnalyze msg;
+ Oid dboid;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ dshash_table   *table;
+ int generation;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
  /*
@@ -1468,78 +1692,153 @@ pgstat_report_analyze(Relation rel,
  deadtuples = Max(deadtuples, 0);
  }
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
- msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
- msg.m_tableoid = RelationGetRelid(rel);
- msg.m_autovacuum = IsAutoVacuumWorkerProcess();
- msg.m_resetcounter = resetcounter;
- msg.m_analyzetime = GetCurrentTimestamp();
- msg.m_live_tuples = livetuples;
- msg.m_dead_tuples = deadtuples;
- pgstat_send(&msg, sizeof(msg));
+ dboid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
+
+ /*
+ * Store the data in the table's hashtable entry.
+ */
+ dbentry = pgstat_get_db_entry(dboid, PGSTAT_EXCLUSIVE, NULL);
+ generation = pin_hashes(dbentry);
+ table = attach_table_hash(dbentry, generation);
+ tabentry = pgstat_get_tab_entry(table, RelationGetRelid(rel), true);
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ /*
+ * If commanded, reset changes_since_analyze to zero.  This forgets any
+ * changes that were committed while the ANALYZE was in progress, but we
+ * have no good way to estimate how many of those there were.
+ */
+ if (resetcounter)
+ tabentry->changes_since_analyze = 0;
+
+ if (IsAutoVacuumWorkerProcess())
+ {
+ tabentry->autovac_analyze_timestamp = GetCurrentTimestamp();
+ tabentry->autovac_analyze_count++;
+ }
+ else
+ {
+ tabentry->analyze_timestamp = GetCurrentTimestamp();
+ tabentry->analyze_count++;
+ }
+ dshash_release_lock(table, tabentry);
+
+ dshash_detach(table);
+ unpin_hashes(dbentry, generation);
 }
 
 /* --------
  * pgstat_report_recovery_conflict() -
  *
- * Tell the collector about a Hot Standby recovery conflict.
+ * Report a Hot Standby recovery conflict.
  * --------
  */
 void
 pgstat_report_recovery_conflict(int reason)
 {
- PgStat_MsgRecoveryConflict msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
+
+ /* return if we are not collecting stats */
+ if (!area)
+ return;
+
+ switch (reason)
+ {
+ case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+
+ /*
+ * Since we drop the information about the database as soon as it
+ * replicates, there is no point in counting these conflicts.
+ */
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ BeDBStats.n_conflict_tablespace++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ BeDBStats.n_conflict_lock++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ BeDBStats.n_conflict_snapshot++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ BeDBStats.n_conflict_bufferpin++;
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ BeDBStats.n_conflict_startup_deadlock++;
+ break;
+ }
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ if (status == LOCK_FAILED)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
- msg.m_databaseid = MyDatabaseId;
- msg.m_reason = reason;
- pgstat_send(&msg, sizeof(msg));
+ /* We had a chance to flush immediately */
+ pgstat_flush_recovery_conflict(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
+
+/*
+ * flush recovery conflict stats
+ */
+static void
+pgstat_flush_recovery_conflict(PgStat_StatDBEntry *dbentry)
+{
+ dbentry->n_conflict_tablespace += BeDBStats.n_conflict_tablespace;
+ dbentry->n_conflict_lock += BeDBStats.n_conflict_lock;
+ dbentry->n_conflict_snapshot += BeDBStats.n_conflict_snapshot;
+ dbentry->n_conflict_bufferpin += BeDBStats.n_conflict_bufferpin;
+ dbentry->n_conflict_startup_deadlock += BeDBStats.n_conflict_startup_deadlock;
+
+ BeDBStats.n_conflict_tablespace = 0;
+ BeDBStats.n_conflict_lock = 0;
+ BeDBStats.n_conflict_snapshot = 0;
+ BeDBStats.n_conflict_bufferpin = 0;
+ BeDBStats.n_conflict_startup_deadlock = 0;
 }
 
 /* --------
  * pgstat_report_deadlock() -
  *
- * Tell the collector about a deadlock detected.
+ * Report a deadlock detected.
  * --------
  */
 void
 pgstat_report_deadlock(void)
 {
- PgStat_MsgDeadlock msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK);
- msg.m_databaseid = MyDatabaseId;
- pgstat_send(&msg, sizeof(msg));
-}
-
+ BeDBStats.n_deadlocks++;
 
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
 
-/* --------
- * pgstat_report_checksum_failures_in_db() -
- *
- * Tell the collector about one or more checksum failures.
- * --------
- */
-void
-pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
-{
- PgStat_MsgChecksumFailure msg;
-
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ if (status == LOCK_FAILED)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
- msg.m_databaseid = dboid;
- msg.m_failurecount = failurecount;
- msg.m_failure_time = GetCurrentTimestamp();
+ dshash_release_lock(pgStatDBHash, dbentry);
+}
 
- pgstat_send(&msg, sizeof(msg));
+/*
+ * flush dead lock stats
+ */
+static void
+pgstat_flush_deadlock(PgStat_StatDBEntry *dbentry)
+{
+ dbentry->n_deadlocks += BeDBStats.n_deadlocks;
+ BeDBStats.n_deadlocks = 0;
 }
 
 /* --------
@@ -1557,60 +1856,153 @@ pgstat_report_checksum_failure(void)
 /* --------
  * pgstat_report_tempfile() -
  *
- * Tell the collector about a temporary file.
+ * Report a temporary file.
  * --------
  */
 void
 pgstat_report_tempfile(size_t filesize)
 {
- PgStat_MsgTempFile msg;
+ PgStat_StatDBEntry *dbentry;
+ PgStat_TableLookupResult status;
+
+ /* return if we are not collecting stats */
+ if (!area)
+ return;
+
+ if (filesize > 0) /* Is there a case where filesize is really 0? */
+ {
+ BeDBStats.tmpfilesize += filesize; /* needs check overflow */
+ BeDBStats.n_tmpfiles++;
+ }
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ if (BeDBStats.n_tmpfiles == 0)
  return;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE);
- msg.m_databaseid = MyDatabaseId;
- msg.m_filesize = filesize;
- pgstat_send(&msg, sizeof(msg));
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ return;
+
+ /* We had a chance to flush immediately */
+ pgstat_flush_tempfile(dbentry);
+
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
+/*
+ * flush temporary file stats
+ */
+static void
+pgstat_flush_tempfile(PgStat_StatDBEntry *dbentry)
+{
 
-/* ----------
- * pgstat_ping() -
+ dbentry->n_temp_bytes += BeDBStats.tmpfilesize;
+ dbentry->n_temp_files += BeDBStats.n_tmpfiles;
+ BeDBStats.tmpfilesize = 0;
+ BeDBStats.n_tmpfiles = 0;
+}
+
+/* --------
+ * pgstat_report_checksum_failures_in_db(dboid, failure_count) -
  *
- * Send some junk data to the collector to increase traffic.
- * ----------
+ * Tell the collector about one or more checksum failures.
+ * --------
  */
 void
-pgstat_ping(void)
+pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
 {
- PgStat_MsgDummy msg;
+ PgStat_StatDBEntry   *dbentry;
+ PgStat_TableLookupResult status;
+ ChecksumFailureEnt   *failent = NULL;
+
+ /* return if we are not collecting stats */
+ if (!area)
+ return;
+
+ if (BeDBStats.checksum_failures != NULL)
+ {
+ failent = hash_search(BeDBStats.checksum_failures, &dboid,
+  HASH_FIND, NULL);
+ if (failent)
+ failurecount += failent->count;
+ }
+
+ if (failurecount == 0)
+ return;
+
+ dbentry = pgstat_get_db_entry(MyDatabaseId,
+  PGSTAT_EXCLUSIVE | PGSTAT_NOWAIT,
+  &status);
+
+ if (status == LOCK_FAILED)
+ {
+ if (!failent)
+ {
+ if (!BeDBStats.checksum_failures)
+ {
+ HASHCTL ctl;
 
- if (pgStatSock == PGINVALID_SOCKET)
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(ChecksumFailureEnt);
+ BeDBStats.checksum_failures =
+ hash_create("pgstat checksum failure count hash",
+ 32, &ctl, HASH_ELEM | HASH_BLOBS);
+ }
+
+ failent = hash_search(BeDBStats.checksum_failures,
+  &dboid, HASH_ENTER, NULL);
+ }
+
+ failent->count = failurecount;
  return;
+ }
+
+ /* We have a chance to flush immediately */
+ dbentry->n_checksum_failures += failurecount;
+ BeDBStats.checksum_failures = NULL;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
- pgstat_send(&msg, sizeof(msg));
+ dshash_release_lock(pgStatDBHash, dbentry);
 }
 
-/* ----------
- * pgstat_send_inquiry() -
- *
- * Notify collector that we need fresh data.
- * ----------
+/*
+ * flush checksum failure counts for all databases
  */
 static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
+pgstat_flush_checksum_failure(PgStat_StatDBEntry *dbentry)
 {
- PgStat_MsgInquiry msg;
+ HASH_SEQ_STATUS stat;
+ ChecksumFailureEnt *ent;
+ bool release_dbent;
 
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
- msg.clock_time = clock_time;
- msg.cutoff_time = cutoff_time;
- msg.databaseid = databaseid;
- pgstat_send(&msg, sizeof(msg));
-}
+ if (BeDBStats.checksum_failures == NULL)
+ return;
+
+ hash_seq_init(&stat, BeDBStats.checksum_failures);
+ while ((ent = (ChecksumFailureEnt *) hash_seq_search(&stat)) != NULL)
+ {
+ release_dbent = false;
+
+ if (dbentry->databaseid != ent->dboid)
+ {
+ dbentry = pgstat_get_db_entry(ent->dboid,
+  PGSTAT_EXCLUSIVE, NULL);
+ if (!dbentry)
+ continue;
+
+ release_dbent = true;
+ }
+
+ dbentry->n_checksum_failures += ent->count;
 
+ if (release_dbent)
+ dshash_release_lock(pgStatDBHash, dbentry);
+ }
+
+ hash_destroy(BeDBStats.checksum_failures);
+ BeDBStats.checksum_failures = NULL;
+}
 
 /*
  * Initialize function call usage data.
@@ -1762,7 +2154,8 @@ pgstat_initstats(Relation rel)
  return;
  }
 
- if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ /* return if we are not collecting stats */
+ if (!area)
  {
  /* We're not counting at all */
  rel->pgstat_info = NULL;
@@ -1781,6 +2174,24 @@ pgstat_initstats(Relation rel)
  rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
 }
 
+/*
+ * create_tabstat_hash - create local hash as transactional storage
+ */
+static HTAB *
+create_tabstat_hash(void)
+{
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(ctl));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(TabStatHashEntry);
+
+ return hash_create("pgstat TabStatusArray lookup hash table",
+   TABSTAT_QUANTUM,
+   &ctl,
+   HASH_ELEM | HASH_BLOBS);
+}
+
 /*
  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
  */
@@ -1796,18 +2207,7 @@ get_tabstat_entry(Oid rel_id, bool isshared)
  * Create hash table if we don't have it already.
  */
  if (pgStatTabHash == NULL)
- {
- HASHCTL ctl;
-
- memset(&ctl, 0, sizeof(ctl));
- ctl.keysize = sizeof(Oid);
- ctl.entrysize = sizeof(TabStatHashEntry);
-
- pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
- TABSTAT_QUANTUM,
- &ctl,
- HASH_ELEM | HASH_BLOBS);
- }
+ pgStatTabHash = create_tabstat_hash();
 
  /*
  * Find an entry or create a new one.
@@ -2420,30 +2820,33 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info,
 /* ----------
  * pgstat_fetch_stat_dbentry() -
  *
- * Support function for the SQL-callable pgstat* functions. Returns
- * the collected statistics for one database or NULL. NULL doesn't mean
- * that the database doesn't exist, it is just not yet known by the
- * collector, so the caller is better off to report ZERO instead.
- * ----------
+ * Find database stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
  */
 PgStat_StatDBEntry *
 pgstat_fetch_stat_dbentry(Oid dbid)
 {
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
-
- /*
- * Lookup the requested database; return NULL if not found
- */
- return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-  (void *) &dbid,
-  HASH_FIND, NULL);
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "local database stats hash",
+ .hash_entsize = sizeof(PgStat_StatDBEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,   /* already attached */
+ .dsh_params = &dsh_dbparams,
+ .hash = &pgStatLocalHash,
+ .dshash = &pgStatDBHash
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* If not done for this transaction, take a snapshot of global stats */
+ pgstat_snapshot_global_stats();
+
+ /* callers have no business with snapshot-local members */
+ return (PgStat_StatDBEntry *) snapshot_statentry(&param, dbid);
 }
 
-
 /* ----------
  * pgstat_fetch_stat_tabentry() -
  *
@@ -2456,51 +2859,66 @@ pgstat_fetch_stat_dbentry(Oid dbid)
 PgStat_StatTabEntry *
 pgstat_fetch_stat_tabentry(Oid relid)
 {
- Oid dbid;
  PgStat_StatDBEntry *dbentry;
  PgStat_StatTabEntry *tabentry;
 
- /*
- * If not done for this transaction, read the statistics collector stats
- * file into some hash tables.
- */
- backend_read_statsfile();
+ /* Lookup our database, then look in its table hash table. */
+ dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
+ if (dbentry == NULL)
+ return NULL;
 
- /*
- * Lookup our database, then look in its table hash table.
- */
- dbid = MyDatabaseId;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  /*
  * If we didn't find it, maybe it's a shared table.
  */
- dbid = InvalidOid;
- dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_FIND, NULL);
- if (dbentry != NULL && dbentry->tables != NULL)
- {
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &relid,
-   HASH_FIND, NULL);
- if (tabentry)
- return tabentry;
- }
+ dbentry = pgstat_fetch_stat_dbentry(InvalidOid);
+ if (dbentry == NULL)
+ return NULL;
+
+ tabentry = pgstat_fetch_stat_tabentry_extended(dbentry, relid);
+ if (tabentry != NULL)
+ return tabentry;
 
  return NULL;
 }
 
+/* ----------
+ * pgstat_fetch_stat_tabentry_extended() -
+ *
+ * Find table stats entry on backends. The returned entries are cached until
+ * transaction end or pgstat_clear_snapshot() is called.
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid reloid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "table stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatTabEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_tblparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->tables;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_tables;
+ param.dshash = &dbent->dshash_tables;
+
+ return (PgStat_StatTabEntry *)
+ snapshot_statentry(&param, reloid);
+}
+
 
 /* ----------
  * pgstat_fetch_stat_funcentry() -
@@ -2515,21 +2933,90 @@ pgstat_fetch_stat_funcentry(Oid func_id)
  PgStat_StatDBEntry *dbentry;
  PgStat_StatFuncEntry *funcentry = NULL;
 
- /* load the stats file if needed */
- backend_read_statsfile();
-
- /* Lookup our database, then find the requested function.  */
+ /* Lookup our database, then find the requested function */
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
- if (dbentry != NULL && dbentry->functions != NULL)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &func_id,
- HASH_FIND, NULL);
- }
+ if (dbentry == NULL)
+ return NULL;
+
+ funcentry = pgstat_fetch_stat_funcentry_extended(dbentry, func_id);
 
  return funcentry;
 }
 
+/* ----------
+ * pgstat_fetch_stat_funcentry_extended() -
+ *
+ * Find function stats entry on backends. The returned entries are cached
+ * until transaction end or pgstat_clear_snapshot() is called.
+ *
+ *  dbent is of type (PgStat_StatDBEntry *), but its body must be a
+ *  PgStat_StatDBEntry returned from pgstat_fetch_stat_dbentry().
+ */
+static PgStat_StatFuncEntry *
+pgstat_fetch_stat_funcentry_extended(PgStat_StatDBEntry *dbent, Oid funcid)
+{
+ /* context for snapshot_statentry */
+ static pgstat_snapshot_param param =
+ {
+ .hash_name = "function stats snapshot hash",
+ .hash_entsize = sizeof(PgStat_StatFuncEntry),
+ .dsh_handle = DSM_HANDLE_INVALID,
+ .dsh_params = &dsh_funcparams,
+ .hash = NULL,
+ .dshash = NULL
+ };
+
+ /* should be called from backends  */
+ Assert(IsUnderPostmaster);
+
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ return NULL;
+
+ /* set target shared hash */
+ param.dsh_handle = dbent->functions;
+
+ /* tell snapshot_statentry what variables to use */
+ param.hash = &dbent->snapshot_functions;
+ param.dshash = &dbent->dshash_functions;
+
+ return (PgStat_StatFuncEntry *)
+ snapshot_statentry(&param, funcid);
+}
+
+/*
+ * pgstat_snapshot_global_stats() -
+ *
+ * Makes a snapshot of global stats if not done yet.  They will be kept until
+ * subsequent call of pgstat_clear_snapshot() or the end of the current
+ * memory context (typically TopTransactionContext).
+ */
+static void
+pgstat_snapshot_global_stats(void)
+{
+ MemoryContext oldcontext;
+
+ pgstat_attach_shared_stats();
+
+ /* Nothing to do if already done */
+ if (global_snapshot_is_valid)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(pgStatSnapshotContext);
+
+ LWLockAcquire(StatsLock, LW_SHARED);
+ memcpy(&snapshot_globalStats, shared_globalStats,
+   sizeof(PgStat_GlobalStats));
+
+ memcpy(&snapshot_archiverStats, shared_archiverStats,
+   sizeof(PgStat_ArchiverStats));
+ LWLockRelease(StatsLock);
+
+ global_snapshot_is_valid = true;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return;
+}
 
 /* ----------
  * pgstat_fetch_stat_beentry() -
@@ -2601,9 +3088,10 @@ pgstat_fetch_stat_numbackends(void)
 PgStat_ArchiverStats *
 pgstat_fetch_stat_archiver(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &archiverStats;
+ return &snapshot_archiverStats;
 }
 
 
@@ -2618,9 +3106,10 @@ pgstat_fetch_stat_archiver(void)
 PgStat_GlobalStats *
 pgstat_fetch_global(void)
 {
- backend_read_statsfile();
+ /* If not done for this transaction, take a stats snapshot */
+ pgstat_snapshot_global_stats();
 
- return &globalStats;
+ return &snapshot_globalStats;
 }
 
 
@@ -2834,8 +3323,8 @@ pgstat_initialize(void)
  MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
  }
 
- /* Set up a process-exit hook to clean up */
- on_shmem_exit(pgstat_beshutdown_hook, 0);
+ /* needs to be called before dsm shutdown */
+ before_shmem_exit(pgstat_beshutdown_hook, 0);
 }
 
 /* ----------
@@ -2933,7 +3422,7 @@ pgstat_bestart(void)
  lbeentry.st_backendType = B_STARTUP;
  break;
  case ArchiverProcess:
- beentry->st_backendType = B_ARCHIVER;
+ lbeentry.st_backendType = B_ARCHIVER;
  break;
  case BgWriterProcess:
  lbeentry.st_backendType = B_BG_WRITER;
@@ -3069,6 +3558,10 @@ pgstat_bestart(void)
  /* Update app name to current GUC setting */
  if (application_name)
  pgstat_report_appname(application_name);
+
+
+ /* attach shared database stats area */
+ pgstat_attach_shared_stats();
 }
 
 /*
@@ -3104,6 +3597,8 @@ pgstat_beshutdown_hook(int code, Datum arg)
  beentry->st_procpid = 0; /* mark invalid */
 
  PGSTAT_END_WRITE_ACTIVITY(beentry);
+
+ pgstat_detach_shared_stats(true);
 }
 
 
@@ -3364,7 +3859,8 @@ pgstat_read_current_status(void)
 #endif
  int i;
 
- Assert(!pgStatRunningInCollector);
+ Assert(IsUnderPostmaster);
+
  if (localBackendStatusTable)
  return; /* already done */
 
@@ -3659,9 +4155,6 @@ pgstat_get_wait_activity(WaitEventActivity w)
  case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
  event_name = "LogicalLauncherMain";
  break;
- case WAIT_EVENT_PGSTAT_MAIN:
- event_name = "PgStatMain";
- break;
  case WAIT_EVENT_RECOVERY_WAL_ALL:
  event_name = "RecoveryWalAll";
  break;
@@ -4321,75 +4814,43 @@ pgstat_get_backend_desc(BackendType backendType)
  * ------------------------------------------------------------
  */
 
-
 /* ----------
- * pgstat_setheader() -
+ * pgstat_send_archiver() -
  *
- * Set common header fields in a statistics message
+ * Report archiver statistics
  * ----------
  */
-static void
-pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
+void
+pgstat_send_archiver(const char *xlog, bool failed)
 {
- hdr->m_type = mtype;
-}
+ TimestampTz now = GetCurrentTimestamp();
 
-
-/* ----------
- * pgstat_send() -
- *
- * Send out one statistics message to the collector
- * ----------
- */
-static void
-pgstat_send(void *msg, int len)
-{
- int rc;
-
- if (pgStatSock == PGINVALID_SOCKET)
- return;
-
- ((PgStat_MsgHdr *) msg)->m_size = len;
-
- /* We'll retry after EINTR, but ignore all other failures */
- do
- {
- rc = send(pgStatSock, msg, len, 0);
- } while (rc < 0 && errno == EINTR);
-
-#ifdef USE_ASSERT_CHECKING
- /* In debug builds, log send failures ... */
- if (rc < 0)
- elog(LOG, "could not send to statistics collector: %m");
-#endif
-}
-
-/* ----------
- * pgstat_send_archiver() -
- *
- * Tell the collector about the WAL file that we successfully
- * archived or failed to archive.
- * ----------
- */
-void
-pgstat_send_archiver(const char *xlog, bool failed)
-{
- PgStat_MsgArchiver msg;
-
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER);
- msg.m_failed = failed;
- StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog));
- msg.m_timestamp = GetCurrentTimestamp();
- pgstat_send(&msg, sizeof(msg));
-}
+ if (failed)
+ {
+ /* Failed archival attempt */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->failed_count;
+ memcpy(shared_archiverStats->last_failed_wal, xlog,
+   sizeof(shared_archiverStats->last_failed_wal));
+ shared_archiverStats->last_failed_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+ else
+ {
+ /* Successful archival operation */
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ ++shared_archiverStats->archived_count;
+ memcpy(shared_archiverStats->last_archived_wal, xlog,
+   sizeof(shared_archiverStats->last_archived_wal));
+ shared_archiverStats->last_archived_timestamp = now;
+ LWLockRelease(StatsLock);
+ }
+}
 
 /* ----------
  * pgstat_send_bgwriter() -
  *
- * Send bgwriter statistics to the collector
+ * Report bgwriter statistics
  * ----------
  */
 void
@@ -4398,6 +4859,8 @@ pgstat_send_bgwriter(void)
  /* We assume this initializes to zeroes */
  static const PgStat_MsgBgWriter all_zeroes;
 
+ PgStat_MsgBgWriter *s = &BgWriterStats;
+
  /*
  * This function can be called even if nothing at all has happened. In
  * this case, avoid sending a completely empty message to the stats
@@ -4406,11 +4869,18 @@ pgstat_send_bgwriter(void)
  if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
  return;
 
- /*
- * Prepare and send the message
- */
- pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
- pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+ LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+ shared_globalStats->timed_checkpoints += s->m_timed_checkpoints;
+ shared_globalStats->requested_checkpoints += s->m_requested_checkpoints;
+ shared_globalStats->checkpoint_write_time += s->m_checkpoint_write_time;
+ shared_globalStats->checkpoint_sync_time += s->m_checkpoint_sync_time;
+ shared_globalStats->buf_written_checkpoints += s->m_buf_written_checkpoints;
+ shared_globalStats->buf_written_clean += s->m_buf_written_clean;
+ shared_globalStats->maxwritten_clean += s->m_maxwritten_clean;
+ shared_globalStats->buf_written_backend += s->m_buf_written_backend;
+ shared_globalStats->buf_fsync_backend += s->m_buf_fsync_backend;
+ shared_globalStats->buf_alloc += s->m_buf_alloc;
+ LWLockRelease(StatsLock);
 
  /*
  * Clear out the statistics buffer, so it can be re-used.
@@ -4419,305 +4889,164 @@ pgstat_send_bgwriter(void)
 }
 
 
-/* ----------
- * PgstatCollectorMain() -
- *
- * Start up the statistics collector process.  This is the body of the
- * postmaster child process.
+/*
+ * Pin and Unpin dbentry.
  *
- * The argc/argv parameters are valid only in EXEC_BACKEND case.
- * ----------
+ * To reduce memory usage and for speed, counters are reset by recreating the
+ * dshash instead of removing entries one by one under the whole-dshash lock.
+ * On the other hand, a dshash cannot be destroyed until all referrers have
+ * gone, so other backends could otherwise be kept waiting on a counter reset
+ * for a while. We isolate the hashes under destruction as another
+ * "generation", meaning they are no longer used but cannot be removed yet.
+ *
+ * When starting to access hashes on a dbentry, call pin_hashes() to acquire
+ * the current "generation"; unpin_hashes() removes the older generation's
+ * hashes once all referrers have gone.
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+static int
+pin_hashes(PgStat_StatDBEntry *dbentry)
 {
- int len;
- PgStat_Msg msg;
- int wr;
+ int generation;
 
- /*
- * Ignore all signals usually bound to some action in the postmaster,
- * except SIGHUP and SIGQUIT.  Note we don't need a SIGUSR1 handler to
- * support latch operations, because we only use a local latch.
- */
- pqsignal(SIGHUP, pgstat_sighup_handler);
- pqsignal(SIGINT, SIG_IGN);
- pqsignal(SIGTERM, SIG_IGN);
- pqsignal(SIGQUIT, pgstat_exit);
- pqsignal(SIGALRM, SIG_IGN);
- pqsignal(SIGPIPE, SIG_IGN);
- pqsignal(SIGUSR1, SIG_IGN);
- pqsignal(SIGUSR2, SIG_IGN);
- /* Reset some signals that are accepted by postmaster but not here */
- pqsignal(SIGCHLD, SIG_DFL);
- PG_SETMASK(&UnBlockSig);
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->refcnt++;
+ generation = dbentry->generation;
+ LWLockRelease(&dbentry->lock);
 
- /*
- * Identify myself via ps
- */
- init_ps_display("stats collector", "", "", "");
+ dshash_release_lock(pgStatDBHash, dbentry);
 
- /*
- * Read in existing stats files or initialize the stats to zero.
- */
- pgStatRunningInCollector = true;
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
+ return generation;
+}
 
- /*
- * Loop to process messages until we get SIGQUIT or detect ungraceful
- * death of our parent postmaster.
- *
- * For performance reasons, we don't want to do ResetLatch/WaitLatch after
- * every message; instead, do that only after a recv() fails to obtain a
- * message.  (This effectively means that if backends are sending us stuff
- * like mad, we won't notice postmaster death until things slack off a
- * bit; which seems fine.) To do that, we have an inner loop that
- * iterates as long as recv() succeeds.  We do recognize got_SIGHUP inside
- * the inner loop, which means that such interrupts will get serviced but
- * the latch won't get cleared until next time there is a break in the
- * action.
- */
- for (;;)
+/*
+ * Unpin hashes in dbentry. If the given generation is isolated, destroy it
+ * once all referrers have gone. Otherwise just decrease the reference count.
+ */
+static void
+unpin_hashes(PgStat_StatDBEntry *dbentry, int generation)
+{
+ dshash_table *tables;
+ dshash_table *funcs = NULL;
+
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+
+ /* using current generation, just decrease refcount */
+ if (dbentry->generation == generation)
  {
- /* Clear any already-pending wakeups */
- ResetLatch(MyLatch);
+ dbentry->refcnt--;
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
 
- /*
- * Quit if we get SIGQUIT from the postmaster.
- */
- if (need_exit)
- break;
+ /*
+ * It is isolated, waiting for all referrers to end.
+ */
+ Assert(dbentry->generation == generation + 1);
 
- /*
- * Inner loop iterates as long as we keep getting messages, or until
- * need_exit becomes set.
- */
- while (!need_exit)
- {
- /*
- * Reload configuration if we got SIGHUP from the postmaster.
- */
- if (got_SIGHUP)
- {
- got_SIGHUP = false;
- ProcessConfigFile(PGC_SIGHUP);
- }
+ if (--dbentry->prev_refcnt > 0)
+ {
+ LWLockRelease(&dbentry->lock);
+ return;
+ }
 
- /*
- * Write the stats file(s) if a new request has arrived that is
- * not satisfied by existing file(s).
- */
- if (pgstat_write_statsfile_needed())
- pgstat_write_statsfiles(false, false);
+ /* no referrer remains, remove the hashes */
+ tables = dshash_attach(area, &dsh_tblparams, dbentry->prev_tables, 0);
+ if (dbentry->prev_functions != DSM_HANDLE_INVALID)
+ funcs = dshash_attach(area, &dsh_funcparams,
+  dbentry->prev_functions, 0);
 
- /*
- * Try to receive and process a message.  This will not block,
- * since the socket is set to non-blocking mode.
- *
- * XXX On Windows, we have to force pgwin32_recv to cooperate,
- * despite the previous use of pg_set_noblock() on the socket.
- * This is extremely broken and should be fixed someday.
- */
-#ifdef WIN32
- pgwin32_noblock = 1;
-#endif
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
 
- len = recv(pgStatSock, (char *) &msg,
-   sizeof(PgStat_Msg), 0);
+ /* release the entry immediately */
+ LWLockRelease(&dbentry->lock);
 
-#ifdef WIN32
- pgwin32_noblock = 0;
-#endif
+ dshash_destroy(tables);
+ if (funcs)
+ dshash_destroy(funcs);
 
- if (len < 0)
- {
- if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
- break; /* out of inner loop */
- ereport(ERROR,
- (errcode_for_socket_access(),
- errmsg("could not read statistics message: %m")));
- }
+ return;
+}
 
- /*
- * We ignore messages that are smaller than our common header
- */
- if (len < sizeof(PgStat_MsgHdr))
- continue;
+/*
+ * Attach and return the specified generation of the table dshash for the
+ * given dbentry.
+ */
+static dshash_table *
+attach_table_hash(PgStat_StatDBEntry *dbent, int gen)
+{
+ dshash_table *ret;
 
- /*
- * The received length must match the length in the header
- */
- if (msg.msg_hdr.m_size != len)
- continue;
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
 
- /*
- * O.K. - we accept this message.  Process it.
- */
- switch (msg.msg_hdr.m_type)
- {
- case PGSTAT_MTYPE_DUMMY:
- break;
-
- case PGSTAT_MTYPE_INQUIRY:
- pgstat_recv_inquiry(&msg.msg_inquiry, len);
- break;
-
- case PGSTAT_MTYPE_TABSTAT:
- pgstat_recv_tabstat(&msg.msg_tabstat, len);
- break;
-
- case PGSTAT_MTYPE_TABPURGE:
- pgstat_recv_tabpurge(&msg.msg_tabpurge, len);
- break;
-
- case PGSTAT_MTYPE_DROPDB:
- pgstat_recv_dropdb(&msg.msg_dropdb, len);
- break;
-
- case PGSTAT_MTYPE_RESETCOUNTER:
- pgstat_recv_resetcounter(&msg.msg_resetcounter, len);
- break;
-
- case PGSTAT_MTYPE_RESETSHAREDCOUNTER:
- pgstat_recv_resetsharedcounter(
-   &msg.msg_resetsharedcounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_RESETSINGLECOUNTER:
- pgstat_recv_resetsinglecounter(
-   &msg.msg_resetsinglecounter,
-   len);
- break;
-
- case PGSTAT_MTYPE_AUTOVAC_START:
- pgstat_recv_autovac(&msg.msg_autovacuum_start, len);
- break;
-
- case PGSTAT_MTYPE_VACUUM:
- pgstat_recv_vacuum(&msg.msg_vacuum, len);
- break;
-
- case PGSTAT_MTYPE_ANALYZE:
- pgstat_recv_analyze(&msg.msg_analyze, len);
- break;
-
- case PGSTAT_MTYPE_ARCHIVER:
- pgstat_recv_archiver(&msg.msg_archiver, len);
- break;
-
- case PGSTAT_MTYPE_BGWRITER:
- pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
- break;
-
- case PGSTAT_MTYPE_FUNCSTAT:
- pgstat_recv_funcstat(&msg.msg_funcstat, len);
- break;
-
- case PGSTAT_MTYPE_FUNCPURGE:
- pgstat_recv_funcpurge(&msg.msg_funcpurge, len);
- break;
-
- case PGSTAT_MTYPE_RECOVERYCONFLICT:
- pgstat_recv_recoveryconflict(
- &msg.msg_recoveryconflict,
- len);
- break;
-
- case PGSTAT_MTYPE_DEADLOCK:
- pgstat_recv_deadlock(&msg.msg_deadlock, len);
- break;
-
- case PGSTAT_MTYPE_TEMPFILE:
- pgstat_recv_tempfile(&msg.msg_tempfile, len);
- break;
-
- case PGSTAT_MTYPE_CHECKSUMFAILURE:
- pgstat_recv_checksum_failure(
- &msg.msg_checksumfailure,
- len);
- break;
-
- default:
- break;
- }
- } /* end of inner message-processing loop */
-
- /* Sleep until there's something to do */
-#ifndef WIN32
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-   pgStatSock, -1L,
-   WAIT_EVENT_PGSTAT_MAIN);
-#else
+ if (dbent->generation == gen)
+ ret = dshash_attach(area, &dsh_tblparams, dbent->tables, 0);
+ else
+ {
+ Assert (dbent->generation == gen + 1);
+ Assert (dbent->prev_tables != DSM_HANDLE_INVALID);
+ ret = dshash_attach(area, &dsh_tblparams, dbent->prev_tables, 0);
+ }
+ LWLockRelease(&dbent->lock);
 
- /*
- * Windows, at least in its Windows Server 2003 R2 incarnation,
- * sometimes loses FD_READ events.  Waking up and retrying the recv()
- * fixes that, so don't sleep indefinitely.  This is a crock of the
- * first water, but until somebody wants to debug exactly what's
- * happening there, this is the best we can do.  The two-second
- * timeout matches our pre-9.2 behavior, and needs to be short enough
- * to not provoke "using stale statistics" complaints from
- * backend_read_statsfile.
- */
- wr = WaitLatchOrSocket(MyLatch,
-   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-   pgStatSock,
-   2 * 1000L /* msec */ ,
-   WAIT_EVENT_PGSTAT_MAIN);
-#endif
+ return ret;
+}
 
- /*
- * Emergency bailout if postmaster has died.  This is to avoid the
- * necessity for manual cleanup of all postmaster children.
- */
- if (wr & WL_POSTMASTER_DEATH)
- break;
- } /* end of outer loop */
+/* attach and return the specified generation of function hash */
+static dshash_table *
+attach_function_hash(PgStat_StatDBEntry *dbent, int gen)
+{
+ dshash_table *ret = NULL;
 
- /*
- * Save the final stats to reuse at next startup.
- */
- pgstat_write_statsfiles(true, true);
 
- exit(0);
-}
+ LWLockAcquire(&dbent->lock, LW_EXCLUSIVE);
 
+ if (dbent->generation == gen)
+ {
+ if (dbent->functions == DSM_HANDLE_INVALID)
+ {
+ dshash_table *funchash =
+ dshash_create(area, &dsh_funcparams, 0);
+ dbent->functions = dshash_get_hash_table_handle(funchash);
 
-/* SIGQUIT signal handler for collector process */
-static void
-pgstat_exit(SIGNAL_ARGS)
-{
- int save_errno = errno;
+ ret = funchash;
+ }
+ else
+ ret =  dshash_attach(area, &dsh_funcparams, dbent->functions, 0);
+ }
+ /* don't bother creating useless hash */
 
- need_exit = true;
- SetLatch(MyLatch);
+ LWLockRelease(&dbent->lock);
 
- errno = save_errno;
+ return  ret;
 }
 
-/* SIGHUP handler for collector process */
 static void
-pgstat_sighup_handler(SIGNAL_ARGS)
+init_dbentry(PgStat_StatDBEntry *dbentry)
 {
- int save_errno = errno;
-
- got_SIGHUP = true;
- SetLatch(MyLatch);
-
- errno = save_errno;
+ LWLockInitialize(&dbentry->lock, LWTRANCHE_STATS);
+ dbentry->generation = 0;
+ dbentry->refcnt = 0;
+ dbentry->prev_refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->prev_tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->prev_functions = DSM_HANDLE_INVALID;
 }
 
 /*
  * Subroutine to clear stats in a database entry
  *
- * Tables and functions hashes are initialized to empty.
+ * Reset all counters in the dbentry. The tables and functions dshashes are
+ * destroyed.  If any backend is pinning this dbentry, the current dshashes
+ * are stashed away as the previous "generation" until all accessors are
+ * gone. If the previous generation is already occupied, the current dshashes
+ * are so fresh that they don't need to be cleared.
  */
 static void
 reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 {
- HASHCTL hash_ctl;
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
 
  dbentry->n_xact_commit = 0;
  dbentry->n_xact_rollback = 0;
@@ -4742,130 +5071,118 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
  dbentry->n_block_read_time = 0;
  dbentry->n_block_write_time = 0;
 
- dbentry->stat_reset_timestamp = GetCurrentTimestamp();
- dbentry->stats_timestamp = 0;
-
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS);
-}
+ if (dbentry->refcnt == 0)
+ {
+ /*
+ * No one is referring to the current hash. It's very costly to remove
+ * dshash entries individually, so just destroy the whole thing.  If
+ * someone pins this entry just after this, pin_hashes() returns the
+ * current generation and the attach will happen after the following
+ * LWLock is released.
+ */
+ dshash_table *tbl;
 
-/*
- * Lookup the hash table entry for the specified database. If no hash
- * table entry exists, initialize it, if the create parameter is true.
- * Else, return NULL.
- */
-static PgStat_StatDBEntry *
-pgstat_get_db_entry(Oid databaseid, bool create)
-{
- PgStat_StatDBEntry *result;
- bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
+ if (dbentry->tables != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_destroy(tbl);
+ dbentry->tables = DSM_HANDLE_INVALID;
+ }
+ if (dbentry->functions != DSM_HANDLE_INVALID)
+ {
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_destroy(tbl);
+ dbentry->functions = DSM_HANDLE_INVALID;
+ }
+ }
+ else if (dbentry->prev_refcnt == 0)
+ {
+ /*
+ * Someone is still referring to the current hash and previous slot is
+ * vacant. Stash out the current hash to the previous slot.
+ */
+ dbentry->prev_refcnt = dbentry->refcnt;
+ dbentry->prev_tables = dbentry->tables;
+ dbentry->prev_functions = dbentry->functions;
+ dbentry->refcnt = 0;
+ dbentry->tables = DSM_HANDLE_INVALID;
+ dbentry->functions = DSM_HANDLE_INVALID;
+ dbentry->generation++;
+ }
+ else
+ {
+ Assert(dbentry->prev_refcnt > 0 && dbentry->refcnt > 0);
+ /*
+ * If we get here, we have just received another reset request while the
+ * old hashes are still waiting for all referrers to be released. That
+ * should last only a short time, so we can simply ignore this request.
+ *
+ * As a side effect, the resetter can see non-zero values before anyone
+ * updates them, but that is indistinguishable from someone updating
+ * them just before the read.
+ */
+ }
 
- /* Lookup or create the hash table entry for this database */
- result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
- &databaseid,
- action, &found);
+ /* Create new table hash if not exists */
+ if (dbentry->tables == DSM_HANDLE_INVALID)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
 
- if (!create && !found)
- return NULL;
+ /* Create new function hash if not exists and needed. */
+ if (dbentry->functions == DSM_HANDLE_INVALID &&
+ pgstat_track_functions != TRACK_FUNC_OFF)
+ {
+ dshash_table *tbl = dshash_create(area, &dsh_funcparams, 0);
+ dbentry->functions = dshash_get_hash_table_handle(tbl);
+ dshash_detach(tbl);
+ }
 
- /*
- * If not found, initialize the new one.  This creates empty hash tables
- * for tables and functions, too.
- */
- if (!found)
- reset_dbentry_counters(result);
+ dbentry->stat_reset_timestamp = GetCurrentTimestamp();
 
- return result;
+ LWLockRelease(&dbentry->lock);
 }
 
-
 /*
- * Lookup the hash table entry for the specified table. If no hash
- * table entry exists, initialize it, if the create parameter is true.
- * Else, return NULL.
+ * Create the filename for a DB stat file; filename is an output parameter
+ * pointing to a character buffer of length len.
  */
-static PgStat_StatTabEntry *
-pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+static void
+get_dbstat_filename(bool tempname, Oid databaseid, char *filename, int len)
 {
- PgStat_StatTabEntry *result;
- bool found;
- HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
-
- /* Lookup or create the hash table entry for this table */
- result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
- &tableoid,
- action, &found);
-
- if (!create && !found)
- return NULL;
-
- /* If not found, initialize the new one. */
- if (!found)
- {
- result->numscans = 0;
- result->tuples_returned = 0;
- result->tuples_fetched = 0;
- result->tuples_inserted = 0;
- result->tuples_updated = 0;
- result->tuples_deleted = 0;
- result->tuples_hot_updated = 0;
- result->n_live_tuples = 0;
- result->n_dead_tuples = 0;
- result->changes_since_analyze = 0;
- result->blocks_fetched = 0;
- result->blocks_hit = 0;
- result->vacuum_timestamp = 0;
- result->vacuum_count = 0;
- result->autovac_vacuum_timestamp = 0;
- result->autovac_vacuum_count = 0;
- result->analyze_timestamp = 0;
- result->analyze_count = 0;
- result->autovac_analyze_timestamp = 0;
- result->autovac_analyze_count = 0;
- }
+ int printed;
 
- return result;
+ /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
+ printed = snprintf(filename, len, "%s/db_%u.%s",
+   PGSTAT_STAT_PERMANENT_DIRECTORY,
+   databaseid,
+   tempname ? "tmp" : "stat");
+ if (printed >= len)
+ elog(ERROR, "overlength pgstat path");
 }
 
-
 /* ----------
  * pgstat_write_statsfiles() -
- * Write the global statistics file, as well as requested DB files.
- *
- * 'permanent' specifies writing to the permanent files not temporary ones.
- * When true (happens only when the collector is shutting down), also remove
- * the temporary files so that backends starting up under a new postmaster
- * can't read old data before the new collector is ready.
- *
- * When 'allDbs' is false, only the requested databases (listed in
- * pending_write_requests) will be written; otherwise, all databases
- * will be written.
+ * Write the global statistics file, as well as DB files.
  * ----------
  */
-static void
-pgstat_write_statsfiles(bool permanent, bool allDbs)
+void
+pgstat_write_statsfiles(void)
 {
- HASH_SEQ_STATUS hstat;
+ dshash_seq_status hstat;
  PgStat_StatDBEntry *dbentry;
  FILE   *fpout;
  int32 format_id;
- const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+ const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
  int rc;
 
+ /* stats is not initialized yet. just return. */
+ if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID)
+ return;
+
  elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
  /*
@@ -4884,7 +5201,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
  /*
  * Set the timestamp of the stats file.
  */
- globalStats.stats_timestamp = GetCurrentTimestamp();
+ shared_globalStats->stats_timestamp = GetCurrentTimestamp();
 
  /*
  * Write the file header --- currently just a format ID.
@@ -4896,39 +5213,37 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
  /*
  * Write global stats struct
  */
- rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+ rc = fwrite(shared_globalStats, sizeof(*shared_globalStats), 1, fpout);
  (void) rc; /* we'll check for error with ferror */
 
  /*
  * Write archiver stats struct
  */
- rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
+ rc = fwrite(shared_archiverStats, sizeof(*shared_archiverStats), 1, fpout);
  (void) rc; /* we'll check for error with ferror */
 
  /*
  * Walk through the database table.
  */
- hash_seq_init(&hstat, pgStatDBHash);
- while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+ dshash_seq_init(&hstat, pgStatDBHash, false, false);
+ while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
  {
  /*
  * Write out the table and function stats for this DB into the
  * appropriate per-DB stat file, if required.
  */
- if (allDbs || pgstat_db_requested(dbentry->databaseid))
- {
- /* Make DB's timestamp consistent with the global stats */
- dbentry->stats_timestamp = globalStats.stats_timestamp;
+ /* Make DB's timestamp consistent with the global stats */
+ dbentry->stats_timestamp = shared_globalStats->stats_timestamp;
 
- pgstat_write_db_statsfile(dbentry, permanent);
- }
+ pgstat_write_pgStatDBHashfile(dbentry);
 
  /*
  * Write out the DB entry. We don't write the tables or functions
  * pointers, since they're of no use to any other process.
  */
  fputc('D', fpout);
- rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
+ rc = fwrite(dbentry,
+ offsetof(PgStat_StatDBEntry, generation), 1, fpout);
  (void) rc; /* we'll check for error with ferror */
  }
 
@@ -4964,53 +5279,18 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
  tmpfile, statfile)));
  unlink(tmpfile);
  }
-
- if (permanent)
- unlink(pgstat_stat_filename);
-
- /*
- * Now throw away the list of requests.  Note that requests sent after we
- * started the write are still waiting on the network socket.
- */
- list_free(pending_write_requests);
- pending_write_requests = NIL;
-}
-
-/*
- * return the filename for a DB stat file; filename is the output buffer,
- * of length len.
- */
-static void
-get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
- char *filename, int len)
-{
- int printed;
-
- /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
- printed = snprintf(filename, len, "%s/db_%u.%s",
-   permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
-   pgstat_stat_directory,
-   databaseid,
-   tempname ? "tmp" : "stat");
- if (printed >= len)
- elog(ERROR, "overlength pgstat path");
 }
 
 /* ----------
- * pgstat_write_db_statsfile() -
+ * pgstat_write_pgStatDBHashfile() -
  * Write the stat file for a single database.
- *
- * If writing to the permanent file (happens when the collector is
- * shutting down only), remove the temporary file so that backends
- * starting up under a new postmaster can't read the old data before
- * the new collector is ready.
  * ----------
  */
 static void
-pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
+pgstat_write_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
 {
- HASH_SEQ_STATUS tstat;
- HASH_SEQ_STATUS fstat;
+ dshash_seq_status tstat;
+ dshash_seq_status fstat;
  PgStat_StatTabEntry *tabentry;
  PgStat_StatFuncEntry *funcentry;
  FILE   *fpout;
@@ -5019,9 +5299,10 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
  int rc;
  char tmpfile[MAXPGPATH];
  char statfile[MAXPGPATH];
+ dshash_table *tbl;
 
- get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
- get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
+ get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH);
+ get_dbstat_filename(false, dbid, statfile, MAXPGPATH);
 
  elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
@@ -5048,23 +5329,30 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
  /*
  * Walk through the database's access stats per table.
  */
- hash_seq_init(&tstat, dbentry->tables);
- while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
+ tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+ dshash_seq_init(&tstat, tbl, false, false);
+ while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL)
  {
  fputc('T', fpout);
  rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
  (void) rc; /* we'll check for error with ferror */
  }
+ dshash_detach(tbl);
 
  /*
  * Walk through the database's function stats table.
  */
- hash_seq_init(&fstat, dbentry->functions);
- while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+ if (dbentry->functions != DSM_HANDLE_INVALID)
  {
- fputc('F', fpout);
- rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
- (void) rc; /* we'll check for error with ferror */
+ tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+ dshash_seq_init(&fstat, tbl, false, false);
+ while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL)
+ {
+ fputc('F', fpout);
+ rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
+ (void) rc; /* we'll check for error with ferror */
+ }
+ dshash_detach(tbl);
  }
 
  /*
@@ -5099,76 +5387,37 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
  tmpfile, statfile)));
  unlink(tmpfile);
  }
-
- if (permanent)
- {
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
- unlink(statfile);
- }
 }
 
 /* ----------
  * pgstat_read_statsfiles() -
  *
- * Reads in some existing statistics collector files and returns the
- * databases hash table that is the top level of the data.
- *
- * If 'onlydb' is not InvalidOid, it means we only want data for that DB
- * plus the shared catalogs ("DB 0").  We'll still populate the DB hash
- * table for all databases, but we don't bother even creating table/function
- * hash tables for other databases.
- *
- * 'permanent' specifies reading from the permanent files not temporary ones.
- * When true (happens only when the collector is starting up), remove the
- * files after reading; the in-memory status is now authoritative, and the
- * files would be out of date in case somebody else reads them.
+ * Reads in existing statistics collector files into the shared stats hash.
  *
- * If a 'deep' read is requested, table/function stats are read, otherwise
- * the table/function hash tables remain empty.
  * ----------
  */
-static HTAB *
-pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
+void
+pgstat_read_statsfiles(void)
 {
  PgStat_StatDBEntry *dbentry;
  PgStat_StatDBEntry dbbuf;
- HASHCTL hash_ctl;
- HTAB   *dbhash;
  FILE   *fpin;
  int32 format_id;
  bool found;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+ const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+
+ /* shouldn't be called from the postmaster */
+ Assert(IsUnderPostmaster);
+
+ elog(DEBUG2, "reading stats file \"%s\"", statfile);
 
  /*
- * The tables will live in pgStatLocalContext.
+ * Set the current timestamp (will be kept only in case we can't load an
+ * existing statsfile).
  */
- pgstat_setup_memcxt();
-
- /*
- * Create the DB hashtable
- */
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- /*
- * Clear out global and archiver statistics so they start from zero in
- * case we can't load an existing statsfile.
- */
- memset(&globalStats, 0, sizeof(globalStats));
- memset(&archiverStats, 0, sizeof(archiverStats));
-
- /*
- * Set the current timestamp (will be kept only in case we can't load an
- * existing statsfile).
- */
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
+ shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
+ shared_archiverStats->stat_reset_timestamp =
+ shared_globalStats->stat_reset_timestamp;
 
  /*
  * Try to open the stats file. If it doesn't exist, the backends simply
@@ -5182,11 +5431,11 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
  {
  if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errcode_for_file_access(),
  errmsg("could not open statistics file \"%s\": %m",
  statfile)));
- return dbhash;
+ return;
  }
 
  /*
@@ -5195,7 +5444,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
  format_id != PGSTAT_FILE_FORMAT_ID)
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"", statfile)));
  goto done;
  }
@@ -5203,32 +5452,24 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
  /*
  * Read global stats struct
  */
- if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+ if (fread(shared_globalStats, 1, sizeof(*shared_globalStats), fpin) !=
+ sizeof(*shared_globalStats))
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&globalStats, 0, sizeof(globalStats));
+ MemSet(shared_globalStats, 0, sizeof(*shared_globalStats));
  goto done;
  }
 
- /*
- * In the collector, disregard the timestamp we read from the permanent
- * stats file; we should be willing to write a temp stats file immediately
- * upon the first request from any backend.  This only matters if the old
- * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
- * an unusual scenario.
- */
- if (pgStatRunningInCollector)
- globalStats.stats_timestamp = 0;
-
  /*
  * Read archiver stats struct
  */
- if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
+ if (fread(shared_archiverStats, 1, sizeof(*shared_archiverStats), fpin) !=
+ sizeof(*shared_archiverStats))
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"", statfile)));
- memset(&archiverStats, 0, sizeof(archiverStats));
+ MemSet(shared_archiverStats, 0, sizeof(*shared_archiverStats));
  goto done;
  }
 
@@ -5245,10 +5486,10 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
  * follows.
  */
  case 'D':
- if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
+ if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, generation),
+  fpin) != offsetof(PgStat_StatDBEntry, generation))
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
@@ -5257,76 +5498,36 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
  /*
  * Add to the DB hash
  */
- dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
- (void *) &dbbuf.databaseid,
- HASH_ENTER,
- &found);
+ dbentry = (PgStat_StatDBEntry *)
+ dshash_find_or_insert(pgStatDBHash, (void *) &dbbuf.databaseid,
+  &found);
+
+ /* don't allow duplicate dbentries */
  if (found)
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ dshash_release_lock(pgStatDBHash, dbentry);
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
  }
 
- memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * In the collector, disregard the timestamp we read from the
- * permanent stats file; we should be willing to write a temp
- * stats file immediately upon the first request from any
- * backend.
- */
- if (pgStatRunningInCollector)
- dbentry->stats_timestamp = 0;
-
- /*
- * Don't create tables/functions hashtables for uninteresting
- * databases.
- */
- if (onlydb != InvalidOid)
- {
- if (dbbuf.databaseid != onlydb &&
- dbbuf.databaseid != InvalidOid)
- break;
- }
-
- memset(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->tables = hash_create("Per-database table",
-  PGSTAT_TAB_HASH_SIZE,
-  &hash_ctl,
-  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
- hash_ctl.keysize = sizeof(Oid);
- hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
- hash_ctl.hcxt = pgStatLocalContext;
- dbentry->functions = hash_create("Per-database function",
- PGSTAT_FUNCTION_HASH_SIZE,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ /* initialize the new shared entry */
+ init_dbentry(dbentry);
 
- /*
- * If requested, read the data from the database-specific
- * file.  Otherwise we just leave the hashtables empty.
- */
- if (deep)
- pgstat_read_db_statsfile(dbentry->databaseid,
- dbentry->tables,
- dbentry->functions,
- permanent);
+ memcpy(dbentry, &dbbuf,
+   offsetof(PgStat_StatDBEntry, generation));
 
+ /* Read the data from the database-specific file. */
+ pgstat_read_pgStatDBHashfile(dbentry);
+ dshash_release_lock(pgStatDBHash, dbentry);
  break;
 
  case 'E':
  goto done;
 
  default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
@@ -5336,45 +5537,35 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 done:
  FreeFile(fpin);
 
- /* If requested to read the permanent file, also get rid of it. */
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
 
- return dbhash;
+ return;
 }
 
 
 /* ----------
- * pgstat_read_db_statsfile() -
+ * pgstat_read_pgStatDBHashfile() -
  *
- * Reads in the existing statistics collector file for the given database,
- * filling the passed-in tables and functions hash tables.
- *
- * As in pgstat_read_statsfiles, if the permanent file is requested, it is
- * removed after reading.
- *
- * Note: this code has the ability to skip storing per-table or per-function
- * data, if NULL is passed for the corresponding hashtable.  That's not used
- * at the moment though.
+ * Reads in the at-rest statistics file and creates shared statistics
+ * tables. The file is removed after reading.
  * ----------
  */
 static void
-pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
- bool permanent)
+pgstat_read_pgStatDBHashfile(PgStat_StatDBEntry *dbentry)
 {
  PgStat_StatTabEntry *tabentry;
  PgStat_StatTabEntry tabbuf;
  PgStat_StatFuncEntry funcbuf;
  PgStat_StatFuncEntry *funcentry;
+ dshash_table *tabhash = NULL;
+ dshash_table *funchash = NULL;
  FILE   *fpin;
  int32 format_id;
  bool found;
  char statfile[MAXPGPATH];
 
- get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
+ get_dbstat_filename(false, dbentry->databaseid, statfile, MAXPGPATH);
 
  /*
  * Try to open the stats file. If it doesn't exist, the backends simply
@@ -5388,7 +5579,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
  {
  if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errcode_for_file_access(),
  errmsg("could not open statistics file \"%s\": %m",
  statfile)));
@@ -5401,14 +5592,14 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
  format_id != PGSTAT_FILE_FORMAT_ID)
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"", statfile)));
  goto done;
  }
 
  /*
- * We found an existing collector stats file. Read it and put all the
- * hashtable entries into place.
+ * We found an existing statistics file. Read it and put all the hashtable
+ * entries into place.
  */
  for (;;)
  {
@@ -5421,31 +5612,35 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
   fpin) != sizeof(PgStat_StatTabEntry))
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
  }
 
- /*
- * Skip if table data not wanted.
- */
  if (tabhash == NULL)
- break;
+ {
+ tabhash = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->tables =
+ dshash_get_hash_table_handle(tabhash);
+ }
 
- tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-   (void *) &tabbuf.tableid,
-   HASH_ENTER, &found);
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(tabhash,
+  (void *) &tabbuf.tableid, &found);
 
+ /* don't allow duplicate entries */
  if (found)
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ dshash_release_lock(tabhash, tabentry);
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
  }
 
  memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+ dshash_release_lock(tabhash, tabentry);
  break;
 
  /*
@@ -5455,31 +5650,34 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
   fpin) != sizeof(PgStat_StatFuncEntry))
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
  }
 
- /*
- * Skip if function data not wanted.
- */
  if (funchash == NULL)
- break;
+ {
+ funchash = dshash_create(area, &dsh_tblparams, 0);
+ dbentry->functions =
+ dshash_get_hash_table_handle(funchash);
+ }
 
- funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
- (void *) &funcbuf.functionid,
- HASH_ENTER, &found);
+ funcentry = (PgStat_StatFuncEntry *)
+ dshash_find_or_insert(funchash,
+  (void *) &funcbuf.functionid, &found);
 
  if (found)
  {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ dshash_release_lock(funchash, funcentry);
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
  }
 
  memcpy(funcentry, &funcbuf, sizeof(funcbuf));
+ dshash_release_lock(funchash, funcentry);
  break;
 
  /*
@@ -5489,7 +5687,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  goto done;
 
  default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
+ ereport(LOG,
  (errmsg("corrupted statistics file \"%s\"",
  statfile)));
  goto done;
@@ -5497,295 +5695,39 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
  }
 
 done:
- FreeFile(fpin);
-
- if (permanent)
- {
- elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
- unlink(statfile);
- }
-}
-
-/* ----------
- * pgstat_read_db_statsfile_timestamp() -
- *
- * Attempt to determine the timestamp of the last db statfile write.
- * Returns true if successful; the timestamp is stored in *ts.
- *
- * This needs to be careful about handling databases for which no stats file
- * exists, such as databases without a stat entry or those not yet written:
- *
- * - if there's a database entry in the global file, return the corresponding
- * stats_timestamp value.
- *
- * - if there's no db stat entry (e.g. for a new or inactive database),
- * there's no stats_timestamp value, but also nothing to write so we return
- * the timestamp of the global statfile.
- * ----------
- */
-static bool
-pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
-   TimestampTz *ts)
-{
- PgStat_StatDBEntry dbentry;
- PgStat_GlobalStats myGlobalStats;
- PgStat_ArchiverStats myArchiverStats;
- FILE   *fpin;
- int32 format_id;
- const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
- /*
- * Try to open the stats file.  As above, anything but ENOENT is worthy of
- * complaining about.
- */
- if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
- {
- if (errno != ENOENT)
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errcode_for_file_access(),
- errmsg("could not open statistics file \"%s\": %m",
- statfile)));
- return false;
- }
-
- /*
- * Verify it's of the expected format.
- */
- if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
- format_id != PGSTAT_FILE_FORMAT_ID)
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read global stats struct
- */
- if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-  fpin) != sizeof(myGlobalStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /*
- * Read archiver stats struct
- */
- if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-  fpin) != sizeof(myArchiverStats))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"", statfile)));
- FreeFile(fpin);
- return false;
- }
-
- /* By default, we're going to return the timestamp of the global file. */
- *ts = myGlobalStats.stats_timestamp;
-
- /*
- * We found an existing collector stats file.  Read it and look for a
- * record for the requested database.  If found, use its timestamp.
- */
- for (;;)
- {
- switch (fgetc(fpin))
- {
- /*
- * 'D' A PgStat_StatDBEntry struct describing a database
- * follows.
- */
- case 'D':
- if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-  fpin) != offsetof(PgStat_StatDBEntry, tables))
- {
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
-
- /*
- * If this is the DB we're looking for, save its timestamp and
- * we're done.
- */
- if (dbentry.databaseid == databaseid)
- {
- *ts = dbentry.stats_timestamp;
- goto done;
- }
+ if (tabhash)
+ dshash_detach(tabhash);
+ if (funchash)
+ dshash_detach(funchash);
 
- break;
-
- case 'E':
- goto done;
-
- default:
- ereport(pgStatRunningInCollector ? LOG : WARNING,
- (errmsg("corrupted statistics file \"%s\"",
- statfile)));
- goto done;
- }
- }
-
-done:
  FreeFile(fpin);
- return true;
-}
-
-/*
- * If not already done, read the statistics collector stats file into
- * some hash tables.  The results will be kept until pgstat_clear_snapshot()
- * is called (typically, at end of transaction).
- */
-static void
-backend_read_statsfile(void)
-{
- TimestampTz min_ts = 0;
- TimestampTz ref_ts = 0;
- Oid inquiry_db;
- int count;
-
- /* already read it? */
- if (pgStatDBHash)
- return;
- Assert(!pgStatRunningInCollector);
-
- /*
- * In a normal backend, we check staleness of the data for our own DB, and
- * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
- * check staleness of the shared-catalog data, and send InvalidOid in
- * inquiry messages so as not to force writing unnecessary data.
- */
- if (IsAutoVacuumLauncherProcess())
- inquiry_db = InvalidOid;
- else
- inquiry_db = MyDatabaseId;
-
- /*
- * Loop until fresh enough stats file is available or we ran out of time.
- * The stats inquiry message is sent repeatedly in case collector drops
- * it; but not every single time, as that just swamps the collector.
- */
- for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
- {
- bool ok;
- TimestampTz file_ts = 0;
- TimestampTz cur_ts;
-
- CHECK_FOR_INTERRUPTS();
 
- ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
-
- cur_ts = GetCurrentTimestamp();
- /* Calculate min acceptable timestamp, if we didn't already */
- if (count == 0 || cur_ts < ref_ts)
- {
- /*
- * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
- * msec before now.  This indirectly ensures that the collector
- * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
- * an autovacuum worker, however, we want a lower delay to avoid
- * using stale data, so we use PGSTAT_RETRY_DELAY (since the
- * number of workers is low, this shouldn't be a problem).
- *
- * We don't recompute min_ts after sleeping, except in the
- * unlikely case that cur_ts went backwards.  So we might end up
- * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In
- * practice that shouldn't happen, though, as long as the sleep
- * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
- * tell the collector that our cutoff time is less than what we'd
- * actually accept.
- */
- ref_ts = cur_ts;
- if (IsAutoVacuumWorkerProcess())
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_RETRY_DELAY);
- else
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_STAT_INTERVAL);
- }
-
- /*
- * If the file timestamp is actually newer than cur_ts, we must have
- * had a clock glitch (system time went backwards) or there is clock
- * skew between our processor and the stats collector's processor.
- * Accept the file, but send an inquiry message anyway to make
- * pgstat_recv_inquiry do a sanity check on the collector's time.
- */
- if (ok && file_ts > cur_ts)
- {
- /*
- * A small amount of clock skew between processors isn't terribly
- * surprising, but a large difference is worth logging.  We
- * arbitrarily define "large" as 1000 msec.
- */
- if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
- {
- char   *filetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- filetime = pstrdup(timestamptz_to_str(file_ts));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG, "stats collector's time %s is later than backend local time %s",
- filetime, mytime);
- pfree(filetime);
- pfree(mytime);
- }
-
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
- break;
- }
-
- /* Normal acceptance case: file is not older than cutoff time */
- if (ok && file_ts >= min_ts)
- break;
-
- /* Not there or too old, so kick the collector and wait a bit */
- if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
- pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-
- pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
- }
-
- if (count >= PGSTAT_POLL_LOOP_COUNT)
- ereport(LOG,
- (errmsg("using stale statistics instead of current ones "
- "because stats collector is not responding")));
-
- /*
- * Autovacuum launcher wants stats about all databases, but a shallow read
- * is sufficient.  Regular backends want a deep read for just the tables
- * they can see (MyDatabaseId + shared catalogs).
- */
- if (IsAutoVacuumLauncherProcess())
- pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
- else
- pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
+ elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+ unlink(statfile);
 }
 
-
 /* ----------
  * pgstat_setup_memcxt() -
  *
- * Create pgStatLocalContext, if not already done.
+ * Create pgStatLocalContext and pgStatSnapshotContext, if not already done.
  * ----------
  */
 static void
 pgstat_setup_memcxt(void)
 {
  if (!pgStatLocalContext)
- pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
-   "Statistics snapshot",
-   ALLOCSET_SMALL_SIZES);
+ pgStatLocalContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Backend statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
+
+ if (!pgStatSnapshotContext)
+ pgStatSnapshotContext =
+ AllocSetContextCreate(TopMemoryContext,
+  "Database statistics snapshot",
+  ALLOCSET_SMALL_SIZES);
 }
 
-
 /* ----------
  * pgstat_clear_snapshot() -
  *
@@ -5801,739 +5743,223 @@ pgstat_clear_snapshot(void)
 {
  /* Release memory, if any was allocated */
  if (pgStatLocalContext)
+ {
  MemoryContextDelete(pgStatLocalContext);
 
- /* Reset variables */
- pgStatLocalContext = NULL;
- pgStatDBHash = NULL;
- localBackendStatusTable = NULL;
- localNumBackends = 0;
-}
+ /* Reset variables */
+ pgStatLocalContext = NULL;
+ localBackendStatusTable = NULL;
+ localNumBackends = 0;
+ }
 
+ if (pgStatSnapshotContext)
+ clear_snapshot  = true;
+}
 
-/* ----------
- * pgstat_recv_inquiry() -
- *
- * Process stat inquiry requests.
- * ----------
- */
-static void
-pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
+static bool
+pgstat_update_tabentry(dshash_table *tabhash, PgStat_TableStatus *stat,
+   bool nowait)
 {
- PgStat_StatDBEntry *dbentry;
+ PgStat_StatTabEntry *tabentry;
+ bool found;
 
- elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
+ if (tabhash == NULL)
+ return false;
 
- /*
- * If there's already a write request for this DB, there's nothing to do.
- *
- * Note that if a request is found, we return early and skip the below
- * check for clock skew.  This is okay, since the only way for a DB
- * request to be present in the list is that we have been here since the
- * last write round.  It seems sufficient to check for clock skew once per
- * write round.
- */
- if (list_member_oid(pending_write_requests, msg->databaseid))
- return;
+ tabentry = (PgStat_StatTabEntry *)
+ dshash_find_or_insert_extended(tabhash, (void *) &(stat->t_id),
+   &found, nowait);
 
- /*
- * Check to see if we last wrote this database at a time >= the requested
- * cutoff time.  If so, this is a stale request that was generated before
- * we updated the DB file, and we don't need to do so again.
- *
- * If the requestor's local clock time is older than stats_timestamp, we
- * should suspect a clock glitch, ie system time going backwards; though
- * the more likely explanation is just delayed message receipt.  It is
- * worth expending a GetCurrentTimestamp call to be sure, since a large
- * retreat in the system clock reading could otherwise cause us to neglect
- * to update the stats file for a long time.
- */
- dbentry = pgstat_get_db_entry(msg->databaseid, false);
- if (dbentry == NULL)
+ /* failed to acquire lock */
+ if (tabentry == NULL)
+ return false;
+
+ if (!found)
  {
  /*
- * We have no data for this DB.  Enter a write request anyway so that
- * the global stats will get updated.  This is needed to prevent
- * backend_read_statsfile from waiting for data that we cannot supply,
- * in the case of a new DB that nobody has yet reported any stats for.
- * See the behavior of pgstat_read_db_statsfile_timestamp.
+ * If it's a new table entry, initialize counters to the values we
+ * just got.
  */
+ tabentry->numscans = stat->t_counts.t_numscans;
+ tabentry->tuples_returned = stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched = stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted = stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated = stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted = stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated = stat->t_counts.t_tuples_hot_updated;
+ tabentry->n_live_tuples = stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples = stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze = stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched = stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit = stat->t_counts.t_blocks_hit;
+
+ tabentry->vacuum_timestamp = 0;
+ tabentry->vacuum_count = 0;
+ tabentry->autovac_vacuum_timestamp = 0;
+ tabentry->autovac_vacuum_count = 0;
+ tabentry->analyze_timestamp = 0;
+ tabentry->analyze_count = 0;
+ tabentry->autovac_analyze_timestamp = 0;
+ tabentry->autovac_analyze_count = 0;
  }
- else if (msg->clock_time < dbentry->stats_timestamp)
+ else
  {
- TimestampTz cur_ts = GetCurrentTimestamp();
-
- if (cur_ts < dbentry->stats_timestamp)
- {
- /*
- * Sure enough, time went backwards.  Force a new stats file write
- * to get back in sync; but first, log a complaint.
- */
- char   *writetime;
- char   *mytime;
-
- /* Copy because timestamptz_to_str returns a static buffer */
- writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
- mytime = pstrdup(timestamptz_to_str(cur_ts));
- elog(LOG,
- "stats_timestamp %s is later than collector's time %s for database %u",
- writetime, mytime, dbentry->databaseid);
- pfree(writetime);
- pfree(mytime);
- }
- else
+ /*
+ * Otherwise add the values to the existing entry.
+ */
+ tabentry->numscans += stat->t_counts.t_numscans;
+ tabentry->tuples_returned += stat->t_counts.t_tuples_returned;
+ tabentry->tuples_fetched += stat->t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted += stat->t_counts.t_tuples_inserted;
+ tabentry->tuples_updated += stat->t_counts.t_tuples_updated;
+ tabentry->tuples_deleted += stat->t_counts.t_tuples_deleted;
+ tabentry->tuples_hot_updated += stat->t_counts.t_tuples_hot_updated;
+ /* If table was truncated, first reset the live/dead counters */
+ if (stat->t_counts.t_truncated)
  {
- /*
- * Nope, it's just an old request.  Assuming msg's clock_time is
- * >= its cutoff_time, it must be stale, so we can ignore it.
- */
- return;
+ tabentry->n_live_tuples = 0;
+ tabentry->n_dead_tuples = 0;
  }
- }
- else if (msg->cutoff_time <= dbentry->stats_timestamp)
- {
- /* Stale request, ignore it */
- return;
+ tabentry->n_live_tuples += stat->t_counts.t_delta_live_tuples;
+ tabentry->n_dead_tuples += stat->t_counts.t_delta_dead_tuples;
+ tabentry->changes_since_analyze += stat->t_counts.t_changed_tuples;
+ tabentry->blocks_fetched += stat->t_counts.t_blocks_fetched;
+ tabentry->blocks_hit += stat->t_counts.t_blocks_hit;
  }
 
- /*
- * We need to write this DB, so create a request.
- */
- pending_write_requests = lappend_oid(pending_write_requests,
- msg->databaseid);
-}
+ /* Clamp n_live_tuples in case of negative delta_live_tuples */
+ tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+ /* Likewise for n_dead_tuples */
+ tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
 
+ dshash_release_lock(tabhash, tabentry);
+
+ return true;
+}
 
-/* ----------
- * pgstat_recv_tabstat() -
- *
- * Count what the backend has done.
- * ----------
- */
 static void
-pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
+pgstat_update_dbentry(PgStat_StatDBEntry *dbentry, PgStat_TableStatus *stat)
 {
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
  /*
- * Update database-wide stats.
+ * Add per-table stats to the per-database entry, too.
  */
- dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
- dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
- dbentry->n_block_read_time += msg->m_block_read_time;
- dbentry->n_block_write_time += msg->m_block_write_time;
+ LWLockAcquire(&dbentry->lock, LW_EXCLUSIVE);
+ dbentry->n_tuples_returned += stat->t_counts.t_tuples_returned;
+ dbentry->n_tuples_fetched += stat->t_counts.t_tuples_fetched;
+ dbentry->n_tuples_inserted += stat->t_counts.t_tuples_inserted;
+ dbentry->n_tuples_updated += stat->t_counts.t_tuples_updated;
+ dbentry->n_tuples_deleted += stat->t_counts.t_tuples_deleted;
+ dbentry->n_blocks_fetched += stat->t_counts.t_blocks_fetched;
+ dbentry->n_blocks_hit += stat->t_counts.t_blocks_hit;
+ LWLockRelease(&dbentry->lock);
+}
 
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
+/*
+ * Look up the shared stats hash table entry for the specified database.
+ * Returns NULL when PGSTAT_NOWAIT is set and the required lock cannot be acquired.
+ */
+static PgStat_StatDBEntry *
+pgstat_get_db_entry(Oid databaseid, int op, PgStat_TableLookupResult *status)
+{
+ PgStat_StatDBEntry *result;
+ bool nowait = ((op & PGSTAT_NOWAIT) != 0);
+ bool lock_acquired = true;
+ bool found = true;
 
- tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-   (void *) &(tabmsg->t_id),
-   HASH_ENTER, &found);
+ if (!IsUnderPostmaster || !pgStatDBHash)
+ return NULL;
 
- if (!found)
- {
- /*
- * If it's a new table entry, initialize counters to the values we
- * just got.
- */
- tabentry->numscans = tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated;
- tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit;
-
- tabentry->vacuum_timestamp = 0;
- tabentry->vacuum_count = 0;
- tabentry->autovac_vacuum_timestamp = 0;
- tabentry->autovac_vacuum_count = 0;
- tabentry->analyze_timestamp = 0;
- tabentry->analyze_count = 0;
- tabentry->autovac_analyze_timestamp = 0;
- tabentry->autovac_analyze_count = 0;
- }
- else
+ /* Lookup or create the hash table entry for this database */
+ if (op & PGSTAT_EXCLUSIVE)
+ {
+ result = (PgStat_StatDBEntry *)
+ dshash_find_or_insert_extended(pgStatDBHash, &databaseid,
+   &found, nowait);
+ if (result == NULL)
+ lock_acquired = false;
+ else if (!found)
  {
  /*
- * Otherwise add the values to the existing entry.
+ * If not found, initialize the new one.  This also creates the empty
+ * per-database hash tables.
  */
- tabentry->numscans += tabmsg->t_counts.t_numscans;
- tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned;
- tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated;
- tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated;
- /* If table was truncated, first reset the live/dead counters */
- if (tabmsg->t_counts.t_truncated)
- {
- tabentry->n_live_tuples = 0;
- tabentry->n_dead_tuples = 0;
- }
- tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples;
- tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples;
- tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples;
- tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit;
+ init_dbentry(result);
+ reset_dbentry_counters(result);
  }
-
- /* Clamp n_live_tuples in case of negative delta_live_tuples */
- tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
- /* Likewise for n_dead_tuples */
- tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
-
- /*
- * Add per-table stats to the per-database entry, too.
- */
- dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned;
- dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
- dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
- dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated;
- dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
- dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
- dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
- }
-}
-
-
-/* ----------
- * pgstat_recv_tabpurge() -
- *
- * Arrange for dead table removal.
- * ----------
- */
-static void
-pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- int i;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->tables)
- return;
-
- /*
- * Process all table entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
- {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->tables,
-   (void *) &(msg->m_tableid[i]),
-   HASH_REMOVE, NULL);
- }
-}
-
-
-/* ----------
- * pgstat_recv_dropdb() -
- *
- * Arrange for dead database removal
- * ----------
- */
-static void
-pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
-{
- Oid dbid = msg->m_databaseid;
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.
- */
- dbentry = pgstat_get_db_entry(dbid, false);
-
- /*
- * If found, remove it (along with the db statfile).
- */
- if (dbentry)
- {
- char statfile[MAXPGPATH];
-
- get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
- elog(DEBUG2, "removing stats file \"%s\"", statfile);
- unlink(statfile);
-
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- if (hash_search(pgStatDBHash,
- (void *) &dbid,
- HASH_REMOVE, NULL) == NULL)
- ereport(ERROR,
- (errmsg("database hash table corrupted during cleanup --- abort")));
- }
-}
-
-
-/* ----------
- * pgstat_recv_resetcounter() -
- *
- * Reset the statistics for the specified database.
- * ----------
- */
-static void
-pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Lookup the database in the hashtable.  Nothing to do if not there.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /*
- * We simply throw away all the database's table entries by recreating a
- * new hash table for them.
- */
- if (dbentry->tables != NULL)
- hash_destroy(dbentry->tables);
- if (dbentry->functions != NULL)
- hash_destroy(dbentry->functions);
-
- dbentry->tables = NULL;
- dbentry->functions = NULL;
-
- /*
- * Reset database-level stats, too.  This creates empty hash tables for
- * tables and functions.
- */
- reset_dbentry_counters(dbentry);
-}
-
-/* ----------
- * pgstat_recv_resetsharedcounter() -
- *
- * Reset some shared statistics of the cluster.
- * ----------
- */
-static void
-pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
-{
- if (msg->m_resettarget == RESET_BGWRITER)
- {
- /* Reset the global background writer statistics for the cluster. */
- memset(&globalStats, 0, sizeof(globalStats));
- globalStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
- else if (msg->m_resettarget == RESET_ARCHIVER)
- {
- /* Reset the archiver statistics for the cluster. */
- memset(&archiverStats, 0, sizeof(archiverStats));
- archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
- }
-
- /*
- * Presumably the sender of this message validated the target, don't
- * complain here if it's not valid
- */
-}
-
-/* ----------
- * pgstat_recv_resetsinglecounter() -
- *
- * Reset a statistics for a single object
- * ----------
- */
-static void
-pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
- if (!dbentry)
- return;
-
- /* Set the reset timestamp for the whole database */
- dbentry->stat_reset_timestamp = GetCurrentTimestamp();
-
- /* Remove object if it exists, ignore it if not */
- if (msg->m_resettype == RESET_TABLE)
- (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
- else if (msg->m_resettype == RESET_FUNCTION)
- (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-   HASH_REMOVE, NULL);
-}
-
-/* ----------
- * pgstat_recv_autovac() -
- *
- * Process an autovacuum signalling message.
- * ----------
- */
-static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- /*
- * Store the last autovacuum time in the database's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->last_autovac_time = msg->m_start_time;
-}
-
-/* ----------
- * pgstat_recv_vacuum() -
- *
- * Process a VACUUM message.
- * ----------
- */
-static void
-pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- if (msg->m_autovacuum)
- {
- tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
- tabentry->autovac_vacuum_count++;
  }
  else
  {
- tabentry->vacuum_timestamp = msg->m_vacuumtime;
- tabentry->vacuum_count++;
+ result = (PgStat_StatDBEntry *)
+ dshash_find_extended(pgStatDBHash, &databaseid, true, nowait,
+ nowait ? &lock_acquired : NULL);
+ if (result == NULL)
+ found = false;
  }
-}
 
-/* ----------
- * pgstat_recv_analyze() -
- *
- * Process an ANALYZE message.
- * ----------
- */
-static void
-pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
- PgStat_StatTabEntry *tabentry;
-
- /*
- * Store the data in the table's hashtable entry.
- */
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
-
- tabentry->n_live_tuples = msg->m_live_tuples;
- tabentry->n_dead_tuples = msg->m_dead_tuples;
-
- /*
- * If commanded, reset changes_since_analyze to zero.  This forgets any
- * changes that were committed while the ANALYZE was in progress, but we
- * have no good way to estimate how many of those there were.
- */
- if (msg->m_resetcounter)
- tabentry->changes_since_analyze = 0;
-
- if (msg->m_autovacuum)
+ /* Set return status if requested */
+ if (status)
  {
- tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
- tabentry->autovac_analyze_count++;
- }
- else
- {
- tabentry->analyze_timestamp = msg->m_analyzetime;
- tabentry->analyze_count++;
- }
-}
-
-
-/* ----------
- * pgstat_recv_archiver() -
- *
- * Process a ARCHIVER message.
- * ----------
- */
-static void
-pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
-{
- if (msg->m_failed)
- {
- /* Failed archival attempt */
- ++archiverStats.failed_count;
- memcpy(archiverStats.last_failed_wal, msg->m_xlog,
-   sizeof(archiverStats.last_failed_wal));
- archiverStats.last_failed_timestamp = msg->m_timestamp;
- }
- else
- {
- /* Successful archival operation */
- ++archiverStats.archived_count;
- memcpy(archiverStats.last_archived_wal, msg->m_xlog,
-   sizeof(archiverStats.last_archived_wal));
- archiverStats.last_archived_timestamp = msg->m_timestamp;
- }
-}
-
-/* ----------
- * pgstat_recv_bgwriter() -
- *
- * Process a BGWRITER message.
- * ----------
- */
-static void
-pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
-{
- globalStats.timed_checkpoints += msg->m_timed_checkpoints;
- globalStats.requested_checkpoints += msg->m_requested_checkpoints;
- globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
- globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
- globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
- globalStats.buf_written_clean += msg->m_buf_written_clean;
- globalStats.maxwritten_clean += msg->m_maxwritten_clean;
- globalStats.buf_written_backend += msg->m_buf_written_backend;
- globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
- globalStats.buf_alloc += msg->m_buf_alloc;
-}
-
-/* ----------
- * pgstat_recv_recoveryconflict() -
- *
- * Process a RECOVERYCONFLICT message.
- * ----------
- */
-static void
-pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- switch (msg->m_reason)
- {
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
-
- /*
- * Since we drop the information about the database as soon as it
- * replicates, there is no point in counting these conflicts.
- */
- break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
- dbentry->n_conflict_tablespace++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
- dbentry->n_conflict_lock++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
- dbentry->n_conflict_snapshot++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
- dbentry->n_conflict_bufferpin++;
- break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
- dbentry->n_conflict_startup_deadlock++;
- break;
- }
-}
-
-/* ----------
- * pgstat_recv_deadlock() -
- *
- * Process a DEADLOCK message.
- * ----------
- */
-static void
-pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_deadlocks++;
-}
-
-/* ----------
- * pgstat_recv_checksum_failure() -
- *
- * Process a CHECKSUMFAILURE message.
- * ----------
- */
-static void
-pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_checksum_failures += msg->m_failurecount;
- dbentry->last_checksum_failure = msg->m_failure_time;
-}
-
-/* ----------
- * pgstat_recv_tempfile() -
- *
- * Process a TEMPFILE message.
- * ----------
- */
-static void
-pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
-{
- PgStat_StatDBEntry *dbentry;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- dbentry->n_temp_bytes += msg->m_filesize;
- dbentry->n_temp_files += 1;
-}
-
-/* ----------
- * pgstat_recv_funcstat() -
- *
- * Count what the backend has done.
- * ----------
- */
-static void
-pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
-{
- PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
- PgStat_StatDBEntry *dbentry;
- PgStat_StatFuncEntry *funcentry;
- int i;
- bool found;
-
- dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++, funcmsg++)
- {
- funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
- (void *) &(funcmsg->f_id),
- HASH_ENTER, &found);
-
- if (!found)
+ if (!lock_acquired)
  {
- /*
- * If it's a new function entry, initialize counters to the values
- * we just got.
- */
- funcentry->f_numcalls = funcmsg->f_numcalls;
- funcentry->f_total_time = funcmsg->f_total_time;
- funcentry->f_self_time = funcmsg->f_self_time;
+ Assert(nowait);
+ *status = LOCK_FAILED;
  }
+ else if (!found)
+ *status = NOT_FOUND;
  else
- {
- /*
- * Otherwise add the values to the existing entry.
- */
- funcentry->f_numcalls += funcmsg->f_numcalls;
- funcentry->f_total_time += funcmsg->f_total_time;
- funcentry->f_self_time += funcmsg->f_self_time;
- }
+ *status = FOUND;
  }
+
+ return result;
 }
 
-/* ----------
- * pgstat_recv_funcpurge() -
- *
- * Arrange for dead function removal.
- * ----------
+/*
+ * Lookup the hash table entry for the specified table. If no hash
+ * table entry exists, create and initialize it if the create parameter
+ * is true; otherwise return NULL.
  */
-static void
-pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
+static PgStat_StatTabEntry *
+pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create)
 {
- PgStat_StatDBEntry *dbentry;
- int i;
+ PgStat_StatTabEntry *result;
+ bool found;
 
- dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+ /* Lookup or create the hash table entry for this table */
+ if (create)
+ result = (PgStat_StatTabEntry *)
+ dshash_find_or_insert(table, &tableoid, &found);
+ else
+ result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false);
 
- /*
- * No need to purge if we don't even know the database.
- */
- if (!dbentry || !dbentry->functions)
- return;
+ if (!create && result == NULL)
+ return NULL;
 
- /*
- * Process all function entries in the message.
- */
- for (i = 0; i < msg->m_nentries; i++)
+ /* If we just created the entry, initialize its counters. */
+ if (create && !found)
  {
- /* Remove from hashtable if present; we don't care if it's not. */
- (void) hash_search(dbentry->functions,
-   (void *) &(msg->m_functionid[i]),
-   HASH_REMOVE, NULL);
+ result->numscans = 0;
+ result->tuples_returned = 0;
+ result->tuples_fetched = 0;
+ result->tuples_inserted = 0;
+ result->tuples_updated = 0;
+ result->tuples_deleted = 0;
+ result->tuples_hot_updated = 0;
+ result->n_live_tuples = 0;
+ result->n_dead_tuples = 0;
+ result->changes_since_analyze = 0;
+ result->blocks_fetched = 0;
+ result->blocks_hit = 0;
+ result->vacuum_timestamp = 0;
+ result->vacuum_count = 0;
+ result->autovac_vacuum_timestamp = 0;
+ result->autovac_vacuum_count = 0;
+ result->analyze_timestamp = 0;
+ result->analyze_count = 0;
+ result->autovac_analyze_timestamp = 0;
+ result->autovac_analyze_count = 0;
  }
-}
 
-/* ----------
- * pgstat_write_statsfile_needed() -
- *
- * Do we need to write out any stats files?
- * ----------
- */
-static bool
-pgstat_write_statsfile_needed(void)
-{
- if (pending_write_requests != NIL)
- return true;
-
- /* Everything was written recently */
- return false;
-}
-
-/* ----------
- * pgstat_db_requested() -
- *
- * Checks whether stats for a particular DB need to be written to a file.
- * ----------
- */
-static bool
-pgstat_db_requested(Oid databaseid)
-{
- /*
- * If any requests are outstanding at all, we should write the stats for
- * shared catalogs (the "database" with OID 0).  This ensures that
- * backends will see up-to-date stats for shared catalogs, even though
- * they send inquiry messages mentioning only their own DB.
- */
- if (databaseid == InvalidOid && pending_write_requests != NIL)
- return true;
-
- /* Search to see if there's an open request to write this database. */
- if (list_member_oid(pending_write_requests, databaseid))
- return true;
-
- return false;
+ return result;
 }
 
 /*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 84fda38249..4c0ea0cc23 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -255,7 +255,6 @@ static pid_t StartupPID = 0,
  WalReceiverPID = 0,
  AutoVacPID = 0,
  PgArchPID = 0,
- PgStatPID = 0,
  SysLoggerPID = 0;
 
 /* Startup process's status */
@@ -503,7 +502,6 @@ typedef struct
  PGPROC   *AuxiliaryProcs;
  PGPROC   *PreparedXactProcs;
  PMSignalData *PMSignalState;
- InheritableSocket pgStatSock;
  pid_t PostmasterPid;
  TimestampTz PgStartTime;
  TimestampTz PgReloadTime;
@@ -1326,12 +1324,6 @@ PostmasterMain(int argc, char *argv[])
  */
  RemovePgTempFiles();
 
- /*
- * Initialize stats collection subsystem (this does NOT start the
- * collector process!)
- */
- pgstat_init();
-
  /*
  * Initialize the autovacuum subsystem (again, no process start yet)
  */
@@ -1780,11 +1772,6 @@ ServerLoop(void)
  start_autovac_launcher = false; /* signal processed */
  }
 
- /* If we have lost the stats collector, try to start a new one */
- if (PgStatPID == 0 &&
- (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
- PgStatPID = pgstat_start();
-
  /* If we have lost the archiver, try to start a new one. */
  if (PgArchPID == 0 && PgArchStartupAllowed())
  PgArchPID = StartArchiver();
@@ -2681,8 +2668,6 @@ SIGHUP_handler(SIGNAL_ARGS)
  signal_child(PgArchPID, SIGHUP);
  if (SysLoggerPID != 0)
  signal_child(SysLoggerPID, SIGHUP);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGHUP);
 
  /* Reload authentication config files too */
  if (!load_hba())
@@ -3045,8 +3030,6 @@ reaper(SIGNAL_ARGS)
  AutoVacPID = StartAutoVacLauncher();
  if (PgArchStartupAllowed() && PgArchPID == 0)
  PgArchPID = StartArchiver();
- if (PgStatPID == 0)
- PgStatPID = pgstat_start();
 
  /* workers may be scheduled to start now */
  maybe_start_bgworkers();
@@ -3113,13 +3096,6 @@ reaper(SIGNAL_ARGS)
  SignalChildren(SIGUSR2);
 
  pmState = PM_SHUTDOWN_2;
-
- /*
- * We can also shut down the stats collector now; there's
- * nothing left for it to do.
- */
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  else
  {
@@ -3194,22 +3170,6 @@ reaper(SIGNAL_ARGS)
  continue;
  }
 
- /*
- * Was it the statistics collector?  If so, just try to start a new
- * one; no need to force reset of the rest of the system.  (If fail,
- * we'll try again in future cycles of the main loop.)
- */
- if (pid == PgStatPID)
- {
- PgStatPID = 0;
- if (!EXIT_STATUS_0(exitstatus))
- LogChildExit(LOG, _("statistics collector process"),
- pid, exitstatus);
- if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
- PgStatPID = pgstat_start();
- continue;
- }
-
  /* Was it the system logger?  If so, try to start a new one */
  if (pid == SysLoggerPID)
  {
@@ -3670,22 +3630,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  signal_child(PgArchPID, SIGQUIT);
  }
 
- /*
- * Force a power-cycle of the pgstat process too.  (This isn't absolutely
- * necessary, but it seems like a good idea for robustness, and it
- * simplifies the state-machine logic in the case where a shutdown request
- * arrives during crash processing.)
- */
- if (PgStatPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- "SIGQUIT",
- (int) PgStatPID)));
- signal_child(PgStatPID, SIGQUIT);
- allow_immediate_pgstat_restart();
- }
-
  /* We do NOT restart the syslogger */
 
  if (Shutdown != ImmediateShutdown)
@@ -3881,8 +3825,6 @@ PostmasterStateMachine(void)
  SignalChildren(SIGQUIT);
  if (PgArchPID != 0)
  signal_child(PgArchPID, SIGQUIT);
- if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
  }
  }
  }
@@ -3917,8 +3859,7 @@ PostmasterStateMachine(void)
  * normal state transition leading up to PM_WAIT_DEAD_END, or during
  * FatalError processing.
  */
- if (dlist_is_empty(&BackendList) &&
- PgArchPID == 0 && PgStatPID == 0)
+ if (dlist_is_empty(&BackendList) && PgArchPID == 0)
  {
  /* These other guys should be dead already */
  Assert(StartupPID == 0);
@@ -4119,8 +4060,6 @@ TerminateChildren(int signal)
  signal_child(AutoVacPID, signal);
  if (PgArchPID != 0)
  signal_child(PgArchPID, signal);
- if (PgStatPID != 0)
- signal_child(PgStatPID, signal);
 }
 
 /*
@@ -5093,18 +5032,6 @@ SubPostmasterMain(int argc, char *argv[])
 
  StartBackgroundWorker();
  }
- if (strcmp(argv[1], "--forkarch") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgArchiverMain(argc, argv); /* does not return */
- }
- if (strcmp(argv[1], "--forkcol") == 0)
- {
- /* Do not want to attach to shared memory */
-
- PgstatCollectorMain(argc, argv); /* does not return */
- }
  if (strcmp(argv[1], "--forklog") == 0)
  {
  /* Do not want to attach to shared memory */
@@ -5223,12 +5150,6 @@ sigusr1_handler(SIGNAL_ARGS)
  if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
  pmState == PM_RECOVERY && Shutdown == NoShutdown)
  {
- /*
- * Likewise, start other special children as needed.
- */
- Assert(PgStatPID == 0);
- PgStatPID = pgstat_start();
-
  ereport(LOG,
  (errmsg("database system is ready to accept read only connections")));
 
@@ -6133,7 +6054,6 @@ extern slock_t *ShmemLock;
 extern slock_t *ProcStructLock;
 extern PGPROC *AuxiliaryProcs;
 extern PMSignalData *PMSignalState;
-extern pgsocket pgStatSock;
 extern pg_time_t first_syslogger_file_time;
 
 #ifndef WIN32
@@ -6189,8 +6109,6 @@ save_backend_variables(BackendParameters *param, Port *port,
  param->AuxiliaryProcs = AuxiliaryProcs;
  param->PreparedXactProcs = PreparedXactProcs;
  param->PMSignalState = PMSignalState;
- if (!write_inheritable_socket(&param->pgStatSock, pgStatSock, childPid))
- return false;
 
  param->PostmasterPid = PostmasterPid;
  param->PgStartTime = PgStartTime;
@@ -6425,7 +6343,6 @@ restore_backend_variables(BackendParameters *param, Port *port)
  AuxiliaryProcs = param->AuxiliaryProcs;
  PreparedXactProcs = param->PreparedXactProcs;
  PMSignalState = param->PMSignalState;
- read_inheritable_socket(&pgStatSock, &param->pgStatSock);
 
  PostmasterPid = param->PostmasterPid;
  PgStartTime = param->PgStartTime;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 4829953ee6..5093a4a11d 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -147,6 +147,7 @@ CreateSharedMemoryAndSemaphores(void)
  size = add_size(size, BTreeShmemSize());
  size = add_size(size, SyncScanShmemSize());
  size = add_size(size, AsyncShmemSize());
+ size = add_size(size, StatsShmemSize());
 #ifdef EXEC_BACKEND
  size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -263,6 +264,7 @@ CreateSharedMemoryAndSemaphores(void)
  BTreeShmemInit();
  SyncScanShmemInit();
  AsyncShmemInit();
+ StatsShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 18e3843e8b..caa00011a9 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -522,6 +522,7 @@ RegisterLWLockTranches(void)
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
  LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join");
  LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact");
+ LWLockRegisterTranche(LWTRANCHE_STATS, "activity stats");
 
  /* Register named tranches. */
  for (i = 0; i < NamedLWLockTrancheRequests; i++)
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3b85e48333..c518424471 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3181,6 +3181,12 @@ ProcessInterrupts(void)
 
  if (ParallelMessagePending)
  HandleParallelMessages();
+
+ if (IdleStatsUpdateTimeoutPending)
+ {
+ IdleStatsUpdateTimeoutPending = false;
+ pgstat_report_stat(true);
+ }
 }
 
 
@@ -3755,6 +3761,7 @@ PostgresMain(int argc, char *argv[],
  sigjmp_buf local_sigjmp_buf;
  volatile bool send_ready_for_query = true;
  bool disable_idle_in_transaction_timeout = false;
+ bool disable_idle_stats_update_timeout = false;
 
  /* Initialize startup process environment if necessary. */
  if (!IsUnderPostmaster)
@@ -4194,6 +4201,8 @@ PostgresMain(int argc, char *argv[],
  }
  else
  {
+ long stats_timeout;
+
  /* Send out notify signals and transmit self-notifies */
  ProcessCompletedNotifies();
 
@@ -4206,8 +4215,13 @@ PostgresMain(int argc, char *argv[],
  if (notifyInterruptPending)
  ProcessNotifyInterrupt();
 
- pgstat_report_stat(false);
-
+ stats_timeout = pgstat_report_stat(false);
+ if (stats_timeout > 0)
+ {
+ disable_idle_stats_update_timeout = true;
+ enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT,
+ stats_timeout);
+ }
  set_ps_display("idle", false);
  pgstat_report_activity(STATE_IDLE, NULL);
  }
@@ -4242,7 +4256,7 @@ PostgresMain(int argc, char *argv[],
  DoingCommandRead = false;
 
  /*
- * (5) turn off the idle-in-transaction timeout
+ * (5) turn off the idle-in-transaction timeout and stats update timeout
  */
  if (disable_idle_in_transaction_timeout)
  {
@@ -4250,6 +4264,12 @@ PostgresMain(int argc, char *argv[],
  disable_idle_in_transaction_timeout = false;
  }
 
+ if (disable_idle_stats_update_timeout)
+ {
+ disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false);
+ disable_idle_stats_update_timeout = false;
+ }
+
  /*
  * (6) check for any other interesting events that happened while we
  * slept.
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 3bf96de256..9c694f20c9 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -32,6 +32,7 @@ volatile sig_atomic_t QueryCancelPending = false;
 volatile sig_atomic_t ProcDiePending = false;
 volatile sig_atomic_t ClientConnectionLost = false;
 volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
+volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false;
 volatile sig_atomic_t ConfigReloadPending = false;
 volatile uint32 InterruptHoldoffCount = 0;
 volatile uint32 QueryCancelHoldoffCount = 0;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index cc38669a1e..a60bc58b0c 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -73,6 +73,7 @@ static void ShutdownPostgres(int code, Datum arg);
 static void StatementTimeoutHandler(void);
 static void LockTimeoutHandler(void);
 static void IdleInTransactionSessionTimeoutHandler(void);
+static void IdleStatsUpdateTimeoutHandler(void);
 static bool ThereIsAtLeastOneRole(void);
 static void process_startup_options(Port *port, bool am_superuser);
 static void process_settings(Oid databaseid, Oid roleid);
@@ -630,6 +631,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
  RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
  RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
  IdleInTransactionSessionTimeoutHandler);
+ RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT,
+ IdleStatsUpdateTimeoutHandler);
  }
 
  /*
@@ -1240,6 +1243,14 @@ IdleInTransactionSessionTimeoutHandler(void)
  SetLatch(MyLatch);
 }
 
+static void
+IdleStatsUpdateTimeoutHandler(void)
+{
+ IdleStatsUpdateTimeoutPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
 /*
  * Returns true if at least one role is defined in this database cluster.
  */
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index b7d36b65dd..13be46c172 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 106;
+use Test::More tests => 105;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -123,7 +123,7 @@ is_deeply(
 
 # Contents of these directories should not be copied.
 foreach my $dirname (
- qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans)
+ qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans)
   )
 {
  is_deeply(
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1f4db67f3f..43250c3885 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -82,6 +82,7 @@ extern PGDLLIMPORT volatile sig_atomic_t InterruptPending;
 extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending;
 extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
 extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
+extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending;
 extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
 
 extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 65713abc2b..c9fbcead3f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1,7 +1,7 @@
 /* ----------
  * pgstat.h
  *
- * Definitions for the PostgreSQL statistics collector daemon.
+ * Definitions for the PostgreSQL statistics collector facility.
  *
  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  *
@@ -13,10 +13,11 @@
 
 #include "datatype/timestamp.h"
 #include "libpq/pqcomm.h"
-#include "port/atomics.h"
+#include "lib/dshash.h"
 #include "portability/instr_time.h"
 #include "postmaster/pgarch.h"
 #include "storage/proc.h"
+#include "storage/lwlock.h"
 #include "utils/hsearch.h"
 #include "utils/relcache.h"
 
@@ -40,33 +41,6 @@ typedef enum TrackFunctionsLevel
  TRACK_FUNC_ALL
 } TrackFunctionsLevel;
 
-/* ----------
- * The types of backend -> collector messages
- * ----------
- */
-typedef enum StatMsgType
-{
- PGSTAT_MTYPE_DUMMY,
- PGSTAT_MTYPE_INQUIRY,
- PGSTAT_MTYPE_TABSTAT,
- PGSTAT_MTYPE_TABPURGE,
- PGSTAT_MTYPE_DROPDB,
- PGSTAT_MTYPE_RESETCOUNTER,
- PGSTAT_MTYPE_RESETSHAREDCOUNTER,
- PGSTAT_MTYPE_RESETSINGLECOUNTER,
- PGSTAT_MTYPE_AUTOVAC_START,
- PGSTAT_MTYPE_VACUUM,
- PGSTAT_MTYPE_ANALYZE,
- PGSTAT_MTYPE_ARCHIVER,
- PGSTAT_MTYPE_BGWRITER,
- PGSTAT_MTYPE_FUNCSTAT,
- PGSTAT_MTYPE_FUNCPURGE,
- PGSTAT_MTYPE_RECOVERYCONFLICT,
- PGSTAT_MTYPE_TEMPFILE,
- PGSTAT_MTYPE_DEADLOCK,
- PGSTAT_MTYPE_CHECKSUMFAILURE
-} StatMsgType;
-
 /* ----------
  * The data type used for counters.
  * ----------
@@ -77,9 +51,8 @@ typedef int64 PgStat_Counter;
  * PgStat_TableCounts The actual per-table counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
- * It is a component of PgStat_TableStatus (within-backend state) and
- * PgStat_TableEntry (the transmitted message format).
+ * it against zeroes to detect whether there are any counts to write.
+ * It is a component of PgStat_TableStatus (within-backend state).
  *
  * Note: for a table, tuples_returned is the number of tuples successfully
  * fetched by heap_getnext, while tuples_fetched is the number of tuples
@@ -115,13 +88,6 @@ typedef struct PgStat_TableCounts
  PgStat_Counter t_blocks_hit;
 } PgStat_TableCounts;
 
-/* Possible targets for resetting cluster-wide shared values */
-typedef enum PgStat_Shared_Reset_Target
-{
- RESET_ARCHIVER,
- RESET_BGWRITER
-} PgStat_Shared_Reset_Target;
-
 /* Possible object types for resetting single counters */
 typedef enum PgStat_Single_Reset_Type
 {
@@ -180,236 +146,12 @@ typedef struct PgStat_TableXactStatus
 } PgStat_TableXactStatus;
 
 
-/* ------------------------------------------------------------
- * Message formats follow
- * ------------------------------------------------------------
- */
-
-
-/* ----------
- * PgStat_MsgHdr The common message header
- * ----------
- */
-typedef struct PgStat_MsgHdr
-{
- StatMsgType m_type;
- int m_size;
-} PgStat_MsgHdr;
-
-/* ----------
- * Space available in a message.  This will keep the UDP packets below 1K,
- * which should fit unfragmented into the MTU of the loopback interface.
- * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most
- * platforms, but we're being conservative here.)
- * ----------
- */
-#define PGSTAT_MAX_MSG_SIZE 1000
-#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr))
-
-
-/* ----------
- * PgStat_MsgDummy A dummy message, ignored by the collector
- * ----------
- */
-typedef struct PgStat_MsgDummy
-{
- PgStat_MsgHdr m_hdr;
-} PgStat_MsgDummy;
-
-
-/* ----------
- * PgStat_MsgInquiry Sent by a backend to ask the collector
- * to write the stats file(s).
- *
- * Ordinarily, an inquiry message prompts writing of the global stats file,
- * the stats file for shared catalogs, and the stats file for the specified
- * database.  If databaseid is InvalidOid, only the first two are written.
- *
- * New file(s) will be written only if the existing file has a timestamp
- * older than the specified cutoff_time; this prevents duplicated effort
- * when multiple requests arrive at nearly the same time, assuming that
- * backends send requests with cutoff_times a little bit in the past.
- *
- * clock_time should be the requestor's current local time; the collector
- * uses this to check for the system clock going backward, but it has no
- * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
- * ----------
- */
-
-typedef struct PgStat_MsgInquiry
-{
- PgStat_MsgHdr m_hdr;
- TimestampTz clock_time; /* observed local clock time */
- TimestampTz cutoff_time; /* minimum acceptable file timestamp */
- Oid databaseid; /* requested DB (InvalidOid => shared only) */
-} PgStat_MsgInquiry;
-
-
-/* ----------
- * PgStat_TableEntry Per-table info in a MsgTabstat
- * ----------
- */
-typedef struct PgStat_TableEntry
-{
- Oid t_id;
- PgStat_TableCounts t_counts;
-} PgStat_TableEntry;
-
-/* ----------
- * PgStat_MsgTabstat Sent by the backend to report table
- * and buffer access statistics.
- * ----------
- */
-#define PGSTAT_NUM_TABENTRIES  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \
- / sizeof(PgStat_TableEntry))
-
-typedef struct PgStat_MsgTabstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- int m_xact_commit;
- int m_xact_rollback;
- PgStat_Counter m_block_read_time; /* times in microseconds */
- PgStat_Counter m_block_write_time;
- PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES];
-} PgStat_MsgTabstat;
-
-
-/* ----------
- * PgStat_MsgTabpurge Sent by the backend to tell the collector
- * about dead tables.
- * ----------
- */
-#define PGSTAT_NUM_TABPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgTabpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_tableid[PGSTAT_NUM_TABPURGE];
-} PgStat_MsgTabpurge;
-
-
-/* ----------
- * PgStat_MsgDropdb Sent by the backend to tell the collector
- * about a dropped database
- * ----------
- */
-typedef struct PgStat_MsgDropdb
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDropdb;
-
-
-/* ----------
- * PgStat_MsgResetcounter Sent by the backend to tell the collector
- * to reset counters
- * ----------
- */
-typedef struct PgStat_MsgResetcounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgResetcounter;
-
-/* ----------
- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
- * to reset a shared counter
- * ----------
- */
-typedef struct PgStat_MsgResetsharedcounter
-{
- PgStat_MsgHdr m_hdr;
- PgStat_Shared_Reset_Target m_resettarget;
-} PgStat_MsgResetsharedcounter;
-
-/* ----------
- * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector
- * to reset a single counter
- * ----------
- */
-typedef struct PgStat_MsgResetsinglecounter
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- PgStat_Single_Reset_Type m_resettype;
- Oid m_objectid;
-} PgStat_MsgResetsinglecounter;
-
-/* ----------
- * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal
- * that a database is going to be processed
- * ----------
- */
-typedef struct PgStat_MsgAutovacStart
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- TimestampTz m_start_time;
-} PgStat_MsgAutovacStart;
-
-
 /* ----------
- * PgStat_MsgVacuum Sent by the backend or autovacuum daemon
- * after VACUUM
- * ----------
- */
-typedef struct PgStat_MsgVacuum
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- TimestampTz m_vacuumtime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgVacuum;
-
-
-/* ----------
- * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon
- * after ANALYZE
- * ----------
- */
-typedef struct PgStat_MsgAnalyze
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- Oid m_tableoid;
- bool m_autovacuum;
- bool m_resetcounter;
- TimestampTz m_analyzetime;
- PgStat_Counter m_live_tuples;
- PgStat_Counter m_dead_tuples;
-} PgStat_MsgAnalyze;
-
-
-/* ----------
- * PgStat_MsgArchiver Sent by the archiver to update statistics.
- * ----------
- */
-typedef struct PgStat_MsgArchiver
-{
- PgStat_MsgHdr m_hdr;
- bool m_failed; /* Failed attempt */
- char m_xlog[MAX_XFN_CHARS + 1];
- TimestampTz m_timestamp;
-} PgStat_MsgArchiver;
-
-/* ----------
- * PgStat_MsgBgWriter Sent by the bgwriter to update statistics.
+ * PgStat_MsgBgWriter bgwriter statistics
  * ----------
  */
 typedef struct PgStat_MsgBgWriter
 {
- PgStat_MsgHdr m_hdr;
-
  PgStat_Counter m_timed_checkpoints;
  PgStat_Counter m_requested_checkpoints;
  PgStat_Counter m_buf_written_checkpoints;
@@ -422,38 +164,14 @@ typedef struct PgStat_MsgBgWriter
  PgStat_Counter m_checkpoint_sync_time;
 } PgStat_MsgBgWriter;
 
-/* ----------
- * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict
- * ----------
- */
-typedef struct PgStat_MsgRecoveryConflict
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- int m_reason;
-} PgStat_MsgRecoveryConflict;
-
-/* ----------
- * PgStat_MsgTempFile Sent by the backend upon creating a temp file
- * ----------
- */
-typedef struct PgStat_MsgTempFile
-{
- PgStat_MsgHdr m_hdr;
-
- Oid m_databaseid;
- size_t m_filesize;
-} PgStat_MsgTempFile;
-
 /* ----------
  * PgStat_FunctionCounts The actual per-function counts kept by a backend
  *
  * This struct should contain only actual event counters, because we memcmp
- * it against zeroes to detect whether there are any counts to transmit.
+ * it against zeroes to detect whether there are any counts to write.
  *
  * Note that the time counters are in instr_time format here.  We convert to
- * microseconds in PgStat_Counter format when transmitting to the collector.
+ * microseconds in PgStat_Counter format when writing to shared statistics.
  * ----------
  */
 typedef struct PgStat_FunctionCounts
@@ -485,96 +203,8 @@ typedef struct PgStat_FunctionEntry
  PgStat_Counter f_self_time;
 } PgStat_FunctionEntry;
 
-/* ----------
- * PgStat_MsgFuncstat Sent by the backend to report function
- * usage statistics.
- * ----------
- */
-#define PGSTAT_NUM_FUNCENTRIES \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(PgStat_FunctionEntry))
-
-typedef struct PgStat_MsgFuncstat
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES];
-} PgStat_MsgFuncstat;
-
-/* ----------
- * PgStat_MsgFuncpurge Sent by the backend to tell the collector
- * about dead functions.
- * ----------
- */
-#define PGSTAT_NUM_FUNCPURGE  \
- ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int))  \
- / sizeof(Oid))
-
-typedef struct PgStat_MsgFuncpurge
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_nentries;
- Oid m_functionid[PGSTAT_NUM_FUNCPURGE];
-} PgStat_MsgFuncpurge;
-
-/* ----------
- * PgStat_MsgDeadlock Sent by the backend to tell the collector
- * about a deadlock that occurred.
- * ----------
- */
-typedef struct PgStat_MsgDeadlock
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
-} PgStat_MsgDeadlock;
-
-/* ----------
- * PgStat_MsgChecksumFailure Sent by the backend to tell the collector
- * about checksum failures noticed.
- * ----------
- */
-typedef struct PgStat_MsgChecksumFailure
-{
- PgStat_MsgHdr m_hdr;
- Oid m_databaseid;
- int m_failurecount;
- TimestampTz m_failure_time;
-} PgStat_MsgChecksumFailure;
-
-
-/* ----------
- * PgStat_Msg Union over all possible messages.
- * ----------
- */
-typedef union PgStat_Msg
-{
- PgStat_MsgHdr msg_hdr;
- PgStat_MsgDummy msg_dummy;
- PgStat_MsgInquiry msg_inquiry;
- PgStat_MsgTabstat msg_tabstat;
- PgStat_MsgTabpurge msg_tabpurge;
- PgStat_MsgDropdb msg_dropdb;
- PgStat_MsgResetcounter msg_resetcounter;
- PgStat_MsgResetsharedcounter msg_resetsharedcounter;
- PgStat_MsgResetsinglecounter msg_resetsinglecounter;
- PgStat_MsgAutovacStart msg_autovacuum_start;
- PgStat_MsgVacuum msg_vacuum;
- PgStat_MsgAnalyze msg_analyze;
- PgStat_MsgArchiver msg_archiver;
- PgStat_MsgBgWriter msg_bgwriter;
- PgStat_MsgFuncstat msg_funcstat;
- PgStat_MsgFuncpurge msg_funcpurge;
- PgStat_MsgRecoveryConflict msg_recoveryconflict;
- PgStat_MsgDeadlock msg_deadlock;
- PgStat_MsgTempFile msg_tempfile;
- PgStat_MsgChecksumFailure msg_checksumfailure;
-} PgStat_Msg;
-
-
 /* ------------------------------------------------------------
- * Statistic collector data structures follow
+ * Statistics collector data structures in files and shared memory follow
  *
  * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these
  * data structures change.
@@ -614,16 +244,29 @@ typedef struct PgStat_StatDBEntry
  PgStat_Counter n_block_write_time;
 
  TimestampTz stat_reset_timestamp;
- TimestampTz stats_timestamp; /* time of db stats file update */
+ TimestampTz stats_timestamp; /* time of db stats update */
 
  /*
- * tables and functions must be last in the struct, because we don't write
- * the pointers out to the stats file.
+ * The followings must be last in the struct, because we don't write them
+ * out to the stats file.
  */
- HTAB   *tables;
- HTAB   *functions;
+ int generation; /* current generation of the below */
+ int refcnt; /* current gen reference count */
+ dshash_table_handle tables; /* current gen tables hash */
+ dshash_table_handle functions; /* current gen functions hash */
+ int prev_refcnt; /* prev gen reference count */
+ dshash_table_handle prev_tables; /* prev gen tables hash */
+ dshash_table_handle prev_functions; /* prev gen functions hash */
+ LWLock lock; /* Lock for the above members */
+
+ /* non-shared members */
+ HTAB *snapshot_tables; /* table entry snapshot */
+ HTAB *snapshot_functions; /* function entry snapshot */
+ dshash_table *dshash_tables; /* attached tables dshash */
+ dshash_table *dshash_functions; /* attached functions dshash */
 } PgStat_StatDBEntry;
 
+#define SHARED_DBENT_SIZE offsetof(PgStat_StatDBEntry, snapshot_tables)
 
 /* ----------
  * PgStat_StatTabEntry The collector's data per table (or index)
@@ -662,7 +305,7 @@ typedef struct PgStat_StatTabEntry
 
 
 /* ----------
- * PgStat_StatFuncEntry The collector's data per function
+ * PgStat_StatFuncEntry per function stats data
  * ----------
  */
 typedef struct PgStat_StatFuncEntry
@@ -677,7 +320,7 @@ typedef struct PgStat_StatFuncEntry
 
 
 /*
- * Archiver statistics kept in the stats collector
+ * Archiver statistics kept in the shared stats
  */
 typedef struct PgStat_ArchiverStats
 {
@@ -693,7 +336,7 @@ typedef struct PgStat_ArchiverStats
 } PgStat_ArchiverStats;
 
 /*
- * Global statistics kept in the stats collector
+ * Global statistics kept in the shared stats
  */
 typedef struct PgStat_GlobalStats
 {
@@ -779,7 +422,6 @@ typedef enum
  WAIT_EVENT_CHECKPOINTER_MAIN,
  WAIT_EVENT_LOGICAL_APPLY_MAIN,
  WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
- WAIT_EVENT_PGSTAT_MAIN,
  WAIT_EVENT_RECOVERY_WAL_ALL,
  WAIT_EVENT_RECOVERY_WAL_STREAM,
  WAIT_EVENT_SYSLOGGER_MAIN,
@@ -1214,6 +856,8 @@ extern bool pgstat_track_counts;
 extern int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
+
+/* No longer used, but will be removed with GUC */
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;
 
@@ -1235,29 +879,26 @@ extern PgStat_Counter pgStatBlockWriteTime;
 extern Size BackendStatusShmemSize(void);
 extern void CreateSharedBackendStatus(void);
 
-extern void pgstat_init(void);
-extern int pgstat_start(void);
-extern void pgstat_reset_all(void);
-extern void allow_immediate_pgstat_restart(void);
+extern Size StatsShmemSize(void);
+extern void StatsShmemInit(void);
 
-#ifdef EXEC_BACKEND
-extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-#endif
+extern void pgstat_reset_all(void);
 
+/* File input/output functions  */
+extern void pgstat_read_statsfiles(void);
+extern void pgstat_write_statsfiles(void);
 
 /* ----------
  * Functions called from backends
  * ----------
  */
-extern void pgstat_ping(void);
-
-extern void pgstat_report_stat(bool force);
+extern long pgstat_report_stat(bool force);
 extern void pgstat_vacuum_stat(void);
 extern void pgstat_drop_database(Oid databaseid);
 
 extern void pgstat_clear_snapshot(void);
 extern void pgstat_reset_counters(void);
-extern void pgstat_reset_shared_counters(const char *);
+extern void pgstat_reset_shared_counters(const char *target);
 extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type);
 
 extern void pgstat_report_autovac(Oid dboid);
@@ -1429,11 +1070,13 @@ extern void pgstat_send_bgwriter(void);
  */
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
+extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_extended(PgStat_StatDBEntry *dbent, Oid relid);
 extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
 extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid);
 extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern void pgstat_clear_snapshot(void);
 
 #endif /* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f9450dac90..50d0a0c9dd 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -220,6 +220,7 @@ typedef enum BuiltinTrancheIds
  LWTRANCHE_TBM,
  LWTRANCHE_PARALLEL_APPEND,
  LWTRANCHE_SXACT,
+ LWTRANCHE_STATS,
  LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
 
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ae5389ec96..3e02bc6f85 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -31,6 +31,7 @@ typedef enum TimeoutId
  STANDBY_TIMEOUT,
  STANDBY_LOCK_TIMEOUT,
  IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
+ IDLE_STATS_UPDATE_TIMEOUT,
  /* First user-definable timeout reason */
  USER_TIMEOUT,
  /* Maximum number of timeout reasons */
--
2.23.0


From 4e92947ed997fd13693ee73837240961ccd63bfc Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <[hidden email]>
Date: Tue, 27 Nov 2018 14:42:12 +0900
Subject: [PATCH v24 5/5] Remove the GUC stats_temp_directory

The GUC used to specify the directory to store temporary statistics
files. It is no longer needed by the stats collector but still used by
the programs in bin and contrib, and maybe other extensions. Thus this
patch removes the GUC, but some backing variables and macro definitions
are left alone for backward compatibility.
---
 doc/src/sgml/backup.sgml                      |  2 -
 doc/src/sgml/config.sgml                      | 19 ---------
 doc/src/sgml/monitoring.sgml                  |  7 +---
 doc/src/sgml/storage.sgml                     |  3 +-
 src/backend/postmaster/pgstat.c               | 13 +++---
 src/backend/replication/basebackup.c          | 13 ++----
 src/backend/utils/misc/guc.c                  | 41 -------------------
 src/backend/utils/misc/postgresql.conf.sample |  1 -
 src/include/pgstat.h                          |  5 ++-
 src/test/perl/PostgresNode.pm                 |  4 --
 10 files changed, 14 insertions(+), 94 deletions(-)

diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml
index bdc9026c62..2885540362 100644
--- a/doc/src/sgml/backup.sgml
+++ b/doc/src/sgml/backup.sgml
@@ -1146,8 +1146,6 @@ SELECT pg_stop_backup();
     <filename>pg_snapshots/</filename>, <filename>pg_stat_tmp/</filename>,
     and <filename>pg_subtrans/</filename> (but not the directories themselves) can be
     omitted from the backup as they will be initialized on postmaster startup.
-    If <xref linkend="guc-stats-temp-directory"/> is set and is under the data
-    directory then the contents of that directory can also be omitted.
    </para>
 
    <para>
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4ec13f3311..389269999d 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7004,25 +7004,6 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-stats-temp-directory" xreflabel="stats_temp_directory">
-      <term><varname>stats_temp_directory</varname> (<type>string</type>)
-      <indexterm>
-       <primary><varname>stats_temp_directory</varname> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        Sets the directory to store temporary statistics data in. This can be
-        a path relative to the data directory or an absolute path. The default
-        is <filename>pg_stat_tmp</filename>. Pointing this at a RAM-based
-        file system will decrease physical I/O requirements and can lead to
-        improved performance.
-        This parameter can only be set in the <filename>postgresql.conf</filename>
-        file or on the server command line.
-       </para>
-      </listitem>
-     </varlistentry>
-
      </variablelist>
     </sect2>
 
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index eb94dec119..73cba4e21f 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -195,12 +195,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
 
   <para>
    The statistics collector transmits the collected information to other
-   <productname>PostgreSQL</productname> processes through temporary files.
-   These files are stored in the directory named by the
-   <xref linkend="guc-stats-temp-directory"/> parameter,
-   <filename>pg_stat_tmp</filename> by default.
-   For better performance, <varname>stats_temp_directory</varname> can be
-   pointed at a RAM-based file system, decreasing physical I/O requirements.
+   <productname>PostgreSQL</productname> processes through shared memory.
    When the server shuts down cleanly, a permanent copy of the statistics
    data is stored in the <filename>pg_stat</filename> subdirectory, so that
    statistics can be retained across server restarts.  When recovery is
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 1c19e863d2..2f04bb68bb 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -122,8 +122,7 @@ Item
 
 <row>
  <entry><filename>pg_stat_tmp</filename></entry>
- <entry>Subdirectory containing temporary files for the statistics
-  subsystem</entry>
+ <entry>Subdirectory containing ephemeral files for extensions</entry>
 </row>
 
 <row>
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index bcf8c6f371..7fe5c5019a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -107,15 +107,12 @@ bool pgstat_track_counts = false;
 int pgstat_track_functions = TRACK_FUNC_OFF;
 int pgstat_track_activity_query_size = 1024;
 
-/* ----------
- * Built from GUC parameter
- * ----------
+/*
+ * This used to be a GUC variable and is no longer used in this file, but left
+ * alone just for backward comptibility for extensions, having the default
+ * value.
  */
-char   *pgstat_stat_directory = NULL;
-
-/* No longer used, but will be removed with GUC */
-char   *pgstat_stat_filename = NULL;
-char   *pgstat_stat_tmpname = NULL;
+char   *pgstat_stat_directory = PG_STAT_TMP_DIR;
 
 #define StatsLock (&StatsShmem->StatsMainLock)
 
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 1fa4551eff..84f7acbc4f 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -241,11 +241,8 @@ perform_base_backup(basebackup_options *opt)
  TimeLineID endtli;
  StringInfo labelfile;
  StringInfo tblspc_map_file = NULL;
- int datadirpathlen;
  List   *tablespaces = NIL;
 
- datadirpathlen = strlen(DataDir);
-
  backup_started_in_recovery = RecoveryInProgress();
 
  labelfile = makeStringInfo();
@@ -276,13 +273,9 @@ perform_base_backup(basebackup_options *opt)
  * Calculate the relative path of temporary statistics directory in
  * order to skip the files which are located in that directory later.
  */
- if (is_absolute_path(pgstat_stat_directory) &&
- strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
- else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
- statrelpath = psprintf("./%s", pgstat_stat_directory);
- else
- statrelpath = pgstat_stat_directory;
+
+ Assert(strchr(PG_STAT_TMP_DIR, '/') == NULL);
+ statrelpath = psprintf("./%s", PG_STAT_TMP_DIR);
 
  /* Add a node for the base directory at the end */
  ti = palloc0(sizeof(tablespaceinfo));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5fccc9683e..809487ab69 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -195,7 +195,6 @@ static bool check_max_wal_senders(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_effective_io_concurrency(int newval, void *extra);
-static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
 static bool check_cluster_name(char **newval, void **extra, GucSource source);
@@ -4114,17 +4113,6 @@ static struct config_string ConfigureNamesString[] =
  NULL, NULL, NULL
  },
 
- {
- {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR,
- gettext_noop("Writes temporary statistics files to the specified directory."),
- NULL,
- GUC_SUPERUSER_ONLY
- },
- &pgstat_temp_directory,
- PG_STAT_TMP_DIR,
- check_canonical_path, assign_pgstat_temp_directory, NULL
- },
-
  {
  {"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER,
  gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
@@ -11434,35 +11422,6 @@ assign_effective_io_concurrency(int newval, void *extra)
 #endif /* USE_PREFETCH */
 }
 
-static void
-assign_pgstat_temp_directory(const char *newval, void *extra)
-{
- /* check_canonical_path already canonicalized newval for us */
- char   *dname;
- char   *tname;
- char   *fname;
-
- /* directory */
- dname = guc_malloc(ERROR, strlen(newval) + 1); /* runtime dir */
- sprintf(dname, "%s", newval);
-
- /* global stats */
- tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */
- sprintf(tname, "%s/global.tmp", newval);
- fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */
- sprintf(fname, "%s/global.stat", newval);
-
- if (pgstat_stat_directory)
- free(pgstat_stat_directory);
- pgstat_stat_directory = dname;
- if (pgstat_stat_tmpname)
- free(pgstat_stat_tmpname);
- pgstat_stat_tmpname = tname;
- if (pgstat_stat_filename)
- free(pgstat_stat_filename);
- pgstat_stat_filename = fname;
-}
-
 static bool
 check_application_name(char **newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 46a06ffacd..7aeb789b0e 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -570,7 +570,6 @@
 #track_io_timing = off
 #track_functions = none # none, pl, all
 #track_activity_query_size = 1024 # (change requires restart)
-#stats_temp_directory = 'pg_stat_tmp'
 
 
 # - Monitoring -
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c9fbcead3f..e9e18ed27a 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -30,7 +30,10 @@
 #define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat"
 #define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp"
 
-/* Default directory to store temporary statistics data in */
+/*
+ * This used to be the direct