@@ -182,6 +182,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1064,14 +1065,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
@@ -1090,10 +1092,20 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
     /* Count all backend writes regardless of if they fit in the queue */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request.  But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
-        /* Also count the subset where backends have to do their own fsync */
+        /*
+         * Count the subset of writes where backends have to do their own
+         * fsync
+         */
         BgWriterShmem->num_backend_fsync++;
         LWLockRelease(BgWriterCommLock);
         return false;
@@ -1106,6 +1118,108 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *        Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request.  It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying.  But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way.  It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *        Retrieve queued fsync requests and pass them to local smgr.
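
As a standalone illustration of the dedup-and-slide approach used by CompactBgwriterRequestQueue() above (an entry can be dropped if an identical request appears later in the queue, and the survivors are then slid down in order), here is a small self-contained C sketch. The ToyRequest type and the compact_queue() and same_request() helpers are hypothetical names invented for this example; the actual patch keys a dynahash table on the request contents to find duplicates in a single pass, whereas this sketch uses a simple O(n^2) scan for brevity.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for BgWriterRequest (rnode, forknum, segno). */
typedef struct ToyRequest
{
    unsigned    relnode;
    int         forknum;
    unsigned    segno;
} ToyRequest;

static bool
same_request(const ToyRequest *a, const ToyRequest *b)
{
    return a->relnode == b->relnode &&
           a->forknum == b->forknum &&
           a->segno == b->segno;
}

/*
 * Compact reqs[] in place, keeping only the last copy of each duplicate
 * request and preserving the relative order of the survivors.  Returns
 * the new queue length.
 */
static int
compact_queue(ToyRequest *reqs, bool *skip, int nreqs)
{
    int         n,
                preserve_count;

    /* Mark an entry for skipping if an identical request follows it. */
    for (n = 0; n < nreqs; n++)
    {
        int         later;

        skip[n] = false;
        for (later = n + 1; later < nreqs; later++)
        {
            if (same_request(&reqs[n], &reqs[later]))
            {
                skip[n] = true;
                break;
            }
        }
    }

    /* Slide the surviving entries down over the skipped ones. */
    for (n = 0, preserve_count = 0; n < nreqs; n++)
    {
        if (skip[n])
            continue;
        reqs[preserve_count++] = reqs[n];
    }
    return preserve_count;
}

int
main(void)
{
    ToyRequest  reqs[] = {
        {1, 0, 3}, {2, 0, 7}, {1, 0, 3}, {2, 0, 7}, {5, 1, 0}
    };
    bool        skip[5];
    int         nreqs = 5;

    nreqs = compact_queue(reqs, skip, nreqs);
    printf("compacted fsync request queue to %d entries\n", nreqs);    /* prints 3 */
    return 0;
}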