4242#include "miscadmin.h"
4343#include "pgstat.h"
4444#include "postmaster/bgwriter.h"
45+ #include "postmaster/walwriter.h"
4546#include "postmaster/startup.h"
4647#include "replication/basebackup.h"
4748#include "replication/logical.h"
@@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record)
27292730}
27302731
27312732/*
2732- * Flush xlog, but without specifying exactly where to flush to.
2733+ * Write & flush xlog, but without specifying exactly where to.
27332734 *
2734- * We normally flush only completed blocks; but if there is nothing to do on
2735- * that basis, we check for unflushed async commits in the current incomplete
2736- * block, and flush through the latest one of those. Thus, if async commits
2737- * are not being used, we will flush complete blocks only. We can guarantee
2738- * that async commits reach disk after at most three cycles; normally only
2739- * one or two. (When flushing complete blocks, we allow XLogWrite to write
2740- * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2741- * difference only with very high load or long wal_writer_delay, but imposes
2742- * one extra cycle for the worst case for async commits.)
2735+ * We normally write only completed blocks; but if there is nothing to do on
2736+ * that basis, we check for unwritten async commits in the current incomplete
2737+ * block, and write through the latest one of those. Thus, if async commits
2738+ * are not being used, we will write complete blocks only.
2739+ *
2740+ * If, based on the above, there's anything to write, we do so immediately.
2741+ * But to avoid calling fsync, fdatasync et al. at a rate that'd impact
2742+ * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there
2743+ * are more than wal_writer_flush_after unflushed blocks.
2744+ *
2745+ * We can guarantee that async commits reach disk after at most three
2746+ * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2747+ * to write "flexibly", meaning it can stop at the end of the buffer ring;
2748+ * this makes a difference only with very high load or long wal_writer_delay,
2749+ * but imposes one extra cycle for the worst case for async commits.)
27432750 *
27442751 * This routine is invoked periodically by the background walwriter process.
27452752 *
2746- * Returns TRUE if we flushed anything.
2753+ * Returns TRUE if there was any work to do, even if we skipped flushing due
2754+ * to wal_writer_delay/wal_writer_flush_after.
27472755 */
27482756bool
27492757XLogBackgroundFlush (void )
27502758{
2751- XLogRecPtr WriteRqstPtr ;
2759+ XLogwrtRqst WriteRqst ;
27522760 bool flexible = true;
2753- bool wrote_something = false;
2761+ static TimestampTz lastflush ;
2762+ TimestampTz now ;
2763+ int flushbytes ;
27542764
27552765 /* XLOG doesn't need flushing during recovery */
27562766 if (RecoveryInProgress ())
@@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void)
27592769 /* read LogwrtResult and update local state */
27602770 SpinLockAcquire (& XLogCtl -> info_lck );
27612771 LogwrtResult = XLogCtl -> LogwrtResult ;
2762- WriteRqstPtr = XLogCtl -> LogwrtRqst . Write ;
2772+ WriteRqst = XLogCtl -> LogwrtRqst ;
27632773 SpinLockRelease (& XLogCtl -> info_lck );
27642774
27652775 /* back off to last completed page boundary */
2766- WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ ;
2776+ WriteRqst . Write -= WriteRqst . Write % XLOG_BLCKSZ ;
27672777
27682778 /* if we have already flushed that far, consider async commit records */
2769- if (WriteRqstPtr <= LogwrtResult .Flush )
2779+ if (WriteRqst . Write <= LogwrtResult .Flush )
27702780 {
27712781 SpinLockAcquire (& XLogCtl -> info_lck );
2772- WriteRqstPtr = XLogCtl -> asyncXactLSN ;
2782+ WriteRqst . Write = XLogCtl -> asyncXactLSN ;
27732783 SpinLockRelease (& XLogCtl -> info_lck );
27742784 flexible = false; /* ensure it all gets written */
27752785 }
@@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void)
27792789 * holding an open file handle to a logfile that's no longer in use,
27802790 * preventing the file from being deleted.
27812791 */
2782- if (WriteRqstPtr <= LogwrtResult .Flush )
2792+ if (WriteRqst . Write <= LogwrtResult .Flush )
27832793 {
27842794 if (openLogFile >= 0 )
27852795 {
@@ -2791,28 +2801,61 @@ XLogBackgroundFlush(void)
27912801 return false;
27922802 }
27932803
2804+ /*
2805+ * Determine how far to flush WAL, based on the wal_writer_delay and
2806+ * wal_writer_flush_after GUCs.
2807+ */
2808+ now = GetCurrentTimestamp ();
2809+ flushbytes =
2810+ WriteRqst .Write / XLOG_BLCKSZ - LogwrtResult .Flush / XLOG_BLCKSZ ;
2811+
2812+ if (WalWriterFlushAfter == 0 || lastflush == 0 )
2813+ {
2814+ /* first call, or block based limits disabled */
2815+ WriteRqst .Flush = WriteRqst .Write ;
2816+ lastflush = now ;
2817+ }
2818+ else if (TimestampDifferenceExceeds (lastflush , now , WalWriterDelay ))
2819+ {
2820+ /*
2821+ * Flush the writes at least every WalWriterDelay ms. This is important
2822+ * to bound the amount of time it takes for an asynchronous commit to
2823+ * hit disk.
2824+ */
2825+ WriteRqst .Flush = WriteRqst .Write ;
2826+ lastflush = now ;
2827+ }
2828+ else if (flushbytes >= WalWriterFlushAfter )
2829+ {
2830+ /* exceeded wal_writer_flush_after blocks, flush */
2831+ WriteRqst .Flush = WriteRqst .Write ;
2832+ lastflush = now ;
2833+ }
2834+ else
2835+ {
2836+ /* no flushing, this time round */
2837+ WriteRqst .Flush = 0 ;
2838+ }
2839+
27942840#ifdef WAL_DEBUG
27952841 if (XLOG_DEBUG )
2796- elog (LOG , "xlog bg flush request %X/%X; write %X/%X; flush %X/%X" ,
2797- (uint32 ) (WriteRqstPtr >> 32 ), (uint32 ) WriteRqstPtr ,
2842+ elog (LOG , "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X" ,
2843+ (uint32 ) (WriteRqst .Write >> 32 ), (uint32 ) WriteRqst .Write ,
2844+ (uint32 ) (WriteRqst .Flush >> 32 ), (uint32 ) WriteRqst .Flush ,
27982845 (uint32 ) (LogwrtResult .Write >> 32 ), (uint32 ) LogwrtResult .Write ,
27992846 (uint32 ) (LogwrtResult .Flush >> 32 ), (uint32 ) LogwrtResult .Flush );
28002847#endif
28012848
28022849 START_CRIT_SECTION ();
28032850
28042851 /* now wait for any in-progress insertions to finish and get write lock */
2805- WaitXLogInsertionsToFinish (WriteRqstPtr );
2852+ WaitXLogInsertionsToFinish (WriteRqst . Write );
28062853 LWLockAcquire (WALWriteLock , LW_EXCLUSIVE );
28072854 LogwrtResult = XLogCtl -> LogwrtResult ;
2808- if (WriteRqstPtr > LogwrtResult .Flush )
2855+ if (WriteRqst .Write > LogwrtResult .Write ||
2856+ WriteRqst .Flush > LogwrtResult .Flush )
28092857 {
2810- XLogwrtRqst WriteRqst ;
2811-
2812- WriteRqst .Write = WriteRqstPtr ;
2813- WriteRqst .Flush = WriteRqstPtr ;
28142858 XLogWrite (WriteRqst , flexible );
2815- wrote_something = true;
28162859 }
28172860 LWLockRelease (WALWriteLock );
28182861
@@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void)
28272870 */
28282871 AdvanceXLInsertBuffer (InvalidXLogRecPtr , true);
28292872
2830- return wrote_something ;
2873+ /*
2874+ * If we determined that we need to write data, but somebody else
2875+ * wrote/flushed already, it should be considered as being active, to
2876+ * avoid hibernating too early.
2877+ */
2878+ return true;
28312879}
28322880
28332881/*
0 commit comments