@@ -559,6 +559,16 @@ typedef struct XLogCtlData
559559 slock_t info_lck ; /* locks shared variables shown above */
560560} XLogCtlData ;
561561
562+ /*
563+ * Classification of XLogInsertRecord operations.
564+ */
565+ typedef enum
566+ {
567+ WALINSERT_NORMAL , /* default: record needs no special handling */
568+ WALINSERT_SPECIAL_SWITCH , /* RM_XLOG_ID record whose info is XLOG_SWITCH */
569+ WALINSERT_SPECIAL_CHECKPOINT /* RM_XLOG_ID record whose info is XLOG_CHECKPOINT_REDO */
570+ } WalInsertClass ;
571+
562572static XLogCtlData * XLogCtl = NULL ;
563573
564574/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@@ -739,13 +749,21 @@ XLogInsertRecord(XLogRecData *rdata,
739749 bool inserted ;
740750 XLogRecord * rechdr = (XLogRecord * ) rdata -> data ;
741751 uint8 info = rechdr -> xl_info & ~XLR_INFO_MASK ;
742- bool isLogSwitch = (rechdr -> xl_rmid == RM_XLOG_ID &&
743- info == XLOG_SWITCH );
752+ WalInsertClass class = WALINSERT_NORMAL ;
744753 XLogRecPtr StartPos ;
745754 XLogRecPtr EndPos ;
746755 bool prevDoPageWrites = doPageWrites ;
747756 TimeLineID insertTLI ;
748757
758+ /* Does this record type require special handling? */
759+ if (unlikely (rechdr -> xl_rmid == RM_XLOG_ID ))
760+ {
761+ if (info == XLOG_SWITCH )
762+ class = WALINSERT_SPECIAL_SWITCH ;
763+ else if (info == XLOG_CHECKPOINT_REDO )
764+ class = WALINSERT_SPECIAL_CHECKPOINT ;
765+ }
766+
749767 /* we assume that all of the record header is in the first chunk */
750768 Assert (rdata -> len >= SizeOfXLogRecord );
751769
@@ -793,7 +811,7 @@ XLogInsertRecord(XLogRecData *rdata,
793811 */
794812 START_CRIT_SECTION ();
795813
796- if (likely (! isLogSwitch ))
814+ if (likely (class == WALINSERT_NORMAL ))
797815 {
798816 WALInsertLockAcquire ();
799817
@@ -843,7 +861,7 @@ XLogInsertRecord(XLogRecData *rdata,
843861 /* Normal records are always inserted. */
844862 inserted = true;
845863 }
846- else
864+ else if ( class == WALINSERT_SPECIAL_SWITCH )
847865 {
848866 /*
849867 * In order to insert an XLOG_SWITCH record, we need to hold all of
@@ -852,14 +870,32 @@ XLogInsertRecord(XLogRecData *rdata,
852870 * remains in the current WAL segment and claimed all of it.
853871 *
854872 * Nonetheless, this case is simpler than the normal cases handled
855- * above , which must check for changes in doPageWrites and RedoRecPtr.
856- * Those checks are only needed for records that can contain
857- * full-pages images , and an XLOG_SWITCH record never does.
873+ * above , which must check for changes in doPageWrites and RedoRecPtr.
874+ * Those checks are only needed for records that can contain buffer
875+ * references , and an XLOG_SWITCH record never does.
858876 */
859877 Assert (fpw_lsn == InvalidXLogRecPtr );
860878 WALInsertLockAcquireExclusive ();
861879 inserted = ReserveXLogSwitch (& StartPos , & EndPos , & rechdr -> xl_prev );
862880 }
881+ else
882+ {
883+ Assert (class == WALINSERT_SPECIAL_CHECKPOINT );
884+
885+ /*
886+ * We need to update both the local and shared copies of RedoRecPtr,
887+ * which means that we need to hold all the WAL insertion locks.
888+ * However, there can't be any buffer references, so as above, we need
889+ * not check RedoRecPtr before inserting the record; we just need to
890+ * update it afterwards.
891+ */
892+ Assert (fpw_lsn == InvalidXLogRecPtr );
893+ WALInsertLockAcquireExclusive ();
894+ ReserveXLogInsertLocation (rechdr -> xl_tot_len , & StartPos , & EndPos ,
895+ & rechdr -> xl_prev );
896+ RedoRecPtr = Insert -> RedoRecPtr = StartPos ;
897+ inserted = true;
898+ }
863899
864900 if (inserted )
865901 {
@@ -876,7 +912,8 @@ XLogInsertRecord(XLogRecData *rdata,
876912 * All the record data, including the header, is now ready to be
877913 * inserted. Copy the record in the space reserved.
878914 */
879- CopyXLogRecordToWAL (rechdr -> xl_tot_len , isLogSwitch , rdata ,
915+ CopyXLogRecordToWAL (rechdr -> xl_tot_len ,
916+ class == WALINSERT_SPECIAL_SWITCH , rdata ,
880917 StartPos , EndPos , insertTLI );
881918
882919 /*
@@ -935,7 +972,7 @@ XLogInsertRecord(XLogRecData *rdata,
935972 * padding space that fills the rest of the segment, and perform
936973 * end-of-segment actions (eg, notifying archiver).
937974 */
938- if (isLogSwitch )
975+ if (class == WALINSERT_SPECIAL_SWITCH )
939976 {
940977 TRACE_POSTGRESQL_WAL_SWITCH ();
941978 XLogFlush (EndPos );
@@ -1054,8 +1091,12 @@ XLogInsertRecord(XLogRecData *rdata,
10541091 *
10551092 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
10561093 * where we actually copy the record to the reserved space.
1094+ *
1095+ * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
1096+ * however, because there are two call sites, the compiler is reluctant to
1097+ * inline. We use pg_attribute_always_inline here to try to convince it.
10571098 */
1058- static void
1099+ static pg_attribute_always_inline void
10591100ReserveXLogInsertLocation (int size , XLogRecPtr * StartPos , XLogRecPtr * EndPos ,
10601101 XLogRecPtr * PrevPtr )
10611102{
@@ -6475,17 +6516,22 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
64756516 * In particular note that this routine is synchronous and does not pay
64766517 * attention to CHECKPOINT_WAIT.
64776518 *
6478- * If !shutdown then we are writing an online checkpoint. This is a very special
6479- * kind of operation and WAL record because the checkpoint action occurs over
6480- * a period of time yet logically occurs at just a single LSN. The logical
6481- * position of the WAL record (redo ptr) is the same or earlier than the
6482- * physical position. When we replay WAL we locate the checkpoint via its
6483- * physical position then read the redo ptr and actually start replay at the
6484- * earlier logical position. Note that we don't write *anything* to WAL at
6485- * the logical position, so that location could be any other kind of WAL record.
6486- * All of this mechanism allows us to continue working while we checkpoint.
6487- * As a result, timing of actions is critical here and be careful to note that
6488- * this function will likely take minutes to execute on a busy system.
6519+ * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
6520+ * record is inserted into WAL at the logical location of the checkpoint, before
6521+ * flushing anything to disk; once the checkpoint is eventually completed,
6522+ * it is from this point that WAL replay will begin in the case of a recovery
6523+ * from this checkpoint. Once everything is written to disk, an
6524+ * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
6525+ * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
6526+ * other write-ahead log records to be written while the checkpoint is in
6527+ * progress, but we must be very careful about order of operations. This function
6528+ * may take many minutes to execute on a busy system.
6529+ *
6530+ * On the other hand, when shutdown is true, concurrent insertion into the
6531+ * write-ahead log is impossible, so there is no need for two separate records.
6532+ * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
6533+ * both the record marking the completion of the checkpoint and the location
6534+ * from which WAL replay would begin if needed.
64896535 */
64906536void
64916537CreateCheckPoint (int flags )
@@ -6497,7 +6543,6 @@ CreateCheckPoint(int flags)
64976543 XLogCtlInsert * Insert = & XLogCtl -> Insert ;
64986544 uint32 freespace ;
64996545 XLogRecPtr PriorRedoPtr ;
6500- XLogRecPtr curInsert ;
65016546 XLogRecPtr last_important_lsn ;
65026547 VirtualTransactionId * vxids ;
65036548 int nvxids ;
@@ -6567,13 +6612,6 @@ CreateCheckPoint(int flags)
65676612 */
65686613 last_important_lsn = GetLastImportantRecPtr ();
65696614
6570- /*
6571- * We must block concurrent insertions while examining insert state to
6572- * determine the checkpoint REDO pointer.
6573- */
6574- WALInsertLockAcquireExclusive ();
6575- curInsert = XLogBytePosToRecPtr (Insert -> CurrBytePos );
6576-
65776615 /*
65786616 * If this isn't a shutdown or forced checkpoint, and if there has been no
65796617 * WAL activity requiring a checkpoint, skip it. The idea here is to
@@ -6584,7 +6622,6 @@ CreateCheckPoint(int flags)
65846622 {
65856623 if (last_important_lsn == ControlFile -> checkPoint )
65866624 {
6587- WALInsertLockRelease ();
65886625 END_CRIT_SECTION ();
65896626 ereport (DEBUG1 ,
65906627 (errmsg_internal ("checkpoint skipped because system is idle" )));
@@ -6606,45 +6643,81 @@ CreateCheckPoint(int flags)
66066643 else
66076644 checkPoint .PrevTimeLineID = checkPoint .ThisTimeLineID ;
66086645
6609- checkPoint .fullPageWrites = Insert -> fullPageWrites ;
6610-
66116646 /*
6612- * Compute new REDO record ptr = location of next XLOG record.
6613- *
6614- * NB: this is NOT necessarily where the checkpoint record itself will be,
6615- * since other backends may insert more XLOG records while we're off doing
6616- * the buffer flush work. Those XLOG records are logically after the
6617- * checkpoint, even though physically before it. Got that?
6647+ * We must block concurrent insertions while examining insert state.
66186648 */
6619- freespace = INSERT_FREESPACE (curInsert );
6620- if (freespace == 0 )
6649+ WALInsertLockAcquireExclusive ();
6650+
6651+ checkPoint .fullPageWrites = Insert -> fullPageWrites ;
6652+
6653+ if (shutdown )
66216654 {
6622- if (XLogSegmentOffset (curInsert , wal_segment_size ) == 0 )
6623- curInsert += SizeOfXLogLongPHD ;
6624- else
6625- curInsert += SizeOfXLogShortPHD ;
6626- }
6627- checkPoint .redo = curInsert ;
6655+ XLogRecPtr curInsert = XLogBytePosToRecPtr (Insert -> CurrBytePos );
66286656
6629- /*
6630- * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6631- * must be done while holding all the insertion locks.
6632- *
6633- * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6634- * pointing past where it really needs to point. This is okay; the only
6635- * consequence is that XLogInsert might back up whole buffers that it
6636- * didn't really need to. We can't postpone advancing RedoRecPtr because
6637- * XLogInserts that happen while we are dumping buffers must assume that
6638- * their buffer changes are not included in the checkpoint.
6639- */
6640- RedoRecPtr = XLogCtl -> Insert .RedoRecPtr = checkPoint .redo ;
6657+ /*
6658+ * Compute new REDO record ptr = location of next XLOG record.
6659+ *
6660+ * Since this is a shutdown checkpoint, there can't be any concurrent
6661+ * WAL insertion.
6662+ */
6663+ freespace = INSERT_FREESPACE (curInsert );
6664+ if (freespace == 0 )
6665+ {
6666+ if (XLogSegmentOffset (curInsert , wal_segment_size ) == 0 )
6667+ curInsert += SizeOfXLogLongPHD ;
6668+ else
6669+ curInsert += SizeOfXLogShortPHD ;
6670+ }
6671+ checkPoint .redo = curInsert ;
6672+
6673+ /*
6674+ * Here we update the shared RedoRecPtr for future XLogInsert calls;
6675+ * this must be done while holding all the insertion locks.
6676+ *
6677+ * Note: if we fail to complete the checkpoint, RedoRecPtr will be
6678+ * left pointing past where it really needs to point. This is okay;
6679+ * the only consequence is that XLogInsert might back up whole buffers
6680+ * that it didn't really need to. We can't postpone advancing
6681+ * RedoRecPtr because XLogInserts that happen while we are dumping
6682+ * buffers must assume that their buffer changes are not included in
6683+ * the checkpoint.
6684+ */
6685+ RedoRecPtr = XLogCtl -> Insert .RedoRecPtr = checkPoint .redo ;
6686+ }
66416687
66426688 /*
66436689 * Now we can release the WAL insertion locks, allowing other xacts to
66446690 * proceed while we are flushing disk buffers.
66456691 */
66466692 WALInsertLockRelease ();
66476693
6694+ /*
6695+ * If this is an online checkpoint, we have not yet determined the redo
6696+ * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
6697+ * record; the LSN at which it starts becomes the new redo pointer. We
6698+ * don't do this for a shutdown checkpoint, because in that case no WAL
6699+ * can be written between the redo point and the insertion of the
6700+ * checkpoint record itself, so the checkpoint record itself serves to
6701+ * mark the redo point.
6702+ */
6703+ if (!shutdown )
6704+ {
6705+ int dummy = 0 ;
6706+
6707+ /* Record must have payload to avoid assertion failure. */
6708+ XLogBeginInsert ();
6709+ XLogRegisterData ((char * ) & dummy , sizeof (dummy ));
6710+ (void ) XLogInsert (RM_XLOG_ID , XLOG_CHECKPOINT_REDO );
6711+
6712+ /*
6713+ * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
6714+ * shared memory and RedoRecPtr in backend-local memory, but we need
6715+ * to copy that into the record that will be inserted when the
6716+ * checkpoint is complete.
6717+ */
6718+ checkPoint .redo = RedoRecPtr ;
6719+ }
6720+
66486721 /* Update the info_lck-protected copy of RedoRecPtr as well */
66496722 SpinLockAcquire (& XLogCtl -> info_lck );
66506723 XLogCtl -> RedoRecPtr = checkPoint .redo ;
@@ -8105,6 +8178,10 @@ xlog_redo(XLogReaderState *record)
81058178 /* Keep track of full_page_writes */
81068179 lastFullPageWrites = fpw ;
81078180 }
8181+ else if (info == XLOG_CHECKPOINT_REDO )
8182+ {
8183+ /* nothing to do here, just for informational purposes */
8184+ }
81088185}
81098186
81108187/*
0 commit comments