@@ -694,6 +694,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
694694 uint32 freespace ;
695695 int curridx ;
696696 XLogRecData * rdt ;
697+ XLogRecData * rdt_lastnormal ;
697698 Buffer dtbuf [XLR_MAX_BKP_BLOCKS ];
698699 bool dtbuf_bkp [XLR_MAX_BKP_BLOCKS ];
699700 BkpBlock dtbuf_xlg [XLR_MAX_BKP_BLOCKS ];
@@ -708,6 +709,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
708709 bool updrqst ;
709710 bool doPageWrites ;
710711 bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH );
712+ uint8 info_orig = info ;
711713
712714 /* cross-check on whether we should be here or not */
713715 if (!XLogInsertAllowed ())
@@ -731,23 +733,18 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
731733 }
732734
733735 /*
734- * Here we scan the rdata chain, determine which buffers must be backed
735- * up, and compute the CRC values for the data. Note that the record
736- * header isn't added into the CRC initially since we don't know the final
737- * length or info bits quite yet. Thus, the CRC will represent the CRC of
738- * the whole record in the order "rdata, then backup blocks, then record
739- * header".
736+ * Here we scan the rdata chain to determine which buffers must be backed
737+ * up.
740738 *
741739 * We may have to loop back to here if a race condition is detected below.
742740 * We could prevent the race by doing all this work while holding the
743741 * insert lock, but it seems better to avoid doing CRC calculations while
744- * holding the lock. This means we have to be careful about modifying the
745- * rdata chain until we know we aren't going to loop back again. The only
746- * change we allow ourselves to make earlier is to set rdt->data = NULL in
747- * chain items we have decided we will have to back up the whole buffer
748- * for. This is OK because we will certainly decide the same thing again
749- * for those items if we do it over; doing it here saves an extra pass
750- * over the chain later.
742+ * holding the lock.
743+ *
744+ * We add entries for backup blocks to the chain, so that they don't
745+ * need any special treatment in the critical section where the chunks are
746+ * copied into the WAL buffers. Those entries have to be unlinked from the
747+ * chain if we have to loop back here.
751748 */
752749begin :;
753750 for (i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
@@ -764,15 +761,13 @@ begin:;
764761 */
765762 doPageWrites = fullPageWrites || Insert -> forcePageWrites ;
766763
767- INIT_CRC32 (rdata_crc );
768764 len = 0 ;
769765 for (rdt = rdata ;;)
770766 {
771767 if (rdt -> buffer == InvalidBuffer )
772768 {
773769 /* Simple data, just include it */
774770 len += rdt -> len ;
775- COMP_CRC32 (rdata_crc , rdt -> data , rdt -> len );
776771 }
777772 else
778773 {
@@ -783,12 +778,12 @@ begin:;
783778 {
784779 /* Buffer already referenced by earlier chain item */
785780 if (dtbuf_bkp [i ])
781+ {
786782 rdt -> data = NULL ;
783+ rdt -> len = 0 ;
784+ }
787785 else if (rdt -> data )
788- {
789786 len += rdt -> len ;
790- COMP_CRC32 (rdata_crc , rdt -> data , rdt -> len );
791- }
792787 break ;
793788 }
794789 if (dtbuf [i ] == InvalidBuffer )
@@ -800,12 +795,10 @@ begin:;
800795 {
801796 dtbuf_bkp [i ] = true;
802797 rdt -> data = NULL ;
798+ rdt -> len = 0 ;
803799 }
804800 else if (rdt -> data )
805- {
806801 len += rdt -> len ;
807- COMP_CRC32 (rdata_crc , rdt -> data , rdt -> len );
808- }
809802 break ;
810803 }
811804 }
@@ -819,39 +812,6 @@ begin:;
819812 rdt = rdt -> next ;
820813 }
821814
822- /*
823- * Now add the backup block headers and data into the CRC
824- */
825- for (i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
826- {
827- if (dtbuf_bkp [i ])
828- {
829- BkpBlock * bkpb = & (dtbuf_xlg [i ]);
830- char * page ;
831-
832- COMP_CRC32 (rdata_crc ,
833- (char * ) bkpb ,
834- sizeof (BkpBlock ));
835- page = (char * ) BufferGetBlock (dtbuf [i ]);
836- if (bkpb -> hole_length == 0 )
837- {
838- COMP_CRC32 (rdata_crc ,
839- page ,
840- BLCKSZ );
841- }
842- else
843- {
844- /* must skip the hole */
845- COMP_CRC32 (rdata_crc ,
846- page ,
847- bkpb -> hole_offset );
848- COMP_CRC32 (rdata_crc ,
849- page + (bkpb -> hole_offset + bkpb -> hole_length ),
850- BLCKSZ - (bkpb -> hole_offset + bkpb -> hole_length ));
851- }
852- }
853- }
854-
855815 /*
856816 * NOTE: We disallow len == 0 because it provides a useful bit of extra
857817 * error checking in ReadRecord. This means that all callers of
@@ -862,70 +822,20 @@ begin:;
862822 if (len == 0 && !isLogSwitch )
863823 elog (PANIC , "invalid xlog record length %u" , len );
864824
865- START_CRIT_SECTION ();
866-
867- /* Now wait to get insert lock */
868- LWLockAcquire (WALInsertLock , LW_EXCLUSIVE );
869-
870- /*
871- * Check to see if my RedoRecPtr is out of date. If so, may have to go
872- * back and recompute everything. This can only happen just after a
873- * checkpoint, so it's better to be slow in this case and fast otherwise.
874- *
875- * If we aren't doing full-page writes then RedoRecPtr doesn't actually
876- * affect the contents of the XLOG record, so we'll update our local copy
877- * but not force a recomputation.
878- */
879- if (!XLByteEQ (RedoRecPtr , Insert -> RedoRecPtr ))
880- {
881- Assert (XLByteLT (RedoRecPtr , Insert -> RedoRecPtr ));
882- RedoRecPtr = Insert -> RedoRecPtr ;
883-
884- if (doPageWrites )
885- {
886- for (i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
887- {
888- if (dtbuf [i ] == InvalidBuffer )
889- continue ;
890- if (dtbuf_bkp [i ] == false &&
891- XLByteLE (dtbuf_lsn [i ], RedoRecPtr ))
892- {
893- /*
894- * Oops, this buffer now needs to be backed up, but we
895- * didn't think so above. Start over.
896- */
897- LWLockRelease (WALInsertLock );
898- END_CRIT_SECTION ();
899- goto begin ;
900- }
901- }
902- }
903- }
904-
905- /*
906- * Also check to see if forcePageWrites was just turned on; if we weren't
907- * already doing full-page writes then go back and recompute. (If it was
908- * just turned off, we could recompute the record without full pages, but
909- * we choose not to bother.)
910- */
911- if (Insert -> forcePageWrites && !doPageWrites )
912- {
913- /* Oops, must redo it with full-page data */
914- LWLockRelease (WALInsertLock );
915- END_CRIT_SECTION ();
916- goto begin ;
917- }
918-
919825 /*
920826 * Make additional rdata chain entries for the backup blocks, so that we
921- * don't need to special-case them in the write loop. Note that we have
922- * now irrevocably changed the input rdata chain. At the exit of this
923- * loop, write_len includes the backup block data.
827+ * don't need to special-case them in the write loop. This modifies the
828+ * original rdata chain, but we keep a pointer to the last regular entry,
829+ * rdt_lastnormal, so that we can undo this if we have to loop back to the
830+ * beginning.
831+ *
832+ * At the exit of this loop, write_len includes the backup block data.
924833 *
925834 * Also set the appropriate info bits to show which buffers were backed
926835 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
927836 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
928837 */
838+ rdt_lastnormal = rdt ;
929839 write_len = len ;
930840 for (i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
931841 {
@@ -974,6 +884,76 @@ begin:;
974884 }
975885 }
976886
887+ /*
888+ * Calculate CRC of the data, including all the backup blocks
889+ *
890+ * Note that the record header isn't added into the CRC initially since
891+ * we don't know the prev-link yet. Thus, the CRC will represent the CRC
892+ * of the whole record in the order: rdata, then backup blocks, then
893+ * record header.
894+ */
895+ INIT_CRC32 (rdata_crc );
896+ for (rdt = rdata ; rdt != NULL ; rdt = rdt -> next )
897+ COMP_CRC32 (rdata_crc , rdt -> data , rdt -> len );
898+
899+ START_CRIT_SECTION ();
900+
901+ /* Now wait to get insert lock */
902+ LWLockAcquire (WALInsertLock , LW_EXCLUSIVE );
903+
904+ /*
905+ * Check to see if my RedoRecPtr is out of date. If so, may have to go
906+ * back and recompute everything. This can only happen just after a
907+ * checkpoint, so it's better to be slow in this case and fast otherwise.
908+ *
909+ * If we aren't doing full-page writes then RedoRecPtr doesn't actually
910+ * affect the contents of the XLOG record, so we'll update our local copy
911+ * but not force a recomputation.
912+ */
913+ if (!XLByteEQ (RedoRecPtr , Insert -> RedoRecPtr ))
914+ {
915+ Assert (XLByteLT (RedoRecPtr , Insert -> RedoRecPtr ));
916+ RedoRecPtr = Insert -> RedoRecPtr ;
917+
918+ if (doPageWrites )
919+ {
920+ for (i = 0 ; i < XLR_MAX_BKP_BLOCKS ; i ++ )
921+ {
922+ if (dtbuf [i ] == InvalidBuffer )
923+ continue ;
924+ if (dtbuf_bkp [i ] == false &&
925+ XLByteLE (dtbuf_lsn [i ], RedoRecPtr ))
926+ {
927+ /*
928+ * Oops, this buffer now needs to be backed up, but we
929+ * didn't think so above. Start over.
930+ */
931+ LWLockRelease (WALInsertLock );
932+ END_CRIT_SECTION ();
933+ rdt_lastnormal -> next = NULL ;
934+ info = info_orig ;
935+ goto begin ;
936+ }
937+ }
938+ }
939+ }
940+
941+ /*
942+ * Also check to see if forcePageWrites was just turned on; if we weren't
943+ * already doing full-page writes then go back and recompute. (If it was
944+ * just turned off, we could recompute the record without full pages, but
945+ * we choose not to bother.)
946+ */
947+ if (Insert -> forcePageWrites && !doPageWrites )
948+ {
949+ /* Oops, must redo it with full-page data. */
950+ LWLockRelease (WALInsertLock );
951+ END_CRIT_SECTION ();
952+ rdt_lastnormal -> next = NULL ;
953+ info = info_orig ;
954+ goto begin ;
955+ }
956+
977957 /*
978958 * If there isn't enough space on the current XLOG page for a record
979959 * header, advance to the next page (leaving the unused space as zeroes).