@@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
145145 bool rootdescend );
146146static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
147147 BtreeLevel level );
148+ static void bt_recheck_sibling_links (BtreeCheckState * state ,
149+ BlockNumber btpo_prev_from_target ,
150+ BlockNumber leftcurrent );
148151static void bt_target_page_check (BtreeCheckState * state );
149152static BTScanInsert bt_right_page_check_scankey (BtreeCheckState * state );
150153static void bt_child_check (BtreeCheckState * state , BTScanInsert targetkey ,
@@ -787,17 +790,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
787790 */
788791 }
789792
790- /*
791- * readonly mode can only ever land on live pages and half-dead pages,
792- * so sibling pointers should always be in mutual agreement
793- */
794- if (state -> readonly && opaque -> btpo_prev != leftcurrent )
795- ereport (ERROR ,
796- (errcode (ERRCODE_INDEX_CORRUPTED ),
797- errmsg ("left link/right link pair in index \"%s\" not in agreement" ,
798- RelationGetRelationName (state -> rel )),
799- errdetail_internal ("Block=%u left block=%u left link from block=%u." ,
800- current , leftcurrent , opaque -> btpo_prev )));
793+ /* Sibling links should be in mutual agreement */
794+ if (opaque -> btpo_prev != leftcurrent )
795+ bt_recheck_sibling_links (state , opaque -> btpo_prev , leftcurrent );
801796
802797 /* Check level, which must be valid for non-ignorable page */
803798 if (level .level != opaque -> btpo .level )
@@ -877,6 +872,140 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
877872 return nextleveldown ;
878873}
879874
875+ /*
876+ * Raise an error when target page's left link does not point back to the
877+ * previous target page, called leftcurrent here. The leftcurrent page's
878+ * right link was followed to get to the current target page, and we expect
879+ * mutual agreement between leftcurrent and the current target page. Make sure
880+ * that this condition has definitely been violated in the !readonly case,
881+ * where concurrent page splits are something that we need to deal with.
882+ *
883+ * Cross-page inconsistencies involving pages that don't agree about being
884+ * siblings are known to be a particularly good indicator of corruption
885+ * involving partial writes/lost updates. The bt_right_page_check_scankey
886+ * check also provides a way of detecting cross-page inconsistencies for
887+ * !readonly callers, but it can only detect sibling pages that have an
888+ * out-of-order keyspace, which can't catch many of the problems that we
889+ * expect to catch here.
890+ *
891+ * The classic example of the kind of inconsistency that we can only catch
892+ * with this check (when in !readonly mode) involves three sibling pages that
893+ * were affected by a faulty page split at some point in the past. The
894+ * effects of the split are reflected in the original page and its new right
895+ * sibling page, with a lack of any accompanying changes for the _original_
896+ * right sibling page. The original right sibling page's left link fails to
897+ * point to the new right sibling page (its left link still points to the
898+ * original page), even though the first phase of a page split is supposed to
899+ * work as a single atomic action. This subtle inconsistency will probably
900+ * only break backwards scans in practice.
901+ *
902+ * Note that this is the only place where amcheck will "couple" buffer locks
903+ * (and only for !readonly callers). In general we prefer to avoid more
904+ * thorough cross-page checks in !readonly mode, but it seems worth the
905+ * complexity here. Also, the performance overhead of performing lock
906+ * coupling here is negligible in practice. Control only reaches here with a
907+ * non-corrupt index when there is a concurrent page split at the instant
908+ * caller crossed over to target page from leftcurrent page.
909+ */
910+ static void
911+ bt_recheck_sibling_links (BtreeCheckState * state ,
912+ BlockNumber btpo_prev_from_target ,
913+ BlockNumber leftcurrent )
914+ {
915+ if (!state -> readonly )
916+ {
917+ Buffer lbuf ;
918+ Buffer newtargetbuf ;
919+ Page page ;
920+ BTPageOpaque opaque ;
921+ BlockNumber newtargetblock ;
922+
923+ /* Couple locks in the usual order for nbtree: Left to right */
924+ lbuf = ReadBufferExtended (state -> rel , MAIN_FORKNUM , leftcurrent ,
925+ RBM_NORMAL , state -> checkstrategy );
926+ LockBuffer (lbuf , BT_READ );
927+ _bt_checkpage (state -> rel , lbuf );
928+ page = BufferGetPage (lbuf );
929+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
930+ if (P_ISDELETED (opaque ))
931+ {
932+ /*
933+ * Cannot reason about concurrently deleted page -- the left link
934+ * in the page to the right is expected to point to some other
935+ * page to the left (not leftcurrent page).
936+ *
937+ * Note that we deliberately don't give up with a half-dead page.
938+ */
939+ UnlockReleaseBuffer (lbuf );
940+ return ;
941+ }
942+
943+ newtargetblock = opaque -> btpo_next ;
944+ /* Avoid self-deadlock when newtargetblock == leftcurrent */
945+ if (newtargetblock != leftcurrent )
946+ {
947+ newtargetbuf = ReadBufferExtended (state -> rel , MAIN_FORKNUM ,
948+ newtargetblock , RBM_NORMAL ,
949+ state -> checkstrategy );
950+ LockBuffer (newtargetbuf , BT_READ );
951+ _bt_checkpage (state -> rel , newtargetbuf );
952+ page = BufferGetPage (newtargetbuf );
953+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
954+ /* btpo_prev_from_target may have changed; update it */
955+ btpo_prev_from_target = opaque -> btpo_prev ;
956+ }
957+ else
958+ {
959+ /*
960+ * leftcurrent right sibling points back to leftcurrent block.
961+ * Index is corrupt. Easiest way to handle this is to pretend
962+ * that we actually read from a distinct page that has an invalid
963+ * block number in its btpo_prev.
964+ */
965+ newtargetbuf = InvalidBuffer ;
966+ btpo_prev_from_target = InvalidBlockNumber ;
967+ }
968+
969+ /*
970+ * No need to check P_ISDELETED here, since new target block cannot be
971+ * marked deleted as long as we hold a lock on lbuf
972+ */
973+ if (BufferIsValid (newtargetbuf ))
974+ UnlockReleaseBuffer (newtargetbuf );
975+ UnlockReleaseBuffer (lbuf );
976+
977+ if (btpo_prev_from_target == leftcurrent )
978+ {
979+ /* Report split in left sibling, not target (or new target) */
980+ ereport (DEBUG1 ,
981+ (errcode (ERRCODE_INTERNAL_ERROR ),
982+ errmsg ("harmless concurrent page split detected in index \"%s\"" ,
983+ RelationGetRelationName (state -> rel )),
984+ errdetail_internal ("Block=%u new right sibling=%u original right sibling=%u." ,
985+ leftcurrent , newtargetblock ,
986+ state -> targetblock )));
987+ return ;
988+ }
989+
990+ /*
991+ * Index is corrupt. Make sure that we report correct target page.
992+ *
993+ * This could have changed in cases where there was a concurrent page
994+ * split, as well as index corruption (at least in theory). Note that
995+ * btpo_prev_from_target was already updated above.
996+ */
997+ state -> targetblock = newtargetblock ;
998+ }
999+
1000+ ereport (ERROR ,
1001+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1002+ errmsg ("left link/right link pair in index \"%s\" not in agreement" ,
1003+ RelationGetRelationName (state -> rel )),
1004+ errdetail_internal ("Block=%u left block=%u left link from block=%u." ,
1005+ state -> targetblock , leftcurrent ,
1006+ btpo_prev_from_target )));
1007+ }
1008+
8801009/*
8811010 * Function performs the following checks on target page, or pages ancillary to
8821011 * target page:
@@ -1965,18 +2094,14 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
19652094 * downlink, which was concurrently physically removed in target/parent as
19662095 * part of deletion's first phase.)
19672096 *
1968- * Note that while the cross-page-same-level last item check uses a trick
1969- * that allows it to perform verification for !readonly callers, a similar
1970- * trick seems difficult here. The trick that that other check uses is,
1971- * in essence, to lock down race conditions to those that occur due to
1972- * concurrent page deletion of the target; that's a race that can be
1973- * reliably detected before actually reporting corruption.
1974- *
1975- * On the other hand, we'd need to lock down race conditions involving
1976- * deletion of child's left page, for long enough to read the child page
1977- * into memory (in other words, a scheme with concurrently held buffer
1978- * locks on both child and left-of-child pages). That's unacceptable for
1979- * amcheck functions on general principle, though.
2097+ * While we use various techniques elsewhere to perform cross-page
2098+ * verification for !readonly callers, a similar trick seems difficult
2099+ * here. The tricks used by bt_recheck_sibling_links and by
2100+ * bt_right_page_check_scankey both involve verification of a same-level,
2101+ * cross-sibling invariant. Cross-level invariants are far more squishy,
2102+ * though. The nbtree REDO routines do not actually couple buffer locks
2103+ * across levels during page splits, so making any cross-level check work
2104+ * reliably in !readonly mode may be impossible.
19802105 */
19812106 Assert (state -> readonly );
19822107
@@ -2785,6 +2910,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
27852910 * There is never an attempt to get a consistent view of multiple pages using
27862911 * multiple concurrent buffer locks; in general, we only acquire a single pin
27872912 * and buffer lock at a time, which is often all that the nbtree code requires.
2913+ * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only
2914+ * exception to this general rule.)
27882915 *
27892916 * Operating on a copy of the page is useful because it prevents control
27902917 * getting stuck in an uninterruptible state when an underlying operator class
0 commit comments