@@ -75,6 +75,8 @@ typedef struct BtreeCheckState
7575 bool readonly ;
7676 /* Also verifying heap has no unindexed tuples? */
7777 bool heapallindexed ;
78+ /* Also making sure non-pivot tuples can be found by new search? */
79+ bool rootdescend ;
7880 /* Per-page context */
7981 MemoryContext targetcontext ;
8082 /* Buffer access strategy */
@@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
124126PG_FUNCTION_INFO_V1 (bt_index_parent_check );
125127
126128static void bt_index_check_internal (Oid indrelid , bool parentcheck ,
127- bool heapallindexed );
129+ bool heapallindexed , bool rootdescend );
128130static inline void btree_index_checkable (Relation rel );
129131static void bt_check_every_level (Relation rel , Relation heaprel ,
130- bool heapkeyspace , bool readonly , bool heapallindexed );
132+ bool heapkeyspace , bool readonly , bool heapallindexed ,
133+ bool rootdescend );
131134static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
132135 BtreeLevel level );
133136static void bt_target_page_check (BtreeCheckState * state );
@@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
140143 bool tupleIsAlive , void * checkstate );
141144static IndexTuple bt_normalize_tuple (BtreeCheckState * state ,
142145 IndexTuple itup );
146+ static bool bt_rootdescend (BtreeCheckState * state , IndexTuple itup );
143147static inline bool offset_is_negative_infinity (BTPageOpaque opaque ,
144148 OffsetNumber offset );
145149static inline bool invariant_l_offset (BtreeCheckState * state , BTScanInsert key ,
@@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
177181 if (PG_NARGS () == 2 )
178182 heapallindexed = PG_GETARG_BOOL (1 );
179183
180- bt_index_check_internal (indrelid , false, heapallindexed );
184+ bt_index_check_internal (indrelid , false, heapallindexed , false );
181185
182186 PG_RETURN_VOID ();
183187}
@@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
196200{
197201 Oid indrelid = PG_GETARG_OID (0 );
198202 bool heapallindexed = false;
203+ bool rootdescend = false;
199204
200- if (PG_NARGS () == 2 )
205+ if (PG_NARGS () >= 2 )
201206 heapallindexed = PG_GETARG_BOOL (1 );
207+ if (PG_NARGS () == 3 )
208+ rootdescend = PG_GETARG_BOOL (2 );
202209
203- bt_index_check_internal (indrelid , true, heapallindexed );
210+ bt_index_check_internal (indrelid , true, heapallindexed , rootdescend );
204211
205212 PG_RETURN_VOID ();
206213}
@@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
209216 * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
210217 */
211218static void
212- bt_index_check_internal (Oid indrelid , bool parentcheck , bool heapallindexed )
219+ bt_index_check_internal (Oid indrelid , bool parentcheck , bool heapallindexed ,
220+ bool rootdescend )
213221{
214222 Oid heapid ;
215223 Relation indrel ;
@@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
267275 /* Check index, possibly against table it is an index on */
268276 heapkeyspace = _bt_heapkeyspace (indrel );
269277 bt_check_every_level (indrel , heaprel , heapkeyspace , parentcheck ,
270- heapallindexed );
278+ heapallindexed , rootdescend );
271279
272280 /*
273281 * Release locks early. That's ok here because nothing in the called
@@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
338346 */
339347static void
340348bt_check_every_level (Relation rel , Relation heaprel , bool heapkeyspace ,
341- bool readonly , bool heapallindexed )
349+ bool readonly , bool heapallindexed , bool rootdescend )
342350{
343351 BtreeCheckState * state ;
344352 Page metapage ;
@@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
362370 state -> heapkeyspace = heapkeyspace ;
363371 state -> readonly = readonly ;
364372 state -> heapallindexed = heapallindexed ;
373+ state -> rootdescend = rootdescend ;
365374
366375 if (state -> heapallindexed )
367376 {
@@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
430439 }
431440 }
432441
442+ Assert (!state -> rootdescend || state -> readonly );
443+ if (state -> rootdescend && !state -> heapkeyspace )
444+ ereport (ERROR ,
445+ (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
446+ errmsg ("cannot verify that tuples from index \"%s\" can each be found by an independent index search" ,
447+ RelationGetRelationName (rel )),
448+ errhint ("Only B-Tree version 4 indexes support rootdescend verification." )));
449+
433450 /* Create context for page */
434451 state -> targetcontext = AllocSetContextCreate (CurrentMemoryContext ,
435452 "amcheck context" ,
@@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
922939 if (offset_is_negative_infinity (topaque , offset ))
923940 continue ;
924941
942+ /*
943+ * Readonly callers may optionally verify that non-pivot tuples can
944+ * each be found by an independent search that starts from the root
945+ */
946+ if (state -> rootdescend && P_ISLEAF (topaque ) &&
947+ !bt_rootdescend (state , itup ))
948+ {
949+ char * itid ,
950+ * htid ;
951+
952+ itid = psprintf ("(%u,%u)" , state -> targetblock , offset );
953+ htid = psprintf ("(%u,%u)" ,
954+ ItemPointerGetBlockNumber (& (itup -> t_tid )),
955+ ItemPointerGetOffsetNumber (& (itup -> t_tid )));
956+
957+ ereport (ERROR ,
958+ (errcode (ERRCODE_INDEX_CORRUPTED ),
959+ errmsg ("could not find tuple using search from root page in index \"%s\"" ,
960+ RelationGetRelationName (state -> rel )),
961+ errdetail_internal ("Index tid=%s points to heap tid=%s page lsn=%X/%X." ,
962+ itid , htid ,
963+ (uint32 ) (state -> targetlsn >> 32 ),
964+ (uint32 ) state -> targetlsn )));
965+ }
966+
925967 /* Build insertion scankey for current page offset */
926968 skey = bt_mkscankey_pivotsearch (state -> rel , itup );
927969
@@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
15261568 * internal pages. In more general terms, a negative infinity item is
15271569 * only negative infinity with respect to the subtree that the page is
15281570 * at the root of.
1571+ *
1572+ * See also: bt_rootdescend(), which can even detect transitive
1573+ * inconsistencies on cousin leaf pages.
15291574 */
15301575 if (offset_is_negative_infinity (copaque , offset ))
15311576 continue ;
@@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
19261971 return reformed ;
19271972}
19281973
1974+ /*
1975+ * Search for itup in index, starting from fast root page. itup must be a
1976+ * non-pivot tuple. This is only supported with heapkeyspace indexes, since
1977+ * we rely on having fully unique keys to find a match with only a single
1978+ * visit to a leaf page, barring an interrupted page split, where we may have
1979+ * to move right. (A concurrent page split is impossible because caller must
1980+ * be readonly caller.)
1981+ *
1982+ * This routine can detect very subtle transitive consistency issues across
1983+ * more than one level of the tree. Leaf pages all have a high key (even the
1984+ * rightmost page has a conceptual positive infinity high key), but not a low
1985+ * key. Their downlink in parent is a lower bound, which along with the high
1986+ * key is almost enough to detect every possible inconsistency. A downlink
1987+ * separator key value won't always be available from parent, though, because
1988+ * the first items of internal pages are negative infinity items, truncated
1989+ * down to zero attributes during internal page splits. While it's true that
1990+ * bt_downlink_check() and the high key check can detect most imaginable key
1991+ * space problems, there are remaining problems it won't detect with non-pivot
1992+ * tuples in cousin leaf pages. Starting a search from the root for every
1993+ * existing leaf tuple detects small inconsistencies in upper levels of the
1994+ * tree that cannot be detected any other way. (Besides all this, this is
1995+ * probably also useful as a direct test of the code used by index scans
1996+ * themselves.)
1997+ */
1998+ static bool
1999+ bt_rootdescend (BtreeCheckState * state , IndexTuple itup )
2000+ {
2001+ BTScanInsert key ;
2002+ BTStack stack ;
2003+ Buffer lbuf ;
2004+ bool exists ;
2005+
2006+ key = _bt_mkscankey (state -> rel , itup );
2007+ Assert (key -> heapkeyspace && key -> scantid != NULL );
2008+
2009+ /*
2010+ * Search from root.
2011+ *
2012+ * Ideally, we would arrange to only move right within _bt_search() when
2013+ * an interrupted page split is detected (i.e. when the incomplete split
2014+ * bit is found to be set), but for now we accept the possibility that
2015+ * that could conceal an inconsistency.
2016+ */
2017+ Assert (state -> readonly && state -> rootdescend );
2018+ exists = false;
2019+ stack = _bt_search (state -> rel , key , & lbuf , BT_READ , NULL );
2020+
2021+ if (BufferIsValid (lbuf ))
2022+ {
2023+ BTInsertStateData insertstate ;
2024+ OffsetNumber offnum ;
2025+ Page page ;
2026+
2027+ insertstate .itup = itup ;
2028+ insertstate .itemsz = MAXALIGN (IndexTupleSize (itup ));
2029+ insertstate .itup_key = key ;
2030+ insertstate .bounds_valid = false;
2031+ insertstate .buf = lbuf ;
2032+
2033+ /* Get matching tuple on leaf page */
2034+ offnum = _bt_binsrch_insert (state -> rel , & insertstate );
2035+ /* Compare first >= matching item on leaf page, if any */
2036+ page = BufferGetPage (lbuf );
2037+ if (offnum <= PageGetMaxOffsetNumber (page ) &&
2038+ _bt_compare (state -> rel , key , page , offnum ) == 0 )
2039+ exists = true;
2040+ _bt_relbuf (state -> rel , lbuf );
2041+ }
2042+
2043+ _bt_freestack (stack );
2044+ pfree (key );
2045+
2046+ return exists ;
2047+ }
2048+
19292049/*
19302050 * Is particular offset within page (whose special state is passed by caller)
19312051 * the page negative-infinity item?