3535#include "utils/snapmgr.h"
3636
3737static BTMetaPageData * _bt_getmeta (Relation rel , Buffer metabuf );
38- static bool _bt_mark_page_halfdead (Relation rel , Buffer buf , BTStack stack );
38+ static bool _bt_mark_page_halfdead (Relation rel , Buffer leafbuf ,
39+ BTStack stack );
3940static bool _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf ,
40- bool * rightsib_empty );
41+ bool * rightsib_empty ,
42+ TransactionId * oldestBtpoXact );
4143static TransactionId _bt_xid_horizon (Relation rel , Relation heapRel , Page page ,
4244 OffsetNumber * deletable , int ndeletable );
4345static bool _bt_lock_branch_parent (Relation rel , BlockNumber child ,
@@ -1470,27 +1472,35 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
14701472}
14711473
14721474/*
1473- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
1475+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
14741476 *
1475- * This action unlinks the page from the b-tree structure, removing all
1477+ * This action unlinks the leaf page from the b-tree structure, removing all
14761478 * pointers leading to it --- but not touching its own left and right links.
14771479 * The page cannot be physically reclaimed right away, since other processes
14781480 * may currently be trying to follow links leading to the page; they have to
14791481 * be allowed to use its right-link to recover. See nbtree/README.
14801482 *
14811483 * On entry, the target buffer must be pinned and locked (either read or write
1482- * lock is OK). This lock and pin will be dropped before exiting.
1484+ * lock is OK). The page must be an empty leaf page, which may be half-dead
1485+ * already (a half-dead page should only be passed to us when an earlier
1486+ * VACUUM operation was interrupted, though). Note in particular that caller
1487+ * should never pass a buffer containing an existing deleted page here. The
1488+ * lock and pin on caller's buffer will be dropped before we return.
14831489 *
14841490 * Returns the number of pages successfully deleted (zero if page cannot
1485- * be deleted now; could be more than one if parent or sibling pages were
1486- * deleted too).
1491+ * be deleted now; could be more than one if parent or right sibling pages
1492+ * were deleted too).
1493+ *
1494+ * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1495+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
1496+ * deleted by a previous VACUUM.
14871497 *
14881498 * NOTE: this leaks memory. Rather than trying to clean up everything
14891499 * carefully, it's better to run it in a temp context that can be reset
14901500 * frequently.
14911501 */
14921502int
1493- _bt_pagedel (Relation rel , Buffer buf )
1503+ _bt_pagedel (Relation rel , Buffer leafbuf , TransactionId * oldestBtpoXact )
14941504{
14951505 int ndeleted = 0 ;
14961506 BlockNumber rightsib ;
@@ -1511,14 +1521,21 @@ _bt_pagedel(Relation rel, Buffer buf)
15111521
15121522 for (;;)
15131523 {
1514- page = BufferGetPage (buf );
1524+ page = BufferGetPage (leafbuf );
15151525 opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
15161526
15171527 /*
15181528 * Internal pages are never deleted directly, only as part of deleting
15191529 * the whole branch all the way down to leaf level.
1530+ *
1531+ * Also check for deleted pages here. Caller never passes us a fully
1532+ * deleted page. Only VACUUM can delete pages, so there can't have
1533+ * been a concurrent deletion. Assume that we reached any deleted
1534+ * page encountered here by following a sibling link, and that the
1535+ * index is corrupt.
15201536 */
1521- if (!P_ISLEAF (opaque ))
1537+ Assert (!P_ISDELETED (opaque ));
1538+ if (!P_ISLEAF (opaque ) || P_ISDELETED (opaque ))
15221539 {
15231540 /*
15241541 * Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1537,13 +1554,22 @@ _bt_pagedel(Relation rel, Buffer buf)
15371554 errmsg ("index \"%s\" contains a half-dead internal page" ,
15381555 RelationGetRelationName (rel )),
15391556 errhint ("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it." )));
1540- _bt_relbuf (rel , buf );
1557+
1558+ if (P_ISDELETED (opaque ))
1559+ ereport (LOG ,
1560+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1561+ errmsg_internal ("found deleted block %u while following right link in index \"%s\"" ,
1562+ BufferGetBlockNumber (leafbuf ),
1563+ RelationGetRelationName (rel ))));
1564+
1565+ _bt_relbuf (rel , leafbuf );
15411566 return ndeleted ;
15421567 }
15431568
15441569 /*
15451570 * We can never delete rightmost pages nor root pages. While at it,
1546- * check that page is not already deleted and is empty.
1571+ * check that page is empty, since it's possible that the leafbuf page
1572+ * was empty a moment ago, but has since had some inserts.
15471573 *
15481574 * To keep the algorithm simple, we also never delete an incompletely
15491575 * split page (they should be rare enough that this doesn't make any
@@ -1558,14 +1584,14 @@ _bt_pagedel(Relation rel, Buffer buf)
15581584 * to. On subsequent iterations, we know we stepped right from a page
15591585 * that passed these tests, so it's OK.
15601586 */
1561- if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) || P_ISDELETED ( opaque ) ||
1587+ if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) ||
15621588 P_FIRSTDATAKEY (opaque ) <= PageGetMaxOffsetNumber (page ) ||
15631589 P_INCOMPLETE_SPLIT (opaque ))
15641590 {
15651591 /* Should never fail to delete a half-dead page */
15661592 Assert (!P_ISHALFDEAD (opaque ));
15671593
1568- _bt_relbuf (rel , buf );
1594+ _bt_relbuf (rel , leafbuf );
15691595 return ndeleted ;
15701596 }
15711597
@@ -1603,7 +1629,7 @@ _bt_pagedel(Relation rel, Buffer buf)
16031629 * To avoid deadlocks, we'd better drop the leaf page lock
16041630 * before going further.
16051631 */
1606- LockBuffer (buf , BUFFER_LOCK_UNLOCK );
1632+ LockBuffer (leafbuf , BUFFER_LOCK_UNLOCK );
16071633
16081634 /*
16091635 * Fetch the left sibling, to check that it's not marked with
@@ -1627,10 +1653,10 @@ _bt_pagedel(Relation rel, Buffer buf)
16271653 * incompletely-split page to be split again. So we don't
16281654 * need to walk right here.
16291655 */
1630- if (lopaque -> btpo_next == BufferGetBlockNumber (buf ) &&
1656+ if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf ) &&
16311657 P_INCOMPLETE_SPLIT (lopaque ))
16321658 {
1633- ReleaseBuffer (buf );
1659+ ReleaseBuffer (leafbuf );
16341660 _bt_relbuf (rel , lbuf );
16351661 return ndeleted ;
16361662 }
@@ -1646,40 +1672,59 @@ _bt_pagedel(Relation rel, Buffer buf)
16461672 _bt_relbuf (rel , lbuf );
16471673
16481674 /*
1649- * Re-lock the leaf page, and start over, to re-check that the
1650- * page can still be deleted.
1675+ * Re-lock the leaf page, and start over to use our stack
1676+ * within _bt_mark_page_halfdead. We must do it that way
1677+ * because it's possible that leafbuf can no longer be
1678+ * deleted. We need to recheck.
16511679 */
1652- LockBuffer (buf , BT_WRITE );
1680+ LockBuffer (leafbuf , BT_WRITE );
16531681 continue ;
16541682 }
16551683
1656- if (!_bt_mark_page_halfdead (rel , buf , stack ))
1684+ /*
1685+ * See if it's safe to delete the leaf page, and determine how
1686+ * many parent/internal pages above the leaf level will be
1687+ * deleted. If it's safe then _bt_mark_page_halfdead will also
1688+ * perform the first phase of deletion, which includes marking the
1689+ * leafbuf page half-dead.
1690+ */
1691+ Assert (P_ISLEAF (opaque ) && !P_IGNORE (opaque ));
1692+ if (!_bt_mark_page_halfdead (rel , leafbuf , stack ))
16571693 {
1658- _bt_relbuf (rel , buf );
1694+ _bt_relbuf (rel , leafbuf );
16591695 return ndeleted ;
16601696 }
16611697 }
16621698
16631699 /*
16641700 * Then unlink it from its siblings. Each call to
16651701 * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1666- * making it shallower. Iterate until the leaf page is gone.
1702+ * making it shallower. Iterate until the leafbuf page is deleted.
1703+ *
1704+ * _bt_unlink_halfdead_page should never fail, since we established
1705+ * that deletion is generally safe in _bt_mark_page_halfdead.
16671706 */
16681707 rightsib_empty = false;
1708+ Assert (P_ISLEAF (opaque ) && P_ISHALFDEAD (opaque ));
16691709 while (P_ISHALFDEAD (opaque ))
16701710 {
1671- /* will check for interrupts, once lock is released */
1672- if (!_bt_unlink_halfdead_page (rel , buf , & rightsib_empty ))
1711+ /* Check for interrupts in _bt_unlink_halfdead_page */
1712+ if (!_bt_unlink_halfdead_page (rel , leafbuf , & rightsib_empty ,
1713+ oldestBtpoXact ))
16731714 {
1674- /* _bt_unlink_halfdead_page already released buffer */
1715+ /* _bt_unlink_halfdead_page failed, released buffer */
16751716 return ndeleted ;
16761717 }
16771718 ndeleted ++ ;
16781719 }
16791720
1721+ Assert (P_ISLEAF (opaque ) && P_ISDELETED (opaque ));
1722+ Assert (TransactionIdFollowsOrEquals (opaque -> btpo .xact ,
1723+ * oldestBtpoXact ));
1724+
16801725 rightsib = opaque -> btpo_next ;
16811726
1682- _bt_relbuf (rel , buf );
1727+ _bt_relbuf (rel , leafbuf );
16831728
16841729 /*
16851730 * Check here, as calling loops will have locks held, preventing
@@ -1705,7 +1750,7 @@ _bt_pagedel(Relation rel, Buffer buf)
17051750 if (!rightsib_empty )
17061751 break ;
17071752
1708- buf = _bt_getbuf (rel , rightsib , BT_WRITE );
1753+ leafbuf = _bt_getbuf (rel , rightsib , BT_WRITE );
17091754 }
17101755
17111756 return ndeleted ;
@@ -1909,17 +1954,28 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
19091954 * of the whole branch, including the leaf page itself, iterate until the
19101955 * leaf page is deleted.
19111956 *
1912- * Returns 'false' if the page could not be unlinked (shouldn't happen).
1913- * If the (current) right sibling of the page is empty, *rightsib_empty is
1914- * set to true.
1957+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
1958+ * the right sibling of the current target page is empty, *rightsib_empty is
1959+ * set to true, allowing caller to delete the target's right sibling page in
1960+ * passing. Note that *rightsib_empty is only actually used by caller when
1961+ * target page is leafbuf, following last call here for leafbuf/the subtree
1962+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
1963+ * consistent.)
1964+ *
1965+ * We maintain *oldestBtpoXact for pages that are deleted by the current
1966+ * VACUUM operation here. This must be handled here because we conservatively
1967+ * assume that there needs to be a new call to ReadNewTransactionId() each
1968+ * time a page gets deleted. See comments about the underlying assumption
1969+ * below.
19151970 *
19161971 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
19171972 * On success exit, we'll be holding pin and write lock. On failure exit,
19181973 * we'll release both pin and lock before returning (we define it that way
19191974 * to avoid having to reacquire a lock we already released).
19201975 */
19211976static bool
1922- _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty )
1977+ _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty ,
1978+ TransactionId * oldestBtpoXact )
19231979{
19241980 BlockNumber leafblkno = BufferGetBlockNumber (leafbuf );
19251981 BlockNumber leafleftsib ;
@@ -2057,9 +2113,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
20572113 lbuf = InvalidBuffer ;
20582114
20592115 /*
2060- * Next write-lock the target page itself. It should be okay to take just
2061- * a write lock not a superexclusive lock, since no scans would stop on an
2062- * empty page.
2116+ * Next write-lock the target page itself. It's okay to take a write lock
2117+ * rather than a superexclusive lock, since no scan will stop on an empty
2118+ * page.
20632119 */
20642120 LockBuffer (buf , BT_WRITE );
20652121 page = BufferGetPage (buf );
@@ -2204,6 +2260,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
22042260 */
22052261 page = BufferGetPage (buf );
22062262 opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
2263+ Assert (P_ISHALFDEAD (opaque ) || !P_ISLEAF (opaque ));
22072264 opaque -> btpo_flags &= ~BTP_HALF_DEAD ;
22082265 opaque -> btpo_flags |= BTP_DELETED ;
22092266 opaque -> btpo .xact = ReadNewTransactionId ();
@@ -2309,6 +2366,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
23092366 _bt_relbuf (rel , lbuf );
23102367 _bt_relbuf (rel , rbuf );
23112368
2369+ if (!TransactionIdIsValid (* oldestBtpoXact ) ||
2370+ TransactionIdPrecedes (opaque -> btpo .xact , * oldestBtpoXact ))
2371+ * oldestBtpoXact = opaque -> btpo .xact ;
2372+
23122373 /*
23132374 * Release the target, if it was not the leaf block. The leaf is always
23142375 * kept locked.
0 commit comments