@@ -1315,20 +1315,87 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
13151315}
13161316
13171317/*
1318- * Returns true, if the given block has the half-dead flag set.
1318+ * Check that leftsib page (the btpo_prev of target page) is not marked with
1319+ * INCOMPLETE_SPLIT flag.
1320+ *
1321+ * Returning true indicates that page flag is set in leftsib (which is
1322+ * definitely still the left sibling of target). When that happens, the
1323+ * target doesn't have a downlink in parent, and the page deletion algorithm
1324+ * isn't prepared to handle that. Deletion of the target page (or the whole
1325+ * subtree that contains the target page) cannot take place.
1326+ */
1327+ static bool
1328+ _bt_leftsib_splitflag (Relation rel , BlockNumber leftsib , BlockNumber target )
1329+ {
1330+ Buffer buf ;
1331+ Page page ;
1332+ BTPageOpaque opaque ;
1333+ bool result ;
1334+
1335+ /* Easy case: no left sibling, so there is no split flag to check */
1336+ if (leftsib == P_NONE )
1337+ return false;
1338+
1339+ buf = _bt_getbuf (rel , leftsib , BT_READ );
1340+ page = BufferGetPage (buf );
1341+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1342+
1343+ /*
1344+ * Only report the flag when leftsib's next-pointer still points to
1345+ * target. If leftsib was concurrently split so that it no longer does,
1346+ * the split that created target must already have been completed, and
1347+ * caller can reasonably expect to relocate target's downlink using its
1348+ * stack. (We don't allow splitting an incompletely split page again
1349+ * until the previous split has been completed.)
1350+ */
1351+ result = (opaque -> btpo_next == target && P_INCOMPLETE_SPLIT (opaque ));
1352+ _bt_relbuf (rel , buf );
1353+
1354+ return result ;
1355+ }
1356+
1357+ /*
1358+ * Check that leafrightsib page (the btpo_next of target leaf page) is not
1359+ * marked with ISHALFDEAD flag.
1360+ *
1361+ * Returning true indicates that page flag is set in leafrightsib, so page
1362+ * deletion cannot go ahead. Our caller is not prepared to deal with the case
1363+ * where the parent page does not have a pivot tuple whose downlink points to
1364+ * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1365+ * seem worth going to the trouble of teaching our caller to deal with it.
1366+ * The situation will be resolved after VACUUM finishes the deletion of the
1367+ * half-dead page (when a future VACUUM operation reaches the target page
1368+ * again).
1369+ *
1370+ * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1371+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1372+ * okay because of the restriction on deleting pages that are the rightmost
1373+ * page of their parent (i.e. that such deletions can only take place when the
1374+ * entire subtree must be deleted). The leaf level check made here will apply
1375+ * to a right "cousin" leaf page rather than a simple right sibling leaf page
1376+ * in cases where caller actually goes on to attempt deleting pages that are
1377+ * above the leaf page. The right cousin leaf page is representative of the
1378+ * left edge of the subtree to the right of the to-be-deleted subtree as a
1379+ * whole, which is exactly the condition that our caller cares about.
1380+ * (Besides, internal pages are never marked half-dead, so it isn't even
1381+ * possible to _directly_ assess if an internal page is part of some other
1382+ * to-be-deleted subtree.)
13191383 */
13201384static bool
1321- _bt_is_page_halfdead (Relation rel , BlockNumber blk )
1385+ _bt_rightsib_halfdeadflag (Relation rel , BlockNumber leafrightsib )
13221386{
13231387 Buffer buf ;
13241388 Page page ;
13251389 BTPageOpaque opaque ;
13261390 bool result ;
13271391
1328- buf = _bt_getbuf (rel , blk , BT_READ );
1392+ Assert (leafrightsib != P_NONE );
1393+
1394+ buf = _bt_getbuf (rel , leafrightsib , BT_READ );
13291395 page = BufferGetPage (buf );
13301396 opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
13311397
1398+ Assert (P_ISLEAF (opaque ) && !P_ISDELETED (opaque ));
13321399 result = P_ISHALFDEAD (opaque );
13331400 _bt_relbuf (rel , buf );
13341401
@@ -1374,7 +1441,6 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
13741441 Buffer pbuf ;
13751442 Page page ;
13761443 BTPageOpaque opaque ;
1377- BlockNumber leftsib ;
13781444
13791445 /*
13801446 * Locate the downlink of "child" in the parent, updating the stack entry
@@ -1399,11 +1465,14 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
13991465 * If the target is the rightmost child of its parent, then we can't
14001466 * delete, unless it's also the only child.
14011467 */
1468+ Assert (poffset <= maxoff );
14021469 if (poffset >= maxoff )
14031470 {
14041471 /* It's rightmost child... */
14051472 if (poffset == P_FIRSTDATAKEY (opaque ))
14061473 {
1474+ BlockNumber leftsibparent ;
1475+
14071476 /*
14081477 * It's only child, so safe if parent would itself be removable.
14091478 * We have to check the parent itself, and then recurse to test
@@ -1418,41 +1487,16 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
14181487
14191488 * target = parent ;
14201489 * rightsib = opaque -> btpo_next ;
1421- leftsib = opaque -> btpo_prev ;
1490+ leftsibparent = opaque -> btpo_prev ;
14221491
14231492 _bt_relbuf (rel , pbuf );
14241493
14251494 /*
1426- * Like in _bt_pagedel, check that the left sibling is not marked
1427- * with INCOMPLETE_SPLIT flag. That would mean that there is no
1428- * downlink to the page to be deleted, and the page deletion
1429- * algorithm isn't prepared to handle that.
1495+ * Check that the left sibling of parent (if any) is not marked
1496+ * with INCOMPLETE_SPLIT flag before proceeding
14301497 */
1431- if (leftsib != P_NONE )
1432- {
1433- Buffer lbuf ;
1434- Page lpage ;
1435- BTPageOpaque lopaque ;
1436-
1437- lbuf = _bt_getbuf (rel , leftsib , BT_READ );
1438- lpage = BufferGetPage (lbuf );
1439- lopaque = (BTPageOpaque ) PageGetSpecialPointer (lpage );
1440-
1441- /*
1442- * If the left sibling was concurrently split, so that its
1443- * next-pointer doesn't point to the current page anymore, the
1444- * split that created the current page must be completed. (We
1445- * don't allow splitting an incompletely split page again
1446- * until the previous split has been completed)
1447- */
1448- if (lopaque -> btpo_next == parent &&
1449- P_INCOMPLETE_SPLIT (lopaque ))
1450- {
1451- _bt_relbuf (rel , lbuf );
1452- return false;
1453- }
1454- _bt_relbuf (rel , lbuf );
1455- }
1498+ if (_bt_leftsib_splitflag (rel , leftsibparent , parent ))
1499+ return false;
14561500
14571501 return _bt_lock_branch_parent (rel , parent , stack -> bts_parent ,
14581502 topparent , topoff , target , rightsib );
@@ -1525,7 +1569,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
15251569 *
15261570 * Also, when "stack" is not NULL, we have already checked that the
15271571 * current page is not the right half of an incomplete split, i.e. the
1528- * left sibling does not have its INCOMPLETE_SPLIT flag set.
1572+ * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1573+ * when the current target page is to the right of caller's initial page
1574+ * (the scanblkno page).
15291575 */
15301576 BTStack stack = NULL ;
15311577
@@ -1589,11 +1635,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
15891635 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
15901636 * left half of an incomplete split, but ensuring that it's not the
15911637 * right half is more complicated. For that, we have to check that
1592- * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
1593- * the first iteration, we temporarily release the lock on the current
1594- * page, and check the left sibling and also construct a search stack
1595- * to. On subsequent iterations, we know we stepped right from a page
1596- * that passed these tests, so it's OK.
1638+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set, which
1639+ * is done by _bt_leftsib_splitflag(). On the first iteration, we
1640+ * temporarily release the lock on scanblkno/leafbuf, check the left
1641+ * sibling, and construct a search stack to scanblkno. On subsequent
1642+ * iterations, we know we stepped right from a page that passed these
1643+ * tests, so it's OK.
15971644 */
15981645 if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) ||
15991646 P_FIRSTDATAKEY (opaque ) <= PageGetMaxOffsetNumber (page ) ||
@@ -1628,13 +1675,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
16281675 BTScanInsert itup_key ;
16291676 ItemId itemid ;
16301677 IndexTuple targetkey ;
1678+ BlockNumber leftsib , target ;
16311679 Buffer lbuf ;
1632- BlockNumber leftsib ;
16331680
16341681 itemid = PageGetItemId (page , P_HIKEY );
16351682 targetkey = CopyIndexTuple ((IndexTuple ) PageGetItem (page , itemid ));
16361683
16371684 leftsib = opaque -> btpo_prev ;
1685+ target = BufferGetBlockNumber (leafbuf );
16381686
16391687 /*
16401688 * To avoid deadlocks, we'd better drop the leaf page lock
@@ -1643,43 +1691,22 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
16431691 LockBuffer (leafbuf , BUFFER_LOCK_UNLOCK );
16441692
16451693 /*
1646- * Fetch the left sibling, to check that it's not marked with
1647- * INCOMPLETE_SPLIT flag. That would mean that the page
1648- * to-be-deleted doesn't have a downlink, and the page
1649- * deletion algorithm isn't prepared to handle that.
1694+ * Check that the left sibling of leafbuf (if any) is not
1695+ * marked with INCOMPLETE_SPLIT flag before proceeding
16501696 */
1651- if (leftsib != P_NONE )
1697+ Assert (target == scanblkno );
1698+ if (_bt_leftsib_splitflag (rel , leftsib , target ))
16521699 {
1653- BTPageOpaque lopaque ;
1654- Page lpage ;
1655-
1656- lbuf = _bt_getbuf (rel , leftsib , BT_READ );
1657- lpage = BufferGetPage (lbuf );
1658- lopaque = (BTPageOpaque ) PageGetSpecialPointer (lpage );
1659-
1660- /*
1661- * If the left sibling is split again by another backend,
1662- * after we released the lock, we know that the first
1663- * split must have finished, because we don't allow an
1664- * incompletely-split page to be split again. So we don't
1665- * need to walk right here.
1666- */
1667- if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf ) &&
1668- P_INCOMPLETE_SPLIT (lopaque ))
1669- {
1670- ReleaseBuffer (leafbuf );
1671- _bt_relbuf (rel , lbuf );
1672- return ndeleted ;
1673- }
1674- _bt_relbuf (rel , lbuf );
1700+ ReleaseBuffer (leafbuf );
1701+ return ndeleted ;
16751702 }
16761703
16771704 /* we need an insertion scan key for the search, so build one */
16781705 itup_key = _bt_mkscankey (rel , targetkey );
16791706 /* find the leftmost leaf page with matching pivot/high key */
16801707 itup_key -> pivotsearch = true;
16811708 stack = _bt_search (rel , itup_key , & lbuf , BT_READ , NULL );
1682- /* don 't need a lock or second pin on the page */
1709+ /* won't need a second lock or pin on leafbuf */
16831710 _bt_relbuf (rel , lbuf );
16841711
16851712 /*
@@ -1804,12 +1831,11 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
18041831 * Before attempting to lock the parent page, check that the right sibling
18051832 * is not in half-dead state. A half-dead right sibling would have no
18061833 * downlink in the parent, which would be highly confusing later when we
1807- * delete the downlink that follows the current page's downlink. (I
1808- * believe the deletion would work correctly, but it would fail the
1809- * cross-check we make that the following downlink points to the right
1810- * sibling of the delete page.)
1834+ * delete the downlink that follows the leafbuf page's downlink. It would
1835+ * fail the "right sibling of target page is also the next child in parent
1836+ * page" cross-check below.
18111837 */
1812- if (_bt_is_page_halfdead (rel , leafrightsib ))
1838+ if (_bt_rightsib_halfdeadflag (rel , leafrightsib ))
18131839 {
18141840 elog (DEBUG1 , "could not delete page %u because its right sibling %u is half-dead" ,
18151841 leafblkno , leafrightsib );
@@ -1822,16 +1848,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
18221848 * be deleted too, and the same condition applies recursively to it. We
18231849 * have to check this condition all the way up before trying to delete,
18241850 * and lock the final parent of the to-be-deleted subtree.
1825- *
1826- * However, we won't need to repeat the above _bt_is_page_halfdead() check
1827- * for parent/ancestor pages because of the rightmost restriction. The
1828- * leaf check will apply to a right "cousin" leaf page rather than a
1829- * simple right sibling leaf page in cases where we actually go on to
1830- * perform internal page deletion. The right cousin leaf page is
1831- * representative of the left edge of the subtree to the right of the
1832- * to-be-deleted subtree as a whole. (Besides, internal pages are never
1833- * marked half-dead, so it isn't even possible to directly assess if an
1834- * internal page is part of some other to-be-deleted subtree.)
18351851 */
18361852 rightsib = leafrightsib ;
18371853 target = leafblkno ;
0 commit comments