@@ -49,8 +49,8 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
4949 OffsetNumber newitemoff ,
5050 bool split_only_page );
5151static Buffer _bt_split (Relation rel , BTScanInsert itup_key , Buffer buf ,
52- Buffer cbuf , OffsetNumber firstright , OffsetNumber newitemoff ,
53- Size newitemsz , IndexTuple newitem , bool newitemonleft );
52+ Buffer cbuf , OffsetNumber newitemoff , Size newitemsz ,
53+ IndexTuple newitem );
5454static void _bt_insert_parent (Relation rel , Buffer buf , Buffer rbuf ,
5555 BTStack stack , bool is_root , bool is_only );
5656static bool _bt_pgaddtup (Page page , Size itemsize , IndexTuple itup ,
@@ -943,7 +943,6 @@ _bt_insertonpg(Relation rel,
943943{
944944 Page page ;
945945 BTPageOpaque lpageop ;
946- OffsetNumber firstright = InvalidOffsetNumber ;
947946 Size itemsz ;
948947
949948 page = BufferGetPage (buf );
@@ -979,7 +978,6 @@ _bt_insertonpg(Relation rel,
979978 {
980979 bool is_root = P_ISROOT (lpageop );
981980 bool is_only = P_LEFTMOST (lpageop ) && P_RIGHTMOST (lpageop );
982- bool newitemonleft ;
983981 Buffer rbuf ;
984982
985983 /*
@@ -1000,14 +998,8 @@ _bt_insertonpg(Relation rel,
1000998 Assert (!(P_ISLEAF (lpageop ) &&
1001999 BlockNumberIsValid (RelationGetTargetBlock (rel ))));
10021000
1003- /* Choose the split point */
1004- firstright = _bt_findsplitloc (rel , page ,
1005- newitemoff , itemsz , itup ,
1006- & newitemonleft );
1007-
10081001 /* split the buffer into left and right halves */
1009- rbuf = _bt_split (rel , itup_key , buf , cbuf , firstright , newitemoff ,
1010- itemsz , itup , newitemonleft );
1002+ rbuf = _bt_split (rel , itup_key , buf , cbuf , newitemoff , itemsz , itup );
10111003 PredicateLockPageSplit (rel ,
10121004 BufferGetBlockNumber (buf ),
10131005 BufferGetBlockNumber (rbuf ));
@@ -1211,9 +1203,8 @@ _bt_insertonpg(Relation rel,
12111203 * _bt_split() -- split a page in the btree.
12121204 *
12131205 * On entry, buf is the page to split, and is pinned and write-locked.
1214- * firstright is the item index of the first item to be moved to the
1215- * new right page. newitemoff etc. tell us about the new item that
1216- * must be inserted along with the data from the old page.
1206+ * newitemoff etc. tell us about the new item that must be inserted
1207+ * along with the data from the original page.
12171208 *
12181209 * itup_key is used for suffix truncation on leaf pages (internal
12191210 * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
@@ -1226,8 +1217,7 @@ _bt_insertonpg(Relation rel,
12261217 */
12271218static Buffer
12281219_bt_split (Relation rel , BTScanInsert itup_key , Buffer buf , Buffer cbuf ,
1229- OffsetNumber firstright , OffsetNumber newitemoff , Size newitemsz ,
1230- IndexTuple newitem , bool newitemonleft )
1220+ OffsetNumber newitemoff , Size newitemsz , IndexTuple newitem )
12311221{
12321222 Buffer rbuf ;
12331223 Page origpage ;
@@ -1246,99 +1236,80 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
12461236 IndexTuple item ;
12471237 OffsetNumber leftoff ,
12481238 rightoff ;
1239+ OffsetNumber firstright ;
12491240 OffsetNumber maxoff ;
12501241 OffsetNumber i ;
1251- bool isleaf ;
1242+ bool newitemonleft ,
1243+ isleaf ;
12521244 IndexTuple lefthikey ;
12531245 int indnatts = IndexRelationGetNumberOfAttributes (rel );
12541246 int indnkeyatts = IndexRelationGetNumberOfKeyAttributes (rel );
12551247
1256- /* Acquire a new page to split into */
1257- rbuf = _bt_getbuf (rel , P_NEW , BT_WRITE );
1258-
12591248 /*
12601249 * origpage is the original page to be split. leftpage is a temporary
12611250 * buffer that receives the left-sibling data, which will be copied back
1262- * into origpage on success. rightpage is the new page that receives the
1263- * right-sibling data. If we fail before reaching the critical section,
1264- * origpage hasn't been modified and leftpage is only workspace. In
1265- * principle we shouldn't need to worry about rightpage either, because it
1266- * hasn't been linked into the btree page structure; but to avoid leaving
1267- * possibly-confusing junk behind, we are careful to rewrite rightpage as
1268- * zeroes before throwing any error.
1251+ * into origpage on success. rightpage is the new page that will receive
1252+ * the right-sibling data.
1253+ *
1254+ * leftpage is allocated after choosing a split point. rightpage's new
1255+ * buffer isn't acquired until after leftpage is initialized and has a new
1256+ * high key, the last point where splitting the page may fail (barring
1257+ * corruption). Failing before acquiring the new buffer won't have lasting
1258+ * consequences, since origpage won't have been modified and leftpage is
1259+ * only workspace.
12691260 */
12701261 origpage = BufferGetPage (buf );
1271- leftpage = PageGetTempPage (origpage );
1272- rightpage = BufferGetPage (rbuf );
1273-
1262+ oopaque = (BTPageOpaque ) PageGetSpecialPointer (origpage );
12741263 origpagenumber = BufferGetBlockNumber (buf );
1275- rightpagenumber = BufferGetBlockNumber (rbuf );
1276-
1277- _bt_pageinit (leftpage , BufferGetPageSize (buf ));
1278- /* rightpage was already initialized by _bt_getbuf */
12791264
12801265 /*
1281- * Copy the original page's LSN into leftpage, which will become the
1282- * updated version of the page. We need this because XLogInsert will
1283- * examine the LSN and possibly dump it in a page image.
1266+ * Choose a point to split origpage at.
1267+ *
1268+ * A split point can be thought of as a point _between_ two existing
1269+ * tuples on origpage (lastleft and firstright tuples), provided you
1270+ * pretend that the new item that didn't fit is already on origpage.
1271+ *
1272+ * Since origpage does not actually contain newitem, the representation of
1273+ * split points needs to work with two boundary cases: splits where
1274+ * newitem is lastleft, and splits where newitem is firstright.
1275+ * newitemonleft resolves the ambiguity that would otherwise exist when
1276+ * newitemoff == firstright. In all other cases it's clear which side of
1277+ * the split every tuple goes on from context. newitemonleft is usually
1278+ * (but not always) redundant information.
12841279 */
1285- PageSetLSN (leftpage , PageGetLSN (origpage ));
1280+ firstright = _bt_findsplitloc (rel , origpage , newitemoff , newitemsz ,
1281+ newitem , & newitemonleft );
12861282
1287- /* init btree private data */
1288- oopaque = (BTPageOpaque ) PageGetSpecialPointer (origpage );
1283+ /* Allocate temp buffer for leftpage */
1284+ leftpage = PageGetTempPage (origpage );
1285+ _bt_pageinit (leftpage , BufferGetPageSize (buf ));
12891286 lopaque = (BTPageOpaque ) PageGetSpecialPointer (leftpage );
1290- ropaque = (BTPageOpaque ) PageGetSpecialPointer (rightpage );
12911287
1292- isleaf = P_ISLEAF ( oopaque );
1293-
1294- /* if we're splitting this page, it won't be the root when we're done */
1295- /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1288+ /*
1289+ * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1290+ * and HAS_GARBAGE flags.
1291+ */
12961292 lopaque -> btpo_flags = oopaque -> btpo_flags ;
12971293 lopaque -> btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE );
1298- ropaque -> btpo_flags = lopaque -> btpo_flags ;
1299- /* set flag in left page indicating that the right page has no downlink */
1294+ /* set flag in leftpage indicating that rightpage has no downlink yet */
13001295 lopaque -> btpo_flags |= BTP_INCOMPLETE_SPLIT ;
13011296 lopaque -> btpo_prev = oopaque -> btpo_prev ;
1302- lopaque -> btpo_next = rightpagenumber ;
1303- ropaque -> btpo_prev = origpagenumber ;
1304- ropaque -> btpo_next = oopaque -> btpo_next ;
1305- lopaque -> btpo .level = ropaque -> btpo .level = oopaque -> btpo .level ;
1306- /* Since we already have write-lock on both pages, ok to read cycleid */
1307- lopaque -> btpo_cycleid = _bt_vacuum_cycleid (rel );
1308- ropaque -> btpo_cycleid = lopaque -> btpo_cycleid ;
1297+ /* handle btpo_next after rightpage buffer acquired */
1298+ lopaque -> btpo .level = oopaque -> btpo .level ;
1299+ /* handle btpo_cycleid after rightpage buffer acquired */
13091300
13101301 /*
1311- * If the page we're splitting is not the rightmost page at its level in
1312- * the tree, then the first entry on the page is the high key for the
1313- * page. We need to copy that to the right half. Otherwise (meaning the
1314- * rightmost page case), all the items on the right half will be user
1315- * data.
1302+ * Copy the original page's LSN into leftpage, which will become the
1303+ * updated version of the page. We need this because XLogInsert will
1304+ * examine the LSN and possibly dump it in a page image.
13161305 */
1317- rightoff = P_HIKEY ;
1318-
1319- if (!P_RIGHTMOST (oopaque ))
1320- {
1321- itemid = PageGetItemId (origpage , P_HIKEY );
1322- itemsz = ItemIdGetLength (itemid );
1323- item = (IndexTuple ) PageGetItem (origpage , itemid );
1324- Assert (BTreeTupleGetNAtts (item , rel ) > 0 );
1325- Assert (BTreeTupleGetNAtts (item , rel ) <= indnkeyatts );
1326- if (PageAddItem (rightpage , (Item ) item , itemsz , rightoff ,
1327- false, false) == InvalidOffsetNumber )
1328- {
1329- memset (rightpage , 0 , BufferGetPageSize (rbuf ));
1330- elog (ERROR , "failed to add hikey to the right sibling"
1331- " while splitting block %u of index \"%s\"" ,
1332- origpagenumber , RelationGetRelationName (rel ));
1333- }
1334- rightoff = OffsetNumberNext (rightoff );
1335- }
1306+ PageSetLSN (leftpage , PageGetLSN (origpage ));
1307+ isleaf = P_ISLEAF (oopaque );
13361308
13371309 /*
13381310 * The "high key" for the new left page will be the first key that's going
1339- * to go into the new right page, or possibly a truncated version if this
1340- * is a leaf page split. This might be either the existing data item at
1341- * position firstright, or the incoming tuple.
1311+ * to go into the new right page, or a truncated version if this is a leaf
1312+ * page split.
13421313 *
13431314 * The high key for the left page is formed using the first item on the
13441315 * right page, which may seem to be contrary to Lehman & Yao's approach of
@@ -1360,7 +1331,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
13601331 * tuple could be physically larger despite being opclass-equal in respect
13611332 * of all attributes prior to the heap TID attribute.)
13621333 */
1363- leftoff = P_HIKEY ;
13641334 if (!newitemonleft && newitemoff == firstright )
13651335 {
13661336 /* incoming tuple will become first on right page */
@@ -1416,23 +1386,91 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14161386 else
14171387 lefthikey = item ;
14181388
1389+ /*
1390+ * Add new high key to leftpage
1391+ */
1392+ leftoff = P_HIKEY ;
1393+
14191394 Assert (BTreeTupleGetNAtts (lefthikey , rel ) > 0 );
14201395 Assert (BTreeTupleGetNAtts (lefthikey , rel ) <= indnkeyatts );
14211396 if (PageAddItem (leftpage , (Item ) lefthikey , itemsz , leftoff ,
14221397 false, false) == InvalidOffsetNumber )
1423- {
1424- memset (rightpage , 0 , BufferGetPageSize (rbuf ));
14251398 elog (ERROR , "failed to add hikey to the left sibling"
14261399 " while splitting block %u of index \"%s\"" ,
14271400 origpagenumber , RelationGetRelationName (rel ));
1428- }
14291401 leftoff = OffsetNumberNext (leftoff );
14301402 /* be tidy */
14311403 if (lefthikey != item )
14321404 pfree (lefthikey );
14331405
14341406 /*
1435- * Now transfer all the data items to the appropriate page.
1407+ * Acquire a new right page to split into, now that left page has a new
1408+ * high key. From here on, it's not okay to throw an error without
1409+ * zeroing rightpage first. This coding rule ensures that we won't
1410+ * confuse future VACUUM operations, which might otherwise try to re-find
1411+ * a downlink to a leftover junk page as the page undergoes deletion.
1412+ *
1413+ * It would be reasonable to start the critical section just after the new
1414+ * rightpage buffer is acquired instead; that would allow us to avoid
1415+ * leftover junk pages without bothering to zero rightpage. We do it this
1416+ * way because it avoids an unnecessary PANIC when either origpage or its
1417+ * existing sibling page is corrupt.
1418+ */
1419+ rbuf = _bt_getbuf (rel , P_NEW , BT_WRITE );
1420+ rightpage = BufferGetPage (rbuf );
1421+ rightpagenumber = BufferGetBlockNumber (rbuf );
1422+ /* rightpage was initialized by _bt_getbuf */
1423+ ropaque = (BTPageOpaque ) PageGetSpecialPointer (rightpage );
1424+
1425+ /*
1426+ * Finish off remaining leftpage special area fields. They cannot be set
1427+ * before both origpage (leftpage) and rightpage buffers are acquired and
1428+ * locked.
1429+ */
1430+ lopaque -> btpo_next = rightpagenumber ;
1431+ lopaque -> btpo_cycleid = _bt_vacuum_cycleid (rel );
1432+
1433+ /*
1434+ * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1435+ * and HAS_GARBAGE flags.
1436+ */
1437+ ropaque -> btpo_flags = oopaque -> btpo_flags ;
1438+ ropaque -> btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE );
1439+ ropaque -> btpo_prev = origpagenumber ;
1440+ ropaque -> btpo_next = oopaque -> btpo_next ;
1441+ ropaque -> btpo .level = oopaque -> btpo .level ;
1442+ ropaque -> btpo_cycleid = lopaque -> btpo_cycleid ;
1443+
1444+ /*
1445+ * Add new high key to rightpage where necessary.
1446+ *
1447+ * If the page we're splitting is not the rightmost page at its level in
1448+ * the tree, then the first entry on rightpage is the high key copied from
1449+ * origpage.
1450+ */
1451+ rightoff = P_HIKEY ;
1452+
1453+ if (!P_RIGHTMOST (oopaque ))
1454+ {
1455+ itemid = PageGetItemId (origpage , P_HIKEY );
1456+ itemsz = ItemIdGetLength (itemid );
1457+ item = (IndexTuple ) PageGetItem (origpage , itemid );
1458+ Assert (BTreeTupleGetNAtts (item , rel ) > 0 );
1459+ Assert (BTreeTupleGetNAtts (item , rel ) <= indnkeyatts );
1460+ if (PageAddItem (rightpage , (Item ) item , itemsz , rightoff ,
1461+ false, false) == InvalidOffsetNumber )
1462+ {
1463+ memset (rightpage , 0 , BufferGetPageSize (rbuf ));
1464+ elog (ERROR , "failed to add hikey to the right sibling"
1465+ " while splitting block %u of index \"%s\"" ,
1466+ origpagenumber , RelationGetRelationName (rel ));
1467+ }
1468+ rightoff = OffsetNumberNext (rightoff );
1469+ }
1470+
1471+ /*
1472+ * Now transfer all the data items (non-pivot tuples in isleaf case, or
1473+ * additional pivot tuples in !isleaf case) to the appropriate page.
14361474 *
14371475 * Note: we *must* insert at least the right page's items in item-number
14381476 * order, for the benefit of _bt_restore_page().
@@ -1450,6 +1488,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14501488 {
14511489 if (newitemonleft )
14521490 {
1491+ Assert (newitemoff <= firstright );
14531492 if (!_bt_pgaddtup (leftpage , newitemsz , newitem , leftoff ))
14541493 {
14551494 memset (rightpage , 0 , BufferGetPageSize (rbuf ));
@@ -1461,6 +1500,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
14611500 }
14621501 else
14631502 {
1503+ Assert (newitemoff >= firstright );
14641504 if (!_bt_pgaddtup (rightpage , newitemsz , newitem , rightoff ))
14651505 {
14661506 memset (rightpage , 0 , BufferGetPageSize (rbuf ));
@@ -1523,7 +1563,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
15231563 * all readers release locks on a page before trying to fetch its
15241564 * neighbors.
15251565 */
1526-
15271566 if (!P_RIGHTMOST (oopaque ))
15281567 {
15291568 sbuf = _bt_getbuf (rel , oopaque -> btpo_next , BT_WRITE );
0 commit comments