@@ -74,7 +74,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
7474 int leaffillfactor , bool * usemult );
7575static bool _bt_adjacenthtid (ItemPointer lowhtid , ItemPointer highhtid );
7676static OffsetNumber _bt_bestsplitloc (FindSplitData * state , int perfectpenalty ,
77- bool * newitemonleft );
77+ bool * newitemonleft , FindSplitStrat strategy );
7878static int _bt_strategy (FindSplitData * state , SplitPoint * leftpage ,
7979 SplitPoint * rightpage , FindSplitStrat * strategy );
8080static void _bt_interval_edges (FindSplitData * state ,
@@ -214,7 +214,7 @@ _bt_findsplitloc(Relation rel,
214214 * split can be newitem. Record a split after the previous data item
215215 * from original page, but before the current data item from original
216216 * page. (_bt_recsplitloc() will reject the split when there are no
217- * previous data items, which we rely on.)
217+ * previous items, which we rely on.)
218218 */
219219 if (offnum < newitemoff )
220220 _bt_recsplitloc (& state , offnum , false, olddataitemstoleft , itemsz );
@@ -361,10 +361,11 @@ _bt_findsplitloc(Relation rel,
361361 * fillfactormult, in order to deal with pages with a large number of
362362 * duplicates gracefully.
363363 *
364- * Pass low and high splits for the entire page (including even newitem).
365- * These are used when the initial split interval encloses split points
366- * that are full of duplicates, and we need to consider if it's even
367- * possible to avoid appending a heap TID.
364+ * Pass low and high splits for the entire page (actually, they're for an
365+ * imaginary version of the page that includes newitem). These are used
366+ * when the initial split interval encloses split points that are full of
367+ * duplicates, and we need to consider if it's even possible to avoid
368+ * appending a heap TID.
368369 */
369370 perfectpenalty = _bt_strategy (& state , & leftpage , & rightpage , & strategy );
370371
@@ -381,12 +382,15 @@ _bt_findsplitloc(Relation rel,
381382 * appended, but the page isn't completely full of logical duplicates.
382383 *
383384 * The split interval is widened to include all legal candidate split
384- * points. There may be a few as two distinct values in the whole-page
385- * split interval. Many duplicates strategy has no hard requirements for
386- * space utilization, though it still keeps the use of space balanced as a
387- * non-binding secondary goal (perfect penalty is set so that the
388- * first/lowest delta split points that avoids appending a heap TID is
389- * used).
385+ * points. There might be as few as two distinct values in the whole-page
386+ * split interval, though it's also possible that most of the values on
387+ * the page are unique. The final split point will either be to the
388+ * immediate left or to the immediate right of the group of duplicate
389+ * tuples that enclose the first/delta-optimal split point (perfect
390+ * penalty was set so that the lowest delta split point that avoids
391+ * appending a heap TID will be chosen). Maximizing the number of
392+ * attributes that can be truncated away is not a goal of the many
393+ * duplicates strategy.
390394 *
391395 * Single value strategy is used when it is impossible to avoid appending
392396 * a heap TID. It arranges to leave the left page very full. This
@@ -399,6 +403,9 @@ _bt_findsplitloc(Relation rel,
399403 else if (strategy == SPLIT_MANY_DUPLICATES )
400404 {
401405 Assert (state .is_leaf );
406+ /* Shouldn't try to truncate away extra user attributes */
407+ Assert (perfectpenalty ==
408+ IndexRelationGetNumberOfKeyAttributes (state .rel ));
402409 /* No need to resort splits -- no change in fillfactormult/deltas */
403410 state .interval = state .nsplits ;
404411 }
@@ -419,7 +426,8 @@ _bt_findsplitloc(Relation rel,
419426 * the entry that has the lowest penalty, and is therefore expected to
420427 * maximize fan-out. Sets *newitemonleft for us.
421428 */
422- foundfirstright = _bt_bestsplitloc (& state , perfectpenalty , newitemonleft );
429+ foundfirstright = _bt_bestsplitloc (& state , perfectpenalty , newitemonleft ,
430+ strategy );
423431 pfree (state .splits );
424432
425433 return foundfirstright ;
@@ -753,11 +761,13 @@ _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
753761 * page, plus a boolean indicating if new item is on left of split point.
754762 */
755763static OffsetNumber
756- _bt_bestsplitloc (FindSplitData * state , int perfectpenalty , bool * newitemonleft )
764+ _bt_bestsplitloc (FindSplitData * state , int perfectpenalty ,
765+ bool * newitemonleft , FindSplitStrat strategy )
757766{
758767 int bestpenalty ,
759768 lowsplit ;
760769 int highsplit = Min (state -> interval , state -> nsplits );
770+ SplitPoint * final ;
761771
762772 bestpenalty = INT_MAX ;
763773 lowsplit = 0 ;
@@ -781,8 +791,37 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft)
781791 }
782792 }
783793
784- * newitemonleft = state -> splits [lowsplit ].newitemonleft ;
785- return state -> splits [lowsplit ].firstoldonright ;
794+ final = & state -> splits [lowsplit ];
795+
796+ /*
797+ * There is a risk that the "many duplicates" strategy will repeatedly do
798+ * the wrong thing when there are monotonically decreasing insertions to
799+ * the right of a large group of duplicates. Repeated splits could leave
800+ * a succession of right half pages with free space that can never be
801+ * used. This must be avoided.
802+ *
803+ * Consider the example of the leftmost page in a single integer attribute
804+ * NULLS FIRST index which is almost filled with NULLs. Monotonically
805+ * decreasing integer insertions might cause the same leftmost page to
806+ * split repeatedly at the same point. Each split derives its new high
807+ * key from the lowest current value to the immediate right of the large
808+ * group of NULLs, which will always be higher than all future integer
809+ * insertions, directing all future integer insertions to the same
810+ * leftmost page.
811+ */
812+ if (strategy == SPLIT_MANY_DUPLICATES && !state -> is_rightmost &&
813+ !final -> newitemonleft && final -> firstoldonright >= state -> newitemoff &&
814+ final -> firstoldonright < state -> newitemoff + MAX_LEAF_INTERVAL )
815+ {
816+ /*
817+ * Avoid the problem by performing a 50:50 split when the new item is
818+ * just to the left of the would-be "many duplicates" split point.
819+ */
820+ final = & state -> splits [0 ];
821+ }
822+
823+ * newitemonleft = final -> newitemonleft ;
824+ return final -> firstoldonright ;
786825}
787826
788827/*
@@ -859,17 +898,16 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
859898 * strategy = SPLIT_MANY_DUPLICATES ;
860899
861900 /*
862- * Caller should choose the lowest delta split that avoids appending a
863- * heap TID. Maximizing the number of attributes that can be
864- * truncated away (returning perfectpenalty when it happens to be less
865- * than the number of key attributes in index) can result in continual
866- * unbalanced page splits.
901+ * Many duplicates strategy should split at either side of the group of
902+ * duplicates that enclose the delta-optimal split point. Return
903+ * indnkeyatts rather than the true perfect penalty to make that
904+ * happen. (If perfectpenalty was returned here then low cardinality
905+ * composite indexes could have continual unbalanced splits.)
867906 *
868- * Just avoiding appending a heap TID can still make splits very
869- * unbalanced, but this is self-limiting. When final split has a very
870- * high delta, one side of the split will likely consist of a single
871- * value. If that page is split once again, then that split will
872- * likely use the single value strategy.
907+ * Note that caller won't go through with a many duplicates split in
908+ * rare cases where it looks like there are ever-decreasing insertions
909+ * to the immediate right of the split point. This must happen just
910+ * before a final decision is made, within _bt_bestsplitloc().
873911 */
874912 return indnkeyatts ;
875913 }
0 commit comments