 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/smgr.h"
 #include "utils/tqual.h"
 
 
@@ -85,7 +86,6 @@ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
             int keysz, ScanKey scankey);
 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
 
-
 /*
  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
  *
@@ -111,32 +111,121 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     bool        is_unique = false;
     int         natts = rel->rd_rel->relnatts;
     ScanKey     itup_scankey;
-    BTStack     stack;
+    BTStack     stack = NULL;
     Buffer      buf;
     OffsetNumber offset;
+    bool        fastpath;
 
     /* we need an insertion scan key to do our search, so build one */
     itup_scankey = _bt_mkscankey(rel, itup);
 
+    /*
+     * It's very common to have an index on an auto-incremented or
+     * monotonically increasing value. In such cases, every insertion
+     * happens towards the end of the index. We try to optimise that case
+     * by caching the rightmost leaf of the index. If our cached block is
+     * still the rightmost leaf, has enough free space to accommodate a new
+     * entry, and the insertion key is strictly greater than the first key
+     * on this page, then we can safely conclude that the new key will be
+     * inserted in the cached block. So we simply search within the cached
+     * block and insert the key at the appropriate location. We call this
+     * the fastpath.
+     *
+     * Testing has revealed, though, that the fastpath can result in
+     * increased contention on the exclusive lock on the rightmost leaf
+     * page, so we only take it if that lock is immediately available. If
+     * it is not, we simply abandon the fastpath and take the regular path.
+     * This makes sense because unavailability of the lock also signals
+     * that some other backend might be concurrently inserting into the
+     * page, reducing our chances of finding an insertion place on it.
+     */
 top:
-    /* find the first page containing this key */
-    stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL);
-
+    fastpath = false;
     offset = InvalidOffsetNumber;
+    if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+    {
+        Size        itemsz;
+        Page        page;
+        BTPageOpaque lpageop;
 
-    /* trade in our read lock for a write lock */
-    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-    LockBuffer(buf, BT_WRITE);
+        /*
+         * Conditionally acquire an exclusive lock on the buffer before
+         * doing any checks. If we don't get the lock, we simply follow the
+         * slow path. If we do get the lock, it ensures that the index
+         * state cannot change, as far as the rightmost part of the index
+         * is concerned.
+         */
+        buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
 
-    /*
-     * If the page was split between the time that we surrendered our read
-     * lock and acquired our write lock, then this page may no longer be the
-     * right place for the key we want to insert. In this case, we need to
-     * move right in the tree. See Lehman and Yao for an excruciatingly
-     * precise description.
-     */
-    buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
-                        true, stack, BT_WRITE, NULL);
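+        /*
+         * Note: ReadBuffer above only pinned the block; the lock attempt
+         * below is non-blocking, so we never wait on a busy rightmost
+         * page.
+         */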
+        if (ConditionalLockBuffer(buf))
+        {
+            _bt_checkpage(rel, buf);
+
+            page = BufferGetPage(buf);
+
+            lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+            itemsz = IndexTupleSize(itup);
+            itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this
+                                         * but we need to be consistent */
+
+            /*
+             * Check if the page is still the rightmost leaf page, has
+             * enough free space to accommodate the new tuple, no split is
+             * in progress, and the insertion scan key is strictly greater
+             * than the first key on the page.
+             */
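+            /*
+             * (The comparison must be strict: a key merely equal to the
+             * first key on the page could also belong on an earlier page,
+             * so only a strictly greater key is guaranteed to go here.)
+             */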
+            if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
+                !P_INCOMPLETE_SPLIT(lpageop) &&
+                !P_IGNORE(lpageop) &&
+                (PageGetFreeSpace(page) > itemsz) &&
+                PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
+                _bt_compare(rel, natts, itup_scankey, page,
+                            P_FIRSTDATAKEY(lpageop)) > 0)
+            {
+                fastpath = true;
+            }
+            else
+            {
+                _bt_relbuf(rel, buf);
+
+                /*
+                 * Something did not work out. Just forget about the cached
+                 * block and follow the normal path. It might be set again
+                 * if the conditions are favourable.
+                 */
+                RelationSetTargetBlock(rel, InvalidBlockNumber);
+            }
+        }
+        else
+        {
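+            /* we never got the lock, so we hold only a pin; just drop it */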
+            ReleaseBuffer(buf);
+
+            /*
+             * If someone else is holding the lock, the page is likely to
+             * change anyway, so don't try this block again until we cache
+             * an updated rightmost leaf.
+             */
+            RelationSetTargetBlock(rel, InvalidBlockNumber);
+        }
+    }
+
+    if (!fastpath)
+    {
+        /* find the first page containing this key */
+        stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE,
+                           NULL);
+
+        /* trade in our read lock for a write lock */
+        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        LockBuffer(buf, BT_WRITE);
+
+        /*
+         * If the page was split between the time that we surrendered our
+         * read lock and acquired our write lock, then this page may no
+         * longer be the right place for the key we want to insert. In this
+         * case, we need to move right in the tree. See Lehman and Yao for
+         * an excruciatingly precise description.
+         */
+        buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
+                            true, stack, BT_WRITE, NULL);
+    }
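+
+    /*
+     * Whichever path we took, buf is now pinned and exclusively locked on
+     * the page that is to receive the new key.
+     */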
 
     /*
      * If we're not allowing duplicates, make sure the key isn't already in
@@ -184,7 +273,8 @@ _bt_doinsert(Relation rel, IndexTuple itup,
             XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
 
             /* start over... */
-            _bt_freestack(stack);
+            if (stack)
+                _bt_freestack(stack);
             goto top;
         }
     }
@@ -211,7 +301,8 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     }
 
     /* be tidy */
-    _bt_freestack(stack);
+    if (stack)
+        _bt_freestack(stack);
     _bt_freeskey(itup_scankey);
 
     return is_unique;
@@ -879,7 +970,16 @@ _bt_insertonpg(Relation rel,
         XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
 
         if (P_ISLEAF(lpageop))
+        {
             xlinfo = XLOG_BTREE_INSERT_LEAF;
+
+            /*
+             * Cache the block information if we just inserted into the
+             * rightmost leaf page of the index.
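+             *
+             * The cached block is only a hint kept in the backend-local
+             * relation cache; _bt_doinsert revalidates it before use, so a
+             * stale value is harmless.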
+             */
+            if (P_RIGHTMOST(lpageop))
+                RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
+        }
         else
         {
             /*