88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.65 2007/04/09 22:03:57 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.66 2007/04/19 20:24:04 tgl Exp $
1212 *
1313 * NOTES
1414 * Postgres hash pages look like ordinary relation pages. The opaque
3636#include "utils/lsyscache.h"
3737
3838
39- static BlockNumber _hash_alloc_buckets (Relation rel , uint32 nblocks );
39+ static bool _hash_alloc_buckets (Relation rel , BlockNumber firstblock ,
40+ uint32 nblocks );
4041static void _hash_splitbucket (Relation rel , Buffer metabuf ,
4142 Bucket obucket , Bucket nbucket ,
4243 BlockNumber start_oblkno ,
@@ -104,8 +105,9 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
104105 * requested buffer and its reference count has been incremented
105106 * (ie, the buffer is "locked and pinned").
106107 *
107- * blkno == P_NEW is allowed, but it is caller's responsibility to
108- * ensure that only one process can extend the index at a time.
108+ * P_NEW is disallowed because this routine should only be used
109+ * to access pages that are known to be before the filesystem EOF.
110+ * Extending the index should be done with _hash_getnewbuf.
109111 *
110112 * All call sites should call either _hash_checkpage or _hash_pageinit
111113 * on the returned page, depending on whether the block is expected
@@ -116,6 +118,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
116118{
117119 Buffer buf ;
118120
121+ if (blkno == P_NEW )
122+ elog (ERROR , "hash AM does not use P_NEW" );
123+
119124 buf = ReadBuffer (rel , blkno );
120125
121126 if (access != HASH_NOLOCK )
@@ -125,6 +130,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
125130 return buf ;
126131}
127132
133+ /*
134+ * _hash_getnewbuf() -- Get a new page at the end of the index.
135+ *
136+ * This has the same API as _hash_getbuf, except that we are adding
137+ * a page to the index, and hence expect the page to be past the
138+ * logical EOF. (However, we have to support the case where it isn't,
139+ * since a prior try might have crashed after extending the filesystem
140+ * EOF but before updating the metapage to reflect the added page.)
141+ *
142+ * It is caller's responsibility to ensure that only one process can
143+ * extend the index at a time.
144+ *
145+ * All call sites should call _hash_pageinit on the returned page.
146+ * Also, it's difficult to imagine why access would not be HASH_WRITE.
147+ */
148+ Buffer
149+ _hash_getnewbuf (Relation rel , BlockNumber blkno , int access )
150+ {
151+ BlockNumber nblocks = RelationGetNumberOfBlocks (rel );
152+ Buffer buf ;
153+
154+ if (blkno == P_NEW )
155+ elog (ERROR , "hash AM does not use P_NEW" );
156+ if (blkno > nblocks )
157+ elog (ERROR , "access to noncontiguous page in hash index \"%s\"" ,
158+ RelationGetRelationName (rel ));
159+
160+ /* smgr insists we use P_NEW to extend the relation */
161+ if (blkno == nblocks )
162+ {
163+ buf = ReadBuffer (rel , P_NEW );
164+ if (BufferGetBlockNumber (buf ) != blkno )
165+ elog (ERROR , "unexpected hash relation size: %u, should be %u" ,
166+ BufferGetBlockNumber (buf ), blkno );
167+ }
168+ else
169+ buf = ReadBuffer (rel , blkno );
170+
171+ if (access != HASH_NOLOCK )
172+ LockBuffer (buf , access );
173+
174+ /* ref count and lock type are correct */
175+ return buf ;
176+ }
177+
128178/*
129179 * _hash_relbuf() -- release a locked buffer.
130180 *
@@ -238,12 +288,11 @@ _hash_metapinit(Relation rel)
238288
239289 /*
240290 * We initialize the metapage, the first two bucket pages, and the
241- * first bitmap page in sequence, using P_NEW to cause smgrextend()
242- * calls to occur. This ensures that the smgr level has the right
243- * idea of the physical index length.
291+ * first bitmap page in sequence, using _hash_getnewbuf to cause
292+ * smgrextend() calls to occur. This ensures that the smgr level
293+ * has the right idea of the physical index length.
244294 */
245- metabuf = _hash_getbuf (rel , P_NEW , HASH_WRITE );
246- Assert (BufferGetBlockNumber (metabuf ) == HASH_METAPAGE );
295+ metabuf = _hash_getnewbuf (rel , HASH_METAPAGE , HASH_WRITE );
247296 pg = BufferGetPage (metabuf );
248297 _hash_pageinit (pg , BufferGetPageSize (metabuf ));
249298
@@ -301,8 +350,7 @@ _hash_metapinit(Relation rel)
301350 */
302351 for (i = 0 ; i <= 1 ; i ++ )
303352 {
304- buf = _hash_getbuf (rel , P_NEW , HASH_WRITE );
305- Assert (BufferGetBlockNumber (buf ) == BUCKET_TO_BLKNO (metap , i ));
353+ buf = _hash_getnewbuf (rel , BUCKET_TO_BLKNO (metap , i ), HASH_WRITE );
306354 pg = BufferGetPage (buf );
307355 _hash_pageinit (pg , BufferGetPageSize (buf ));
308356 pageopaque = (HashPageOpaque ) PageGetSpecialPointer (pg );
@@ -350,7 +398,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
350398 Bucket old_bucket ;
351399 Bucket new_bucket ;
352400 uint32 spare_ndx ;
353- BlockNumber firstblock = InvalidBlockNumber ;
354401 BlockNumber start_oblkno ;
355402 BlockNumber start_nblkno ;
356403 uint32 maxbucket ;
@@ -402,39 +449,15 @@ _hash_expandtable(Relation rel, Buffer metabuf)
402449 if (metap -> hashm_maxbucket >= (uint32 ) 0x7FFFFFFE )
403450 goto fail ;
404451
405- /*
406- * If the split point is increasing (hashm_maxbucket's log base 2
407- * increases), we need to allocate a new batch of bucket pages.
408- */
409- new_bucket = metap -> hashm_maxbucket + 1 ;
410- spare_ndx = _hash_log2 (new_bucket + 1 );
411- if (spare_ndx > metap -> hashm_ovflpoint )
412- {
413- Assert (spare_ndx == metap -> hashm_ovflpoint + 1 );
414- /*
415- * The number of buckets in the new splitpoint is equal to the
416- * total number already in existence, i.e. new_bucket. Currently
417- * this maps one-to-one to blocks required, but someday we may need
418- * a more complicated calculation here.
419- */
420- firstblock = _hash_alloc_buckets (rel , new_bucket );
421- if (firstblock == InvalidBlockNumber )
422- goto fail ; /* can't split due to BlockNumber overflow */
423- }
424-
425452 /*
426453 * Determine which bucket is to be split, and attempt to lock the old
427454 * bucket. If we can't get the lock, give up.
428455 *
429456 * The lock protects us against other backends, but not against our own
430457 * backend. Must check for active scans separately.
431- *
432- * Ideally we would lock the new bucket too before proceeding, but if we
433- * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
434- * correct yet. For simplicity we update the metapage first and then
435- * lock. This should be okay because no one else should be trying to lock
436- * the new bucket yet...
437458 */
459+ new_bucket = metap -> hashm_maxbucket + 1 ;
460+
438461 old_bucket = (new_bucket & metap -> hashm_lowmask );
439462
440463 start_oblkno = BUCKET_TO_BLKNO (metap , old_bucket );
@@ -445,6 +468,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
445468 if (!_hash_try_getlock (rel , start_oblkno , HASH_EXCLUSIVE ))
446469 goto fail ;
447470
471+ /*
472+ * Likewise lock the new bucket (should never fail).
473+ *
474+ * Note: it is safe to compute the new bucket's blkno here, even though
475+ * we may still need to update the BUCKET_TO_BLKNO mapping. This is
476+ * because the current value of hashm_spares[hashm_ovflpoint] correctly
477+ * shows where we are going to put a new splitpoint's worth of buckets.
478+ */
479+ start_nblkno = BUCKET_TO_BLKNO (metap , new_bucket );
480+
481+ if (_hash_has_active_scan (rel , new_bucket ))
482+ elog (ERROR , "scan in progress on supposedly new bucket" );
483+
484+ if (!_hash_try_getlock (rel , start_nblkno , HASH_EXCLUSIVE ))
485+ elog (ERROR , "could not get lock on supposedly new bucket" );
486+
487+ /*
488+ * If the split point is increasing (hashm_maxbucket's log base 2
489+ * increases), we need to allocate a new batch of bucket pages.
490+ */
491+ spare_ndx = _hash_log2 (new_bucket + 1 );
492+ if (spare_ndx > metap -> hashm_ovflpoint )
493+ {
494+ Assert (spare_ndx == metap -> hashm_ovflpoint + 1 );
495+ /*
496+ * The number of buckets in the new splitpoint is equal to the
497+ * total number already in existence, i.e. new_bucket. Currently
498+ * this maps one-to-one to blocks required, but someday we may need
499+ * a more complicated calculation here.
500+ */
501+ if (!_hash_alloc_buckets (rel , start_nblkno , new_bucket ))
502+ {
503+ /* can't split due to BlockNumber overflow */
504+ _hash_droplock (rel , start_oblkno , HASH_EXCLUSIVE );
505+ _hash_droplock (rel , start_nblkno , HASH_EXCLUSIVE );
506+ goto fail ;
507+ }
508+ }
509+
448510 /*
449511 * Okay to proceed with split. Update the metapage bucket mapping info.
450512 *
@@ -477,20 +539,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
477539 metap -> hashm_ovflpoint = spare_ndx ;
478540 }
479541
480- /* now we can compute the new bucket's primary block number */
481- start_nblkno = BUCKET_TO_BLKNO (metap , new_bucket );
482-
483- /* if we added a splitpoint, should match result of _hash_alloc_buckets */
484- if (firstblock != InvalidBlockNumber &&
485- firstblock != start_nblkno )
486- elog (PANIC , "unexpected hash relation size: %u, should be %u" ,
487- firstblock , start_nblkno );
488-
489- Assert (!_hash_has_active_scan (rel , new_bucket ));
490-
491- if (!_hash_try_getlock (rel , start_nblkno , HASH_EXCLUSIVE ))
492- elog (PANIC , "could not get lock on supposedly new bucket" );
493-
494542 /* Done mucking with metapage */
495543 END_CRIT_SECTION ();
496544
@@ -539,7 +587,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
539587 * This does not need to initialize the new bucket pages; we'll do that as
540588 * each one is used by _hash_expandtable(). But we have to extend the logical
541589 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
542- * sync with ours, so that overflow-page allocation works correctly .
590+ * sync with ours, so that we don't get complaints from smgr .
543591 *
544592 * We do this by writing a page of zeroes at the end of the splitpoint range.
545593 * We expect that the filesystem will ensure that the intervening pages read
@@ -554,37 +602,30 @@ _hash_expandtable(Relation rel, Buffer metabuf)
554602 * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
555603 * so it may not be worth worrying about.
556604 *
557- * Returns the first block number in the new splitpoint's range, or
558- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
605+ * Returns TRUE if successful, or FALSE if allocation failed due to
606+ * BlockNumber overflow.
559607 */
560- static BlockNumber
561- _hash_alloc_buckets (Relation rel , uint32 nblocks )
608+ static bool
609+ _hash_alloc_buckets (Relation rel , BlockNumber firstblock , uint32 nblocks )
562610{
563- BlockNumber firstblock ;
564611 BlockNumber lastblock ;
565612 char zerobuf [BLCKSZ ];
566613
567- /*
568- * Since we hold metapage lock, no one else is either splitting or
569- * allocating a new page in _hash_getovflpage(); hence it's safe to
570- * assume that the relation length isn't changing under us.
571- */
572- firstblock = RelationGetNumberOfBlocks (rel );
573614 lastblock = firstblock + nblocks - 1 ;
574615
575616 /*
576617 * Check for overflow in block number calculation; if so, we cannot
577618 * extend the index anymore.
578619 */
579620 if (lastblock < firstblock || lastblock == InvalidBlockNumber )
580- return InvalidBlockNumber ;
621+ return false ;
581622
582623 MemSet (zerobuf , 0 , sizeof (zerobuf ));
583624
584- /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
625+ RelationOpenSmgr ( rel );
585626 smgrextend (rel -> rd_smgr , lastblock , zerobuf , rel -> rd_istemp );
586627
587- return firstblock ;
628+ return true ;
588629}
589630
590631
0 commit comments