 #include "access/htup_details.h"
 #include "nodes/bitmapset.h"
 #include "nodes/tidbitmap.h"
-#include "utils/hsearch.h"
 
 /*
  * The maximum number of tuples per page is not large (typically 256 with
  * for that page in the page table.
  *
  * We actually store both exact pages and lossy chunks in the same hash
- * table, using identical data structures.  (This is because dynahash.c's
- * memory management doesn't allow space to be transferred easily from one
- * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
- * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
- * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
- * remainder operations.  So, define it like this:
+ * table, using identical data structures.  (This is because the memory
+ * management for hashtables doesn't easily allow space to be transferred
+ * from one hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK
+ * is the same as MAX_TUPLES_PER_PAGE, or at least not too different.  But
+ * we also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive
+ * integer remainder operations.  So, define it like this:
  */
 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
 
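For orientation, the chunk arithmetic this constant feeds into (used later in tbm_page_is_lossy() and tbm_mark_page_lossy()) is just a modulus and a subtraction. The helper below is a minimal sketch for illustration only, not part of the patch; with 8 kB blocks, PAGES_PER_CHUNK is 8192 / 32 = 256, so page 1000 falls in the chunk headed by page 768, at bit 232.

/* Illustration only: map an exact page number to its lossy-chunk header. */
static BlockNumber
chunk_header_for(BlockNumber pageno)
{
	int		bitno = pageno % PAGES_PER_CHUNK;	/* bit within the chunk */

	return pageno - bitno;		/* block number of the chunk-header entry */
}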
 typedef struct PagetableEntry
 {
 	BlockNumber blockno;		/* page number (hashtable key) */
+	char		status;			/* hash entry status */
 	bool		ischunk;		/* T = lossy storage, F = exact */
 	bool		recheck;		/* should the tuples be rechecked? */
 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 } PagetableEntry;
 
 /*
- * dynahash.c is optimized for relatively large, long-lived hash tables.
- * This is not ideal for TIDBitMap, particularly when we are using a bitmap
- * scan on the inside of a nestloop join: a bitmap may well live only long
- * enough to accumulate one entry in such cases.  We therefore avoid creating
- * an actual hashtable until we need two pagetable entries.  When just one
- * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
- * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
- * to zero or one page again.  So, status can be TBM_HASH even when nentries
- * is zero or one.)
+ * We want to avoid the overhead of creating the hashtable, which is
+ * comparatively large, when not necessary; this matters particularly when
+ * we are using a bitmap scan on the inside of a nestloop join, where a
+ * bitmap may well live only long enough to accumulate one entry.  We
+ * therefore avoid creating an actual hashtable until we need two pagetable
+ * entries.  When just one pagetable entry is needed, we store it in a fixed
+ * field of TIDBitMap.  (NOTE: we don't get rid of the hashtable if the
+ * bitmap later shrinks down to zero or one page again.  So, status can be
+ * TBM_HASH even when nentries is zero or one.)
  */
 typedef enum
 {
@@ -128,12 +128,13 @@ struct TIDBitmap
 	NodeTag		type;			/* to make it a valid Node */
 	MemoryContext mcxt;			/* memory context containing me */
 	TBMStatus	status;			/* see codes above */
-	HTAB	   *pagetable;		/* hash table of PagetableEntry's */
+	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
 	int			nentries;		/* number of entries in pagetable */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
 	int			nchunks;		/* number of lossy entries in pagetable */
 	bool		iterating;		/* tbm_begin_iterate called? */
+	uint32		lossify_start;	/* offset to start lossifying hashtable at */
 	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
 	/* these are valid when iterating is true: */
 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
@@ -168,6 +169,35 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
 
+/*
+ * Simple inline murmur hash implementation for the exact width required, for
+ * performance.
+ */
+static inline uint32
+hash_blockno(BlockNumber b)
+{
+	uint32		h = b;
+
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+/* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_PREFIX pagetable
+#define SH_ELEMENT_TYPE PagetableEntry
+#define SH_KEY_TYPE BlockNumber
+#define SH_KEY blockno
+#define SH_HASH_KEY(tb, key) hash_blockno(key)
+#define SH_EQUAL(tb, a, b) a == b
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
 
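Including lib/simplehash.h with these parameters generates the struct pagetable_hash type, a pagetable_iterator type, and a family of pagetable_* functions used throughout the rest of the patch. The prototypes below are a rough paraphrase of that generated API, reconstructed from the call sites in this diff rather than copied from the generated header, so exact argument types are an assumption:

/* approximate shape of the simplehash-generated API (for reference only) */
static inline struct pagetable_hash *pagetable_create(MemoryContext ctx, uint32 nelements);
static inline void pagetable_destroy(struct pagetable_hash *tb);
static inline PagetableEntry *pagetable_insert(struct pagetable_hash *tb, BlockNumber key, bool *found);
static inline PagetableEntry *pagetable_lookup(struct pagetable_hash *tb, BlockNumber key);
static inline bool pagetable_delete(struct pagetable_hash *tb, BlockNumber key);
static inline void pagetable_start_iterate(struct pagetable_hash *tb, pagetable_iterator *iter);
static inline void pagetable_start_iterate_at(struct pagetable_hash *tb, pagetable_iterator *iter, uint32 at);
static inline PagetableEntry *pagetable_iterate(struct pagetable_hash *tb, pagetable_iterator *iter);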
 /*
  * tbm_create - create an initially-empty bitmap
@@ -190,17 +220,16 @@ tbm_create(long maxbytes)
 
 	/*
 	 * Estimate number of hashtable entries we can have within maxbytes. This
-	 * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
-	 * pointer per hash entry, which is crude but good enough for our purpose.
-	 * Also count an extra Pointer per entry for the arrays created during
-	 * iteration readout.
+	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
+	 * for our purpose.  Also count an extra Pointer per entry for the arrays
+	 * created during iteration readout.
 	 */
 	nbuckets = maxbytes /
-		(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
-		 + sizeof(Pointer) + sizeof(Pointer));
+		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
 	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
 	nbuckets = Max(nbuckets, 16);	/* sanity limit */
 	tbm->maxentries = (int) nbuckets;
+	tbm->lossify_start = 0;
 
 	return tbm;
 }
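For a rough sense of scale under the new estimate (illustrative figures, not taken from the patch): with 8 kB blocks and 8-byte pointers, sizeof(PagetableEntry) lands around 48 bytes, so the per-entry budget is about

    48 + 8 + 8 = 64 bytes,   giving maxentries ≈ 4 MB / 64 B ≈ 65,000 entries

for a 4 MB maxbytes before tbm_lossify() has to start converting exact pages into lossy chunks.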
@@ -212,32 +241,25 @@ tbm_create(long maxbytes)
 static void
 tbm_create_pagetable(TIDBitmap *tbm)
 {
-	HASHCTL		hash_ctl;
-
 	Assert(tbm->status != TBM_HASH);
 	Assert(tbm->pagetable == NULL);
 
-	/* Create the hashtable proper */
-	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(BlockNumber);
-	hash_ctl.entrysize = sizeof(PagetableEntry);
-	hash_ctl.hcxt = tbm->mcxt;
-	tbm->pagetable = hash_create("TIDBitmap",
-								 128,	/* start small and extend */
-								 &hash_ctl,
-								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+	tbm->pagetable = pagetable_create(tbm->mcxt, 128);
 
 	/* If entry1 is valid, push it into the hashtable */
 	if (tbm->status == TBM_ONE_PAGE)
 	{
 		PagetableEntry *page;
 		bool		found;
+		char		oldstatus;
 
-		page = (PagetableEntry *) hash_search(tbm->pagetable,
-											  (void *) &tbm->entry1.blockno,
-											  HASH_ENTER, &found);
+		page = pagetable_insert(tbm->pagetable,
+								tbm->entry1.blockno,
+								&found);
 		Assert(!found);
+		oldstatus = page->status;
 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+		page->status = oldstatus;
 	}
 
 	tbm->status = TBM_HASH;
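A note on the save/restore of page->status seen above (and repeated in tbm_get_pageentry and tbm_mark_page_lossy below): simplehash keeps its per-slot bookkeeping in the status byte embedded in each PagetableEntry, so code that zeroes or memcpy's over an entry returned by pagetable_insert must put that byte back, or the slot would look free again. Schematically, as a sketch of the pattern rather than additional patch content:

	/* preserve simplehash's bookkeeping byte across a wholesale overwrite */
	char	oldstatus = page->status;

	MemSet(page, 0, sizeof(PagetableEntry));
	page->status = oldstatus;	/* restore before filling in the payload */
	page->blockno = pageno;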
 tbm_free(TIDBitmap *tbm)
 {
 	if (tbm->pagetable)
-		hash_destroy(tbm->pagetable);
+		pagetable_destroy(tbm->pagetable);
 	if (tbm->spages)
 		pfree(tbm->spages);
 	if (tbm->schunks)
@@ -357,12 +379,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
 		tbm_union_page(a, &b->entry1);
 	else
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *bpage;
 
 		Assert(b->status == TBM_HASH);
-		hash_seq_init(&status, b->pagetable);
-		while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(b->pagetable, &i);
+		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
 			tbm_union_page(a, bpage);
 	}
 }
@@ -449,12 +471,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 	}
 	else
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *apage;
 
 		Assert(a->status == TBM_HASH);
-		hash_seq_init(&status, a->pagetable);
-		while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(a->pagetable, &i);
+		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
 		{
 			if (tbm_intersect_page(a, apage, b))
 			{
@@ -464,9 +486,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 				else
 					a->npages--;
 				a->nentries--;
-				if (hash_search(a->pagetable,
-								(void *) &apage->blockno,
-								HASH_REMOVE, NULL) == NULL)
+				if (!pagetable_delete(a->pagetable, apage->blockno))
 					elog(ERROR, "hash table corrupted");
 			}
 		}
@@ -606,7 +626,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
 	 */
 	if (tbm->status == TBM_HASH && !tbm->iterating)
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *page;
 		int			npages;
 		int			nchunks;
@@ -620,9 +640,9 @@ tbm_begin_iterate(TIDBitmap *tbm)
 				MemoryContextAlloc(tbm->mcxt,
 								   tbm->nchunks * sizeof(PagetableEntry *));
 
-		hash_seq_init(&status, tbm->pagetable);
 		npages = nchunks = 0;
-		while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(tbm->pagetable, &i);
+		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 		{
 			if (page->ischunk)
 				tbm->schunks[nchunks++] = page;
@@ -791,9 +811,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 		return page;
 	}
 
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &pageno,
-										  HASH_FIND, NULL);
+	page = pagetable_lookup(tbm->pagetable, pageno);
 	if (page == NULL)
 		return NULL;
 	if (page->ischunk)
@@ -834,15 +852,16 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 		}
 
 		/* Look up or create an entry */
-		page = (PagetableEntry *) hash_search(tbm->pagetable,
-											  (void *) &pageno,
-											  HASH_ENTER, &found);
+		page = pagetable_insert(tbm->pagetable, pageno, &found);
 	}
 
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char		oldstatus = page->status;
+
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = pageno;
 		/* must count it too */
 		tbm->nentries++;
@@ -869,9 +888,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 
 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &chunk_pageno,
-										  HASH_FIND, NULL);
+
+	page = pagetable_lookup(tbm->pagetable, chunk_pageno);
+
 	if (page != NULL && page->ischunk)
 	{
 		int			wordnum = WORDNUM(bitno);
@@ -912,9 +931,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	 */
 	if (bitno != 0)
 	{
-		if (hash_search(tbm->pagetable,
-						(void *) &pageno,
-						HASH_REMOVE, NULL) != NULL)
+		if (pagetable_delete(tbm->pagetable, pageno))
 		{
 			/* It was present, so adjust counts */
 			tbm->nentries--;
@@ -923,14 +940,15 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	}
 
 	/* Look up or create entry for chunk-header page */
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &chunk_pageno,
-										  HASH_ENTER, &found);
+	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
 
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char		oldstatus = page->status;
+
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* must count it too */
@@ -939,8 +957,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	}
 	else if (!page->ischunk)
 	{
+		char		oldstatus = page->status;
+
 		/* chunk header page was formerly non-lossy, make it lossy */
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +983,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 static void
 tbm_lossify(TIDBitmap *tbm)
 {
-	HASH_SEQ_STATUS status;
+	pagetable_iterator i;
 	PagetableEntry *page;
 
 	/*
@@ -977,8 +998,8 @@ tbm_lossify(TIDBitmap *tbm)
 	Assert(!tbm->iterating);
 	Assert(tbm->status == TBM_HASH);
 
-	hash_seq_init(&status, tbm->pagetable);
-	while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
+	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 	{
 		if (page->ischunk)
 			continue;			/* already a chunk header */
@@ -995,15 +1016,19 @@ tbm_lossify(TIDBitmap *tbm)
 
 		if (tbm->nentries <= tbm->maxentries / 2)
 		{
-			/* we have done enough */
-			hash_seq_term(&status);
+			/*
+			 * We have made enough room. Remember where to start lossifying
+			 * next round, so we evenly iterate over the hashtable.
+			 */
+			tbm->lossify_start = i.cur;
 			break;
 		}
 
 		/*
 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
-		 * hashtable.  We can continue the same seq_search scan since we do
-		 * not care whether we visit lossy chunks or not.
+		 * hashtable and may have deleted the non-lossy chunk.  We can
+		 * continue the same hash table scan, since failing to visit one
+		 * element, or visiting the newly inserted element, isn't fatal.
 		 */
 	}
 