 * to hash_create. This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
- * A partitioned table uses spinlocks to guard changes of those fields.
+ * (A partitioned table uses multiple copies of those fields, guarded by
+ * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition. We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
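
As the header comment says, partition selection is purely a caller convention. Below is a minimal standalone sketch of that convention; the constant and function names are assumptions for illustration and are not part of the patch:

```c
#include <stdint.h>

/* Illustration-only constant: the partition count must be a power of 2
 * so the low-order bits of the hash value form a valid partition number. */
#define NUM_PARTITIONS 16

/* Map a lookup key's hash value to the partition that covers it. */
static inline uint32_t
partition_for_hashcode(uint32_t hashcode)
{
    return hashcode & (NUM_PARTITIONS - 1);
}
```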
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
 typedef HASHBUCKET *HASHSEGMENT;
 
 /*
- * Using array of FreeListData instead of separate arrays of mutexes, nentries
- * and freeLists prevents, at least partially, sharing one cache line between
- * different mutexes (see below).
+ * Per-freelist data.
+ *
+ * In a partitioned hash table, each freelist is associated with a specific
+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
+ * nentries tracks the number of live hashtable entries having those hashcodes
+ * (NOT the number of entries in the freelist, as you might expect).
+ *
+ * The coverage of a freelist might be more or less than one partition, so it
+ * needs its own lock rather than relying on caller locking. Relying on that
+ * wouldn't work even if the coverage was the same, because of the occasional
+ * need to "borrow" entries from another freelist; see get_hash_entry().
+ *
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists helps to reduce sharing of cache lines between
+ * different mutexes.
  */
 typedef struct
 {
-    slock_t     mutex;          /* spinlock */
-    long        nentries;       /* number of entries */
-    HASHELEMENT *freeList;      /* list of free elements */
+    slock_t     mutex;          /* spinlock for this freelist */
+    long        nentries;       /* number of entries in associated buckets */
+    HASHELEMENT *freeList;      /* chain of free elements */
 } FreeListData;
 
 /*
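
The cache-line rationale in the rewritten comment is easy to demonstrate. A standalone sketch, assuming a 1-byte slock_t (true on most platforms) and 64-byte cache lines:

```c
#include <stdio.h>

typedef unsigned char slock_t;  /* assumption: 1-byte spinlock */

typedef struct
{
    slock_t mutex;
    long    nentries;
    void   *freeList;
} FreeListData;

int
main(void)
{
    /* In a bare slock_t array, adjacent mutexes sit 1 byte apart, so a
     * 64-byte cache line holds dozens of them and backends spinning on
     * different freelists ping-pong the same line.  Packed into
     * FreeListData, the stride is sizeof(FreeListData) (typically 24
     * bytes on LP64), so far fewer mutexes can share a line. */
    printf("bare stride: %zu bytes, packed stride: %zu bytes\n",
           sizeof(slock_t), sizeof(FreeListData));
    return 0;
}
```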
@@ -143,12 +156,14 @@ typedef struct
 struct HASHHDR
 {
     /*
-     * The freelist can become a point of contention on high-concurrency hash
-     * tables, so we use an array of freelist, each with its own mutex and
-     * nentries count, instead of just a single one.
+     * The freelist can become a point of contention in high-concurrency hash
+     * tables, so we use an array of freelists, each with its own mutex and
+     * nentries count, instead of just a single one. Although the freelists
+     * normally operate independently, we will scavenge entries from freelists
+     * other than a hashcode's default freelist when necessary.
      *
-     * If hash table is not partitioned only freeList[0] is used and spinlocks
-     * are not used at all.
+     * If the hash table is not partitioned, only freeList[0] is used and its
+     * spinlock is not used at all; callers' locking is assumed sufficient.
      */
     FreeListData freeList[NUM_FREELISTS];
 
@@ -184,7 +199,7 @@ struct HASHHDR
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
 #define FREELIST_IDX(hctl, hashcode) \
-    (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
 
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
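
The added parentheses close a classic macro-hygiene hole: `%` binds more tightly than `+`, so an expression argument would be mis-grouped. A standalone demonstration, with the IS_PARTITIONED test reduced to a simple nonzero check purely for illustration:

```c
#include <stdio.h>

#define NUM_FREELISTS 32
#define IDX_BAD(h)  ((h) ? h % NUM_FREELISTS : 0)    /* unparenthesized */
#define IDX_GOOD(h) ((h) ? (h) % NUM_FREELISTS : 0)  /* as patched */

int
main(void)
{
    unsigned a = 40, b = 60;

    /* IDX_BAD(a + b) expands to a + (b % NUM_FREELISTS) = 68,
     * while IDX_GOOD(a + b) computes (a + b) % NUM_FREELISTS = 4. */
    printf("%u vs %u\n", IDX_BAD(a + b), IDX_GOOD(a + b));
    return 0;
}
```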
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                 nelem_alloc_first;
 
     /*
-     * If hash table is partitioned all freeLists have equal number of
-     * elements. Otherwise only freeList[0] is used.
+     * If hash table is partitioned, give each freelist an equal share of
+     * the initial allocation. Otherwise only freeList[0] is used.
      */
     if (IS_PARTITIONED(hashp->hctl))
         freelist_partitions = NUM_FREELISTS;
     else
         freelist_partitions = 1;
 
     nelem_alloc = nelem / freelist_partitions;
-    if (nelem_alloc == 0)
+    if (nelem_alloc <= 0)
         nelem_alloc = 1;
 
-    /* Make sure all memory will be used */
+    /*
+     * Make sure we'll allocate all the requested elements; freeList[0]
+     * gets the excess if the request isn't divisible by NUM_FREELISTS.
+     */
     if (nelem_alloc * freelist_partitions < nelem)
         nelem_alloc_first =
             nelem - nelem_alloc * (freelist_partitions - 1);
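
Concretely, with NUM_FREELISTS at its patch value of 32 and nelem = 100: nelem_alloc = 100 / 32 = 3, and since 3 * 32 = 96 < 100, freeList[0] is seeded with 100 - 3 * 31 = 7 elements while the other 31 freelists get 3 each, totalling exactly 100. A standalone sketch of that arithmetic:

```c
#include <stdio.h>

#define NUM_FREELISTS 32        /* value used by the patch */

int
main(void)
{
    long    nelem = 100;        /* requested table size */
    long    nelem_alloc = nelem / NUM_FREELISTS;    /* 3 */
    long    nelem_alloc_first;

    if (nelem_alloc <= 0)
        nelem_alloc = 1;

    /* freeList[0] absorbs the remainder so no requested element is lost */
    if (nelem_alloc * NUM_FREELISTS < nelem)
        nelem_alloc_first = nelem - nelem_alloc * (NUM_FREELISTS - 1);
    else
        nelem_alloc_first = nelem_alloc;

    printf("freeList[0]: %ld, others: %ld each, total: %ld\n",
           nelem_alloc_first, nelem_alloc,
           nelem_alloc_first + nelem_alloc * (NUM_FREELISTS - 1));
    /* prints: freeList[0]: 7, others: 3 each, total: 100 */
    return 0;
}
```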
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
     int         i;
 
     /*
-     * initialize mutex if it's a partitioned table
+     * initialize mutexes if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
         for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
                             bool *foundPtr)
 {
     HASHHDR    *hctl = hashp->hctl;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
     Size        keysize;
     uint32      bucket;
     long        segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
-    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
             if (IS_PARTITIONED(hctl))
                 SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
+            /* delete the record from the appropriate nentries counter. */
             Assert(hctl->freeList[freelist_idx].nentries > 0);
             hctl->freeList[freelist_idx].nentries--;
 
             /* remove record from hash bucket's chain. */
             *prevBucketPtr = currBucket->link;
 
-            /* add the record to the freelist for this table. */
+            /* add the record to the appropriate freelist. */
             currBucket->link = hctl->freeList[freelist_idx].freeList;
             hctl->freeList[freelist_idx].freeList = currBucket;
 
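
This delete path is what backs the remove-then-insert guarantee cited in get_hash_entry() below: the freed element returns to the freelist selected by FREELIST_IDX(hctl, hashvalue). A hypothetical caller sketch (htab, key, hashvalue, and partition_lock are assumed names, and the usual PostgreSQL headers are presumed):

```c
bool    found;
void   *entry;

LWLockAcquire(partition_lock, LW_EXCLUSIVE);

/* Free one entry; it goes back onto this hashcode's freelist... */
(void) hash_search_with_hash_value(htab, &key, hashvalue,
                                   HASH_REMOVE, &found);

/* ...so this insert can reuse it even if shared memory is full. */
entry = hash_search_with_hash_value(htab, &key, hashvalue,
                                    HASH_ENTER, &found);

LWLockRelease(partition_lock);
```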
@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
 }
 
 /*
- * create a new entry if possible
+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
+ * (Or, if the underlying space allocator throws error for out-of-memory,
+ * we won't return at all.)
  */
 static HASHBUCKET
 get_hash_entry(HTAB *hashp, int freelist_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
-    int         borrow_from_idx;
 
     for (;;)
     {
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
         if (IS_PARTITIONED(hctl))
             SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        /* no free elements. allocate another chunk of buckets */
+        /*
+         * No free elements in this freelist. In a partitioned table, there
+         * might be entries in other freelists, but to reduce contention we
+         * prefer to first try to get another chunk of buckets from the main
+         * shmem allocator. If that fails, though, we *MUST* root through all
+         * the other freelists before giving up. There are multiple callers
+         * that assume that they can allocate every element in the initially
+         * requested table size, or that deleting an element guarantees they
+         * can insert a new element, even if shared memory is entirely full.
+         * Failing because the needed element is in a different freelist is
+         * not acceptable.
+         */
         if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
+            int         borrow_from_idx;
+
             if (!IS_PARTITIONED(hctl))
                 return NULL;    /* out of memory */
 
-            /* try to borrow element from another partition */
+            /* try to borrow element from another freelist */
             borrow_from_idx = freelist_idx;
             for (;;)
             {
                 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
                 if (borrow_from_idx == freelist_idx)
-                    break;
+                    break;      /* examined all freelists, fail */
 
                 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
                 newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
                     hctl->freeList[borrow_from_idx].freeList = newElement->link;
                     SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 
+                    /* careful: count the new element in its proper freelist */
                     SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
                     hctl->freeList[freelist_idx].nentries++;
                     SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-                    break;
+                    return newElement;
                 }
 
                 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
             }
 
-            return newElement;
+            /* no elements available to borrow either, so out of memory */
+            return NULL;
         }
     }
 
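
The borrowing loop's visit order can be restated compactly. A simplified sketch of just the scan, reusing the function's own hctl and freelist_idx names (the real code additionally pops the borrowed element and credits it to the proper nentries counter, as the hunks above show):

```c
/* Visit every other freelist exactly once, starting just after our own
 * and wrapping around; stop early if one has a spare element. */
int idx = (freelist_idx + 1) % NUM_FREELISTS;

for (; idx != freelist_idx; idx = (idx + 1) % NUM_FREELISTS)
{
    SpinLockAcquire(&(hctl->freeList[idx].mutex));
    if (hctl->freeList[idx].freeList != NULL)
        break;                  /* found one; pop it under idx's mutex */
    SpinLockRelease(&(hctl->freeList[idx].mutex));
}
/* idx == freelist_idx here means every freelist was empty */
```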
@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
     long        sum = hashp->hctl->freeList[0].nentries;
 
     /*
-     * We currently don't bother with the mutex; it's only sensible to call
-     * this function if you've got lock on all partitions of the table.
+     * We currently don't bother with acquiring the mutexes; it's only
+     * sensible to call this function if you've got lock on all partitions of
+     * the table.
      */
-
-    if (!IS_PARTITIONED(hashp->hctl))
-        return sum;
-
-    for (i = 1; i < NUM_FREELISTS; i++)
-        sum += hashp->hctl->freeList[i].nentries;
+    if (IS_PARTITIONED(hashp->hctl))
+    {
+        for (i = 1; i < NUM_FREELISTS; i++)
+            sum += hashp->hctl->freeList[i].nentries;
+    }
 
     return sum;
 }
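
Since the sum is taken without acquiring the freelist mutexes, a caller wanting an exact answer must quiesce the table first. A hypothetical usage sketch (partition_locks and NUM_LOCK_PARTITIONS are assumed caller-side names, echoing the first sketch above):

```c
/* Count entries exactly by holding every partition lock while summing. */
long
count_all_entries(HTAB *htab)
{
    long    n;
    int     i;

    for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
        LWLockAcquire(partition_locks[i], LW_SHARED);

    n = hash_get_num_entries(htab);

    for (i = NUM_LOCK_PARTITIONS - 1; i >= 0; i--)
        LWLockRelease(partition_locks[i]);

    return n;
}
```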