PostgreSQL Source Code git master
tidbitmap.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
5 *
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
14 *
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
21 *
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
30 *
31 *
32 * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 *
34 * IDENTIFICATION
35 * src/backend/nodes/tidbitmap.c
36 *
37 *-------------------------------------------------------------------------
38 */
39#include "postgres.h"
40
41#include <limits.h>
42
43#include "access/htup_details.h"
44#include "common/hashfn.h"
45#include "common/int.h"
46#include "nodes/bitmapset.h"
47#include "nodes/tidbitmap.h"
48#include "storage/lwlock.h"
49#include "utils/dsa.h"
50
51/*
52 * When we have to switch over to lossy storage, we use a data structure
53 * with one bit per page, where all pages having the same number DIV
54 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
55 * and has the bit set for a given page, there must not be a per-page entry
56 * for that page in the page table.
57 *
58 * We actually store both exact pages and lossy chunks in the same hash
59 * table, using identical data structures. (This is because the memory
60 * management for hashtables doesn't easily/efficiently allow space to be
61 * transferred easily from one hashtable to another.) Therefore it's best
62 * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
63 * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
64 * avoid expensive integer remainder operations. So, define it like this:
65 */
66#define PAGES_PER_CHUNK (BLCKSZ / 32)
67
68/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
69
70#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
71#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
72
73/* number of active words for an exact page: */
74#define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
75/* number of active words for a lossy chunk: */
76#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
77
78/*
79 * The hashtable entries are represented by this data structure. For
80 * an exact page, blockno is the page number and bit k of the bitmap
81 * represents tuple offset k+1. For a lossy chunk, blockno is the first
82 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
83 * bit k represents page blockno+k. Note that it is not possible to
84 * have exact storage for the first page of a chunk if we are using
85 * lossy storage for any page in the chunk's range, since the same
86 * hashtable entry has to serve both purposes.
87 *
88 * recheck is used only on exact pages --- it indicates that although
89 * only the stated tuples need be checked, the full index qual condition
90 * must be checked for each (ie, these are candidate matches).
91 */
92typedef struct PagetableEntry
 93{
 94 BlockNumber blockno; /* page number (hashtable key) */
 95 char status; /* hash entry status */
 96 bool ischunk; /* T = lossy storage, F = exact */
 97 bool recheck; /* should the tuples be rechecked? */
/*
 * NOTE(review): the extraction dropped original lines 98-99 here (numbering
 * jumps 97 -> 100); they presumably held the bitmapword words[] payload
 * array and the closing "} PagetableEntry;" -- restore from upstream.
 */
100
101/*
102 * Holds array of pagetable entries.
103 */
104typedef struct PTEntryArray
105{
106 pg_atomic_uint32 refcount; /* no. of iterator attached */
/*
 * NOTE(review): original lines 107-108 are missing (numbering jumps
 * 106 -> 109); based on later uses of "ptbase->ptentry" they presumably
 * held the flexible PagetableEntry array member and "} PTEntryArray;"
 * -- confirm against the upstream file.
 */
109
110/*
111 * We want to avoid the overhead of creating the hashtable, which is
112 * comparatively large, when not necessary. Particularly when we are using a
113 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
114 * long enough to accumulate one entry in such cases. We therefore avoid
115 * creating an actual hashtable until we need two pagetable entries. When
116 * just one pagetable entry is needed, we store it in a fixed field of
117 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
118 * shrinks down to zero or one page again. So, status can be TBM_HASH even
119 * when nentries is zero or one.)
120 */
/* Storage state of a TIDBitmap; see the explanatory comment above. */
121typedef enum
122{
123 TBM_EMPTY, /* no hashtable, nentries == 0 */
124 TBM_ONE_PAGE, /* entry1 contains the single entry */
125 TBM_HASH, /* pagetable is valid, entry1 is not */
126} TBMStatus;
127
128/*
129 * Current iterating state of the TBM.
130 */
131typedef enum
132{
133 TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
134 TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
135 TBM_ITERATING_SHARED, /* converted to shared page and chunk array */
/* NOTE(review): closing "} TBMIteratingState;" (original line ~136) was
 * lost in extraction; the name is used below at the "iterating" field. */
137
138/*
139 * Here is the representation for a whole TIDBitMap:
140 */
/* NOTE(review): the "struct TIDBitmap" tag line (original 141) is missing
 * from this copy; the struct body below is otherwise intact. */
142{
143 NodeTag type; /* to make it a valid Node */
144 MemoryContext mcxt; /* memory context containing me */
145 TBMStatus status; /* see codes above */
146 struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
147 int nentries; /* number of entries in pagetable */
148 int maxentries; /* limit on same to meet maxbytes */
149 int npages; /* number of exact entries in pagetable */
150 int nchunks; /* number of lossy entries in pagetable */
151 TBMIteratingState iterating; /* tbm_begin_iterate called? */
152 uint32 lossify_start; /* offset to start lossifying hashtable at */
153 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
154 /* these are valid when iterating is true: */
155 PagetableEntry **spages; /* sorted exact-page list, or NULL */
156 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
157 dsa_pointer dsapagetable; /* dsa_pointer to the element array */
158 dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
159 dsa_pointer ptpages; /* dsa_pointer to the page array */
160 dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
161 dsa_area *dsa; /* reference to per-query dsa area */
162};
163
164/*
165 * When iterating over a backend-local bitmap in sorted order, a
166 * TBMPrivateIterator is used to track our progress. There can be several
167 * iterators scanning the same bitmap concurrently. Note that the bitmap
168 * becomes read-only as soon as any iterator is created.
169 */
/* NOTE(review): the "struct TBMPrivateIterator" tag line (original 170) is
 * missing from this copy. */
171{
172 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
173 int spageptr; /* next spages index */
174 int schunkptr; /* next schunks index */
175 int schunkbit; /* next bit to check in current schunk */
176};
177
178/*
179 * Holds the shared members of the iterator so that multiple processes
180 * can jointly iterate.
181 */
/* NOTE(review): the "typedef struct TBMSharedIteratorState" head line
 * (original ~182) is missing from this copy. */
183{
184 int nentries; /* number of entries in pagetable */
185 int maxentries; /* limit on same to meet maxbytes */
186 int npages; /* number of exact entries in pagetable */
187 int nchunks; /* number of lossy entries in pagetable */
188 dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
189 dsa_pointer spages; /* dsa pointer to page array */
190 dsa_pointer schunks; /* dsa pointer to chunk array */
191 LWLock lock; /* lock to protect below members */
192 int spageptr; /* next spages index */
193 int schunkptr; /* next schunks index */
194 int schunkbit; /* next bit to check in current schunk */
/* NOTE(review): the closing "} TBMSharedIteratorState;" (original ~195)
 * was also lost in extraction. */
196
197/*
198 * pagetable iteration array.
199 */
200typedef struct PTIterationArray
201{
202 pg_atomic_uint32 refcount; /* no. of iterator attached */
203 int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
/* NOTE(review): closing "} PTIterationArray;" (original 204) missing from
 * this copy. */
205
206/*
207 * same as TBMPrivateIterator, but it is used for joint iteration, therefore
208 * this also holds a reference to the shared state.
209 */
/* NOTE(review): the "struct TBMSharedIterator" tag line (original ~210) is
 * missing from this copy. */
211{
212 TBMSharedIteratorState *state; /* shared state */
213 PTEntryArray *ptbase; /* pagetable element array */
214 PTIterationArray *ptpages; /* sorted exact page index list */
215 PTIterationArray *ptchunks; /* sorted lossy page index list */
216};
217
218/* Local function prototypes */
219static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
220static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
221 const TIDBitmap *b);
222static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
223 BlockNumber pageno);
225static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
226static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
227static void tbm_lossify(TIDBitmap *tbm);
228static int tbm_comparator(const void *left, const void *right);
229static int tbm_shared_comparator(const void *left, const void *right,
230 void *arg);
231
232/* define hashtable mapping block numbers to PagetableEntry's */
233#define SH_USE_NONDEFAULT_ALLOCATOR
234#define SH_PREFIX pagetable
235#define SH_ELEMENT_TYPE PagetableEntry
236#define SH_KEY_TYPE BlockNumber
237#define SH_KEY blockno
238#define SH_HASH_KEY(tb, key) murmurhash32(key)
239#define SH_EQUAL(tb, a, b) a == b
240#define SH_SCOPE static inline
241#define SH_DEFINE
242#define SH_DECLARE
243#include "lib/simplehash.h"
244
245
246/*
247 * tbm_create - create an initially-empty bitmap
248 *
249 * The bitmap will live in the memory context that is CurrentMemoryContext
250 * at the time of this call. It will be limited to (approximately) maxbytes
251 * total memory consumption. If the DSA passed to this function is not NULL
252 * then the memory for storing elements of the underlying page table will
253 * be allocated from the DSA.
254 */
255TIDBitmap *
256tbm_create(Size maxbytes, dsa_area *dsa)
257{
258 TIDBitmap *tbm;
259
260 /* Create the TIDBitmap struct and zero all its fields */
261 tbm = makeNode(TIDBitmap);
262
/* NOTE(review): original line 263 is missing here; given the header comment
 * ("will live in ... CurrentMemoryContext") it presumably set tbm->mcxt --
 * confirm against upstream. */
264 tbm->status = TBM_EMPTY;
265
266 tbm->maxentries = tbm_calculate_entries(maxbytes);
267 tbm->lossify_start = 0;
268 tbm->dsa = dsa;
/* NOTE(review): original lines 269-272 are missing (numbering jumps
 * 268 -> 273); likely initialization of the dsa_pointer fields. */
273
274 return tbm;
275}
276
277/*
278 * Actually create the hashtable. Since this is a moderately expensive
279 * proposition, we don't do it until we have to.
280 */
281static void
/* NOTE(review): the function-name line (original 282) is missing; presumably
 * "tbm_create_pagetable(TIDBitmap *tbm)" -- confirm against upstream. */
283{
284 Assert(tbm->status != TBM_HASH);
285 Assert(tbm->pagetable == NULL);
286
287 tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
288
289 /* If entry1 is valid, push it into the hashtable */
290 if (tbm->status == TBM_ONE_PAGE)
291 {
292 PagetableEntry *page;
293 bool found;
294 char oldstatus;
295
296 page = pagetable_insert(tbm->pagetable,
297 tbm->entry1.blockno,
298 &found);
299 Assert(!found);
/* The hash table owns the entry's status byte; preserve it across the
 * wholesale copy of entry1 into the new slot. */
300 oldstatus = page->status;
301 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
302 page->status = oldstatus;
303 }
304
305 tbm->status = TBM_HASH;
306}
307
308/*
309 * tbm_free - free a TIDBitmap
310 */
311void
/* NOTE(review): the function-name line (original 312) is missing; presumably
 * "tbm_free(TIDBitmap *tbm)". Any iterators still pointing at this bitmap
 * become dangling (see tbm_begin_private_iterate's header comment). */
313{
314 if (tbm->pagetable)
315 pagetable_destroy(tbm->pagetable);
316 if (tbm->spages)
317 pfree(tbm->spages);
318 if (tbm->schunks)
319 pfree(tbm->schunks);
320 pfree(tbm);
321}
322
323/*
324 * tbm_free_shared_area - free shared state
325 *
326 * Free the shared iterator state. Also free the shared pagetable and
327 * iterator arrays' memory if they are no longer referred to by any shared
328 * iterator, i.e. their refcount becomes 0.
329 */
330void
/* NOTE(review): the function-name line (original 331) is missing; from the
 * body it presumably reads "tbm_free_shared_area(dsa_area *dsa, dsa_pointer
 * dp)" -- confirm against nodes/tidbitmap.h. */
332{
333 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
334 PTEntryArray *ptbase;
335 PTIterationArray *ptpages;
336 PTIterationArray *ptchunks;
337
/* Drop one reference on each shared array; free it only when we were the
 * last referent (refcount reaches zero). */
338 if (DsaPointerIsValid(istate->pagetable))
339 {
340 ptbase = dsa_get_address(dsa, istate->pagetable);
341 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
342 dsa_free(dsa, istate->pagetable);
343 }
344 if (DsaPointerIsValid(istate->spages))
345 {
346 ptpages = dsa_get_address(dsa, istate->spages);
347 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
348 dsa_free(dsa, istate->spages);
349 }
350 if (DsaPointerIsValid(istate->schunks))
351 {
352 ptchunks = dsa_get_address(dsa, istate->schunks);
353 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
354 dsa_free(dsa, istate->schunks);
355 }
356
357 dsa_free(dsa, dp);
358}
359
360/*
361 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
362 *
363 * If recheck is true, then the recheck flag will be set in the
364 * TBMIterateResult when any of these tuples are reported out.
365 */
366void
367tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids,
368 bool recheck)
369{
/* NOTE(review): extraction dropped original lines 370, 374 and 377-378;
 * the body below references "currblk", "blk" and "off" with no visible
 * declarations, so those lines presumably declared currblk
 * (InvalidBlockNumber initially) and extracted blk/off from tids[i] --
 * restore from upstream before compiling. */
371 PagetableEntry *page = NULL; /* only valid when currblk is valid */
372 int i;
373
375 for (i = 0; i < ntids; i++)
376 {
379 int wordnum,
380 bitnum;
381
382 /* safety check to ensure we don't overrun bit array bounds */
383 if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
384 elog(ERROR, "tuple offset out of range: %u", off);
385
386 /*
387 * Look up target page unless we already did. This saves cycles when
388 * the input includes consecutive tuples on the same page, which is
389 * common enough to justify an extra test here.
390 */
391 if (blk != currblk)
392 {
393 if (tbm_page_is_lossy(tbm, blk))
394 page = NULL; /* remember page is lossy */
395 else
396 page = tbm_get_pageentry(tbm, blk);
397 currblk = blk;
398 }
399
400 if (page == NULL)
401 continue; /* whole page is already marked */
402
403 if (page->ischunk)
404 {
405 /* The page is a lossy chunk header, set bit for itself */
406 wordnum = bitnum = 0;
407 }
408 else
409 {
410 /* Page is exact, so set bit for individual tuple */
411 wordnum = WORDNUM(off - 1);
412 bitnum = BITNUM(off - 1);
413 }
414 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
415 page->recheck |= recheck;
416
417 if (tbm->nentries > tbm->maxentries)
418 {
419 tbm_lossify(tbm);
420 /* Page could have been converted to lossy, so force new lookup */
421 currblk = InvalidBlockNumber;
422 }
423 }
424}
425
426/*
427 * tbm_add_page - add a whole page to a TIDBitmap
428 *
429 * This causes the whole page to be reported (with the recheck flag)
430 * when the TIDBitmap is scanned.
431 */
432void
/* NOTE(review): the function-name line (original 433) is missing; from the
 * body it presumably reads "tbm_add_page(TIDBitmap *tbm, BlockNumber
 * pageno)" -- confirm against nodes/tidbitmap.h. */
434{
435 /* Enter the page in the bitmap, or mark it lossy if already present */
436 tbm_mark_page_lossy(tbm, pageno);
437 /* If we went over the memory limit, lossify some more pages */
438 if (tbm->nentries > tbm->maxentries)
439 tbm_lossify(tbm);
440}
441
442/*
443 * tbm_union - set union
444 *
445 * a is modified in-place, b is not changed
446 */
447void
/* NOTE(review): the function-name line (original 448) is missing; given the
 * header comment it presumably reads "tbm_union(TIDBitmap *a, const
 * TIDBitmap *b)" -- confirm against nodes/tidbitmap.h. */
449{
450 Assert(!a->iterating);
451 /* Nothing to do if b is empty */
452 if (b->nentries == 0)
453 return;
454 /* Scan through chunks and pages in b, merge into a */
455 if (b->status == TBM_ONE_PAGE)
456 tbm_union_page(a, &b->entry1);
457 else
458 {
459 pagetable_iterator i;
460 PagetableEntry *bpage;
461
462 Assert(b->status == TBM_HASH);
463 pagetable_start_iterate(b->pagetable, &i);
464 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
465 tbm_union_page(a, bpage);
466 }
467}
468
469/* Process one page of b during a union op */
470static void
/* NOTE(review): the function-name line (original 471) is missing; the
 * prototype earlier in this file gives it as
 * "tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)". */
472{
473 PagetableEntry *apage;
474 int wordnum;
475
476 if (bpage->ischunk)
477 {
478 /* Scan b's chunk, mark each indicated page lossy in a */
479 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
480 {
481 bitmapword w = bpage->words[wordnum];
482
483 if (w != 0)
484 {
485 BlockNumber pg;
486
487 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
488 while (w != 0)
489 {
490 if (w & 1)
/* NOTE(review): original line 491 was dropped in extraction -- as shown,
 * "if (w & 1)" governs "pg++", which cannot be right (the set bit would do
 * nothing and pg would advance only for set bits). Per the comment at 478,
 * the lost statement presumably marked page pg lossy in a (cf.
 * tbm_mark_page_lossy) -- restore from upstream. */
492 pg++;
493 w >>= 1;
494 }
495 }
496 }
497 }
498 else if (tbm_page_is_lossy(a, bpage->blockno))
499 {
500 /* page is already lossy in a, nothing to do */
501 return;
502 }
503 else
504 {
505 apage = tbm_get_pageentry(a, bpage->blockno);
506 if (apage->ischunk)
507 {
508 /* The page is a lossy chunk header, set bit for itself */
509 apage->words[0] |= ((bitmapword) 1 << 0);
510 }
511 else
512 {
513 /* Both pages are exact, merge at the bit level */
514 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
515 apage->words[wordnum] |= bpage->words[wordnum];
516 apage->recheck |= bpage->recheck;
517 }
518 }
519
520 if (a->nentries > a->maxentries)
521 tbm_lossify(a);
522}
523
524/*
525 * tbm_intersect - set intersection
526 *
527 * a is modified in-place, b is not changed
528 */
529void
/* NOTE(review): the function-name line (original 530) is missing; given the
 * header comment it presumably reads "tbm_intersect(TIDBitmap *a, const
 * TIDBitmap *b)" -- confirm against nodes/tidbitmap.h. */
531{
532 Assert(!a->iterating);
533 /* Nothing to do if a is empty */
534 if (a->nentries == 0)
535 return;
536 /* Scan through chunks and pages in a, try to match to b */
537 if (a->status == TBM_ONE_PAGE)
538 {
539 if (tbm_intersect_page(a, &a->entry1, b))
540 {
541 /* Page is now empty, remove it from a */
542 Assert(!a->entry1.ischunk);
543 a->npages--;
544 a->nentries--;
545 Assert(a->nentries == 0);
546 a->status = TBM_EMPTY;
547 }
548 }
549 else
550 {
551 pagetable_iterator i;
552 PagetableEntry *apage;
553
554 Assert(a->status == TBM_HASH);
555 pagetable_start_iterate(a->pagetable, &i);
556 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
557 {
558 if (tbm_intersect_page(a, apage, b))
559 {
560 /* Page or chunk is now empty, remove it from a */
561 if (apage->ischunk)
562 a->nchunks--;
563 else
564 a->npages--;
565 a->nentries--;
566 if (!pagetable_delete(a->pagetable, apage->blockno))
567 elog(ERROR, "hash table corrupted");
568 }
569 }
570 }
571}
572
573/*
574 * Process one page of a during an intersection op
575 *
576 * Returns true if apage is now empty and should be deleted from a
577 */
578static bool
580{
581 const PagetableEntry *bpage;
582 int wordnum;
583
584 if (apage->ischunk)
585 {
586 /* Scan each bit in chunk, try to clear */
587 bool candelete = true;
588
589 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
590 {
591 bitmapword w = apage->words[wordnum];
592
593 if (w != 0)
594 {
595 bitmapword neww = w;
596 BlockNumber pg;
597 int bitnum;
598
599 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
600 bitnum = 0;
601 while (w != 0)
602 {
603 if (w & 1)
604 {
605 if (!tbm_page_is_lossy(b, pg) &&
606 tbm_find_pageentry(b, pg) == NULL)
607 {
608 /* Page is not in b at all, lose lossy bit */
609 neww &= ~((bitmapword) 1 << bitnum);
610 }
611 }
612 pg++;
613 bitnum++;
614 w >>= 1;
615 }
616 apage->words[wordnum] = neww;
617 if (neww != 0)
618 candelete = false;
619 }
620 }
621 return candelete;
622 }
623 else if (tbm_page_is_lossy(b, apage->blockno))
624 {
625 /*
626 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
627 * because the page 'b' is lossy, we don't know which ones. Therefore
628 * we mark 'a' as requiring rechecks, to indicate that at most those
629 * tuples set in 'a' are matches.
630 */
631 apage->recheck = true;
632 return false;
633 }
634 else
635 {
636 bool candelete = true;
637
638 bpage = tbm_find_pageentry(b, apage->blockno);
639 if (bpage != NULL)
640 {
641 /* Both pages are exact, merge at the bit level */
642 Assert(!bpage->ischunk);
643 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
644 {
645 apage->words[wordnum] &= bpage->words[wordnum];
646 if (apage->words[wordnum] != 0)
647 candelete = false;
648 }
649 apage->recheck |= bpage->recheck;
650 }
651 /* If there is no matching b page, we can just delete the a page */
652 return candelete;
653 }
654}
655
656/*
657 * tbm_is_empty - is a TIDBitmap completely empty?
658 */
659bool
/* NOTE(review): the function-name line (original 660) is missing; presumably
 * "tbm_is_empty(const TIDBitmap *tbm)" -- confirm against nodes/tidbitmap.h. */
661{
662 return (tbm->nentries == 0);
663}
664
665/*
666 * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
667 *
668 * The TBMPrivateIterator struct is created in the caller's memory context.
669 * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
670 * it's okay to just allow the memory context to be released, too. It is
671 * caller's responsibility not to touch the TBMPrivateIterator anymore once
672 * the TIDBitmap is freed.
673 *
674 * NB: after this is called, it is no longer allowed to modify the contents
675 * of the bitmap. However, you can call this multiple times to scan the
676 * contents repeatedly, including parallel scans.
677 */
/* NOTE(review): the return-type/name lines (original 678-679, presumably
 * "TBMPrivateIterator * / tbm_begin_private_iterate(TIDBitmap *tbm)") and
 * scattered body lines 683, 714, 718, 734, 737 and 740 were lost in
 * extraction -- the truncated MemoryContextAlloc calls and qsort calls
 * below are the visible symptoms. Restore from upstream. */
680{
681 TBMPrivateIterator *iterator;
682
684
685 /*
686 * Create the TBMPrivateIterator struct, with enough trailing space to
687 * serve the needs of the TBMIterateResult sub-struct.
688 */
689 iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
690 iterator->tbm = tbm;
691
692 /*
693 * Initialize iteration pointers.
694 */
695 iterator->spageptr = 0;
696 iterator->schunkptr = 0;
697 iterator->schunkbit = 0;
698
699 /*
700 * If we have a hashtable, create and fill the sorted page lists, unless
701 * we already did that for a previous iterator. Note that the lists are
702 * attached to the bitmap not the iterator, so they can be used by more
703 * than one iterator.
704 */
705 if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
706 {
707 pagetable_iterator i;
708 PagetableEntry *page;
709 int npages;
710 int nchunks;
711
712 if (!tbm->spages && tbm->npages > 0)
713 tbm->spages = (PagetableEntry **)
715 tbm->npages * sizeof(PagetableEntry *));
716 if (!tbm->schunks && tbm->nchunks > 0)
717 tbm->schunks = (PagetableEntry **)
719 tbm->nchunks * sizeof(PagetableEntry *));
720
721 npages = nchunks = 0;
722 pagetable_start_iterate(tbm->pagetable, &i);
723 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
724 {
725 if (page->ischunk)
726 tbm->schunks[nchunks++] = page;
727 else
728 tbm->spages[npages++] = page;
729 }
730 Assert(npages == tbm->npages);
731 Assert(nchunks == tbm->nchunks);
732 if (npages > 1)
733 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
735 if (nchunks > 1)
736 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
738 }
739
741
742 return iterator;
743}
744
745/*
746 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
747 *
748 * The necessary shared state will be allocated from the DSA passed to
749 * tbm_create, so that multiple processes can attach to it and iterate jointly.
750 *
751 * This will convert the pagetable hash into page and chunk array of the index
752 * into pagetable array.
753 */
/* NOTE(review): the return-type/name lines (original 754-755, presumably
 * "dsa_pointer / tbm_prepare_shared_iterate(TIDBitmap *tbm)") are missing,
 * as are lines 758 (the istate declaration used below), 764, 847, 850, 875
 * and 889 (the qsort_arg comparator arguments, the ptbase refcount bump,
 * and setting tbm->iterating). Restore from upstream before compiling. */
756{
757 dsa_pointer dp;
759 PTEntryArray *ptbase = NULL;
760 PTIterationArray *ptpages = NULL;
761 PTIterationArray *ptchunks = NULL;
762
763 Assert(tbm->dsa != NULL);
765
766 /*
767 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
768 * lock, this will also be used by multiple worker for shared iterate.
769 */
770 dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
771 istate = dsa_get_address(tbm->dsa, dp);
772
773 /*
774 * If we're not already iterating, create and fill the sorted page lists.
775 * (If we are, the sorted page lists are already stored in the TIDBitmap,
776 * and we can just reuse them.)
777 */
778 if (tbm->iterating == TBM_NOT_ITERATING)
779 {
780 pagetable_iterator i;
781 PagetableEntry *page;
782 int idx;
783 int npages;
784 int nchunks;
785
786 /*
787 * Allocate the page and chunk array memory from the DSA to share
788 * across multiple processes.
789 */
790 if (tbm->npages)
791 {
792 tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
793 tbm->npages * sizeof(int));
794 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
795 pg_atomic_init_u32(&ptpages->refcount, 0);
796 }
797 if (tbm->nchunks)
798 {
799 tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
800 tbm->nchunks * sizeof(int));
801 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
802 pg_atomic_init_u32(&ptchunks->refcount, 0);
803 }
804
805 /*
806 * If TBM status is TBM_HASH then iterate over the pagetable and
807 * convert it to page and chunk arrays. But if it's in the
808 * TBM_ONE_PAGE mode then directly allocate the space for one entry
809 * from the DSA.
810 */
811 npages = nchunks = 0;
812 if (tbm->status == TBM_HASH)
813 {
814 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
815
816 pagetable_start_iterate(tbm->pagetable, &i);
817 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
818 {
819 idx = page - ptbase->ptentry;
820 if (page->ischunk)
821 ptchunks->index[nchunks++] = idx;
822 else
823 ptpages->index[npages++] = idx;
824 }
825
826 Assert(npages == tbm->npages);
827 Assert(nchunks == tbm->nchunks);
828 }
829 else if (tbm->status == TBM_ONE_PAGE)
830 {
831 /*
832 * In one page mode allocate the space for one pagetable entry,
833 * initialize it, and directly store its index (i.e. 0) in the
834 * page array.
835 */
836 tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
837 sizeof(PagetableEntry));
838 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
839 memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
840 ptpages->index[0] = 0;
841 }
842
843 if (ptbase != NULL)
844 pg_atomic_init_u32(&ptbase->refcount, 0);
845 if (npages > 1)
846 qsort_arg(ptpages->index, npages, sizeof(int),
848 if (nchunks > 1)
849 qsort_arg(ptchunks->index, nchunks, sizeof(int),
851 }
852
853 /*
854 * Store the TBM members in the shared state so that we can share them
855 * across multiple processes.
856 */
857 istate->nentries = tbm->nentries;
858 istate->maxentries = tbm->maxentries;
859 istate->npages = tbm->npages;
860 istate->nchunks = tbm->nchunks;
861 istate->pagetable = tbm->dsapagetable;
862 istate->spages = tbm->ptpages;
863 istate->schunks = tbm->ptchunks;
864
865 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
866 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
867 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
868
869 /*
870 * For every shared iterator referring to pagetable and iterator array,
871 * increase the refcount by 1 so that while freeing the shared iterator we
872 * don't free pagetable and iterator array until its refcount becomes 0.
873 */
874 if (ptbase != NULL)
876 if (ptpages != NULL)
877 pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
878 if (ptchunks != NULL)
879 pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
880
881 /* Initialize the iterator lock */
882 LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
883
884 /* Initialize the shared iterator state */
885 istate->schunkbit = 0;
886 istate->schunkptr = 0;
887 istate->spageptr = 0;
888
890
891 return dp;
892}
893
894/*
895 * tbm_extract_page_tuple - extract the tuple offsets from a page
896 *
897 * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
898 * fills in as many offsets as fit and returns the total number of offsets in
899 * the page.
900 */
901int
/* NOTE(review): the function-name line (original 902, presumably
 * "tbm_extract_page_tuple(TBMIterateResult *iteritem,") is missing from
 * this copy -- restore from upstream. */
 903 OffsetNumber *offsets,
 904 uint32 max_offsets)
 905{
906 PagetableEntry *page = iteritem->internal_page;
907 int wordnum;
908 int ntuples = 0;
909
910 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
911 {
912 bitmapword w = page->words[wordnum];
913
914 if (w != 0)
915 {
/* bit k of words[wordnum] encodes tuple offset wordnum*BPW + k + 1 */
916 int off = wordnum * BITS_PER_BITMAPWORD + 1;
917
918 while (w != 0)
919 {
920 if (w & 1)
921 {
/* Keep counting past max_offsets so the caller can detect overflow,
 * per the header comment -- but stop storing. */
922 if (ntuples < max_offsets)
923 offsets[ntuples] = (OffsetNumber) off;
924 ntuples++;
925 }
926 off++;
927 w >>= 1;
928 }
929 }
930 }
931
932 return ntuples;
933}
934
935/*
936 * tbm_advance_schunkbit - Advance the schunkbit
937 */
938static inline void
939tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
940{
941 int schunkbit = *schunkbitp;
942
943 while (schunkbit < PAGES_PER_CHUNK)
944 {
945 int wordnum = WORDNUM(schunkbit);
946 int bitnum = BITNUM(schunkbit);
947
948 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
949 break;
950 schunkbit++;
951 }
952
953 *schunkbitp = schunkbit;
954}
955
956/*
957 * tbm_private_iterate - scan through next page of a TIDBitmap
958 *
959 * Caller must pass in a TBMIterateResult to be filled.
960 *
961 * Pages are guaranteed to be delivered in numerical order.
962 *
963 * Returns false when there are no more pages to scan and true otherwise. When
964 * there are no more pages to scan, tbmres->blockno is set to
965 * InvalidBlockNumber.
966 *
967 * If lossy is true, then the bitmap is "lossy" and failed to remember
968 * the exact tuples to look at on this page --- the caller must examine all
969 * tuples on the page and check if they meet the intended condition. If lossy
970 * is false, the caller must later extract the tuple offsets from the page
971 * pointed to by internal_page with tbm_extract_page_tuple.
972 *
973 * If tbmres->recheck is true, only the indicated tuples need be examined, but
974 * the condition must be rechecked anyway. (For ease of testing, recheck is
975 * always set true when lossy is true.)
976 */
977bool
/* NOTE(review): the function-name line (original 978, presumably
 * "tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult
 * *tbmres)") and the statement on original line 983 are missing from this
 * copy -- restore from upstream. */
979{
980 TIDBitmap *tbm = iterator->tbm;
981
983
984 /*
985 * If lossy chunk pages remain, make sure we've advanced schunkptr/
986 * schunkbit to the next set bit.
987 */
988 while (iterator->schunkptr < tbm->nchunks)
989 {
990 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
991 int schunkbit = iterator->schunkbit;
992
993 tbm_advance_schunkbit(chunk, &schunkbit);
994 if (schunkbit < PAGES_PER_CHUNK)
995 {
996 iterator->schunkbit = schunkbit;
997 break;
998 }
999 /* advance to next chunk */
1000 iterator->schunkptr++;
1001 iterator->schunkbit = 0;
1002 }
1003
1004 /*
1005 * If both chunk and per-page data remain, must output the numerically
1006 * earlier page.
1007 */
1008 if (iterator->schunkptr < tbm->nchunks)
1009 {
1010 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1011 BlockNumber chunk_blockno;
1012
1013 chunk_blockno = chunk->blockno + iterator->schunkbit;
1014 if (iterator->spageptr >= tbm->npages ||
1015 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1016 {
1017 /* Return a lossy page indicator from the chunk */
1018 tbmres->blockno = chunk_blockno;
1019 tbmres->lossy = true;
1020 tbmres->recheck = true;
1021 tbmres->internal_page = NULL;
1022 iterator->schunkbit++;
1023 return true;
1024 }
1025 }
1026
1027 if (iterator->spageptr < tbm->npages)
1028 {
1029 PagetableEntry *page;
1030
1031 /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
1032 if (tbm->status == TBM_ONE_PAGE)
1033 page = &tbm->entry1;
1034 else
1035 page = tbm->spages[iterator->spageptr];
1036
1037 tbmres->internal_page = page;
1038 tbmres->blockno = page->blockno;
1039 tbmres->lossy = false;
1040 tbmres->recheck = page->recheck;
1041 iterator->spageptr++;
1042 return true;
1043 }
1044
1045 /* Nothing more in the bitmap */
1046 tbmres->blockno = InvalidBlockNumber;
1047 return false;
1048}
1049
1050/*
1051 * tbm_shared_iterate - scan through next page of a TIDBitmap
1052 *
1053 * As above, but this will iterate using an iterator which is shared
1054 * across multiple processes. We need to acquire the iterator LWLock,
1055 * before accessing the shared members.
1056 */
1057bool
/* NOTE(review): the function-name line (original 1058, presumably
 * "tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult
 * *tbmres)") is missing from this copy -- restore from upstream. */
1059{
1060 TBMSharedIteratorState *istate = iterator->state;
1061 PagetableEntry *ptbase = NULL;
1062 int *idxpages = NULL;
1063 int *idxchunks = NULL;
1064
/* These backend-local pointers were resolved at attach time; only the
 * positional state below lives in shared memory and needs the lock. */
1065 if (iterator->ptbase != NULL)
1066 ptbase = iterator->ptbase->ptentry;
1067 if (iterator->ptpages != NULL)
1068 idxpages = iterator->ptpages->index;
1069 if (iterator->ptchunks != NULL)
1070 idxchunks = iterator->ptchunks->index;
1071
1072 /* Acquire the LWLock before accessing the shared members */
1073 LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1074
1075 /*
1076 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1077 * schunkbit to the next set bit.
1078 */
1079 while (istate->schunkptr < istate->nchunks)
1080 {
1081 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1082 int schunkbit = istate->schunkbit;
1083
1084 tbm_advance_schunkbit(chunk, &schunkbit);
1085 if (schunkbit < PAGES_PER_CHUNK)
1086 {
1087 istate->schunkbit = schunkbit;
1088 break;
1089 }
1090 /* advance to next chunk */
1091 istate->schunkptr++;
1092 istate->schunkbit = 0;
1093 }
1094
1095 /*
1096 * If both chunk and per-page data remain, must output the numerically
1097 * earlier page.
1098 */
1099 if (istate->schunkptr < istate->nchunks)
1100 {
1101 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1102 BlockNumber chunk_blockno;
1103
1104 chunk_blockno = chunk->blockno + istate->schunkbit;
1105
1106 if (istate->spageptr >= istate->npages ||
1107 chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1108 {
1109 /* Return a lossy page indicator from the chunk */
1110 tbmres->blockno = chunk_blockno;
1111 tbmres->lossy = true;
1112 tbmres->recheck = true;
1113 tbmres->internal_page = NULL;
1114 istate->schunkbit++;
1115
1116 LWLockRelease(&istate->lock);
1117 return true;
1118 }
1119 }
1120
1121 if (istate->spageptr < istate->npages)
1122 {
1123 PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1124
1125 tbmres->internal_page = page;
1126 tbmres->blockno = page->blockno;
1127 tbmres->lossy = false;
1128 tbmres->recheck = page->recheck;
1129 istate->spageptr++;
1130
1131 LWLockRelease(&istate->lock);
1132
1133 return true;
1134 }
1135
1136 LWLockRelease(&istate->lock);
1137
1138 /* Nothing more in the bitmap */
1139 tbmres->blockno = InvalidBlockNumber;
1140 return false;
1141}
1142
1143/*
1144 * tbm_end_private_iterate - finish an iteration over a TIDBitmap
1145 *
1146 * Currently this is just a pfree, but it might do more someday. (For
1147 * instance, it could be useful to count open iterators and allow the
1148 * bitmap to return to read/write status when there are no more iterators.)
1149 */
1150void
1152{
1153 pfree(iterator);
1154}
1155
1156/*
1157 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1158 *
1159 * This doesn't free any of the shared state associated with the iterator,
1160 * just our backend-private state.
1161 */
1162void
1164{
1165 pfree(iterator);
1166}
1167
1168/*
1169 * tbm_find_pageentry - find a PagetableEntry for the pageno
1170 *
1171 * Returns NULL if there is no non-lossy entry for the pageno.
1172 */
1173static const PagetableEntry *
1175{
1176 const PagetableEntry *page;
1177
1178 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1179 return NULL;
1180
1181 if (tbm->status == TBM_ONE_PAGE)
1182 {
1183 page = &tbm->entry1;
1184 if (page->blockno != pageno)
1185 return NULL;
1186 Assert(!page->ischunk);
1187 return page;
1188 }
1189
1190 page = pagetable_lookup(tbm->pagetable, pageno);
1191 if (page == NULL)
1192 return NULL;
1193 if (page->ischunk)
1194 return NULL; /* don't want a lossy chunk header */
1195 return page;
1196}
1197
1198/*
1199 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1200 *
1201 * If new, the entry is marked as an exact (non-chunk) entry.
1202 *
1203 * This may cause the table to exceed the desired memory size. It is
1204 * up to the caller to call tbm_lossify() at the next safe point if so.
1205 */
1206static PagetableEntry *
1208{
1209 PagetableEntry *page;
1210 bool found;
1211
1212 if (tbm->status == TBM_EMPTY)
1213 {
1214 /* Use the fixed slot */
1215 page = &tbm->entry1;
1216 found = false;
1217 tbm->status = TBM_ONE_PAGE;
1218 }
1219 else
1220 {
1221 if (tbm->status == TBM_ONE_PAGE)
1222 {
1223 page = &tbm->entry1;
1224 if (page->blockno == pageno)
1225 return page;
1226 /* Time to switch from one page to a hashtable */
1228 }
1229
1230 /* Look up or create an entry */
1231 page = pagetable_insert(tbm->pagetable, pageno, &found);
1232 }
1233
1234 /* Initialize it if not present before */
1235 if (!found)
1236 {
1237 char oldstatus = page->status;
1238
1239 MemSet(page, 0, sizeof(PagetableEntry));
1240 page->status = oldstatus;
1241 page->blockno = pageno;
1242 /* must count it too */
1243 tbm->nentries++;
1244 tbm->npages++;
1245 }
1246
1247 return page;
1248}
1249
1250/*
1251 * tbm_page_is_lossy - is the page marked as lossily stored?
1252 */
1253static bool
1255{
1256 PagetableEntry *page;
1257 BlockNumber chunk_pageno;
1258 int bitno;
1259
1260 /* we can skip the lookup if there are no lossy chunks */
1261 if (tbm->nchunks == 0)
1262 return false;
1263 Assert(tbm->status == TBM_HASH);
1264
1265 bitno = pageno % PAGES_PER_CHUNK;
1266 chunk_pageno = pageno - bitno;
1267
1268 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1269
1270 if (page != NULL && page->ischunk)
1271 {
1272 int wordnum = WORDNUM(bitno);
1273 int bitnum = BITNUM(bitno);
1274
1275 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1276 return true;
1277 }
1278 return false;
1279}
1280
1281/*
1282 * tbm_mark_page_lossy - mark the page number as lossily stored
1283 *
1284 * This may cause the table to exceed the desired memory size. It is
1285 * up to the caller to call tbm_lossify() at the next safe point if so.
1286 */
1287static void
1289{
1290 PagetableEntry *page;
1291 bool found;
1292 BlockNumber chunk_pageno;
1293 int bitno;
1294 int wordnum;
1295 int bitnum;
1296
1297 /* We force the bitmap into hashtable mode whenever it's lossy */
1298 if (tbm->status != TBM_HASH)
1300
1301 bitno = pageno % PAGES_PER_CHUNK;
1302 chunk_pageno = pageno - bitno;
1303
1304 /*
1305 * Remove any extant non-lossy entry for the page. If the page is its own
1306 * chunk header, however, we skip this and handle the case below.
1307 */
1308 if (bitno != 0)
1309 {
1310 if (pagetable_delete(tbm->pagetable, pageno))
1311 {
1312 /* It was present, so adjust counts */
1313 tbm->nentries--;
1314 tbm->npages--; /* assume it must have been non-lossy */
1315 }
1316 }
1317
1318 /* Look up or create entry for chunk-header page */
1319 page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1320
1321 /* Initialize it if not present before */
1322 if (!found)
1323 {
1324 char oldstatus = page->status;
1325
1326 MemSet(page, 0, sizeof(PagetableEntry));
1327 page->status = oldstatus;
1328 page->blockno = chunk_pageno;
1329 page->ischunk = true;
1330 /* must count it too */
1331 tbm->nentries++;
1332 tbm->nchunks++;
1333 }
1334 else if (!page->ischunk)
1335 {
1336 char oldstatus = page->status;
1337
1338 /* chunk header page was formerly non-lossy, make it lossy */
1339 MemSet(page, 0, sizeof(PagetableEntry));
1340 page->status = oldstatus;
1341 page->blockno = chunk_pageno;
1342 page->ischunk = true;
1343 /* we assume it had some tuple bit(s) set, so mark it lossy */
1344 page->words[0] = ((bitmapword) 1 << 0);
1345 /* adjust counts */
1346 tbm->nchunks++;
1347 tbm->npages--;
1348 }
1349
1350 /* Now set the original target page's bit */
1351 wordnum = WORDNUM(bitno);
1352 bitnum = BITNUM(bitno);
1353 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1354}
1355
1356/*
1357 * tbm_lossify - lose some information to get back under the memory limit
1358 */
1359static void
1361{
1362 pagetable_iterator i;
1363 PagetableEntry *page;
1364
1365 /*
1366 * XXX Really stupid implementation: this just lossifies pages in
1367 * essentially random order. We should be paying some attention to the
1368 * number of bits set in each page, instead.
1369 *
1370 * Since we are called as soon as nentries exceeds maxentries, we should
1371 * push nentries down to significantly less than maxentries, or else we'll
1372 * just end up doing this again very soon. We shoot for maxentries/2.
1373 */
1375 Assert(tbm->status == TBM_HASH);
1376
1377 pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1378 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1379 {
1380 if (page->ischunk)
1381 continue; /* already a chunk header */
1382
1383 /*
1384 * If the page would become a chunk header, we won't save anything by
1385 * converting it to lossy, so skip it.
1386 */
1387 if ((page->blockno % PAGES_PER_CHUNK) == 0)
1388 continue;
1389
1390 /* This does the dirty work ... */
1391 tbm_mark_page_lossy(tbm, page->blockno);
1392
1393 if (tbm->nentries <= tbm->maxentries / 2)
1394 {
1395 /*
1396 * We have made enough room. Remember where to start lossifying
1397 * next round, so we evenly iterate over the hashtable.
1398 */
1399 tbm->lossify_start = i.cur;
1400 break;
1401 }
1402
1403 /*
1404 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1405 * hashtable and may have deleted the non-lossy chunk. We can
1406 * continue the same hash table scan, since failure to visit one
1407 * element or visiting the newly inserted element, isn't fatal.
1408 */
1409 }
1410
1411 /*
1412 * With a big bitmap and small work_mem, it's possible that we cannot get
1413 * under maxentries. Again, if that happens, we'd end up uselessly
1414 * calling tbm_lossify over and over. To prevent this from becoming a
1415 * performance sink, force maxentries up to at least double the current
1416 * number of entries. (In essence, we're admitting inability to fit
1417 * within work_mem when we do this.) Note that this test will not fire if
1418 * we broke out of the loop early; and if we didn't, the current number of
1419 * entries is simply not reducible any further.
1420 */
1421 if (tbm->nentries > tbm->maxentries / 2)
1422 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1423}
1424
1425/*
1426 * qsort comparator to handle PagetableEntry pointers.
1427 */
1428static int
1429tbm_comparator(const void *left, const void *right)
1430{
1431 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1432 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1433
1434 return pg_cmp_u32(l, r);
1435}
1436
1437/*
1438 * As above, but this will get index into PagetableEntry array. Therefore,
1439 * it needs to get actual PagetableEntry using the index before comparing the
1440 * blockno.
1441 */
1442static int
1443tbm_shared_comparator(const void *left, const void *right, void *arg)
1444{
1445 PagetableEntry *base = (PagetableEntry *) arg;
1446 PagetableEntry *lpage = &base[*(int *) left];
1447 PagetableEntry *rpage = &base[*(int *) right];
1448
1449 if (lpage->blockno < rpage->blockno)
1450 return -1;
1451 else if (lpage->blockno > rpage->blockno)
1452 return 1;
1453 return 0;
1454}
1455
1456/*
1457 * tbm_attach_shared_iterate
1458 *
1459 * Allocate a backend-private iterator and attach the shared iterator state
1460 * to it so that multiple processed can iterate jointly.
1461 *
1462 * We also converts the DSA pointers to local pointers and store them into
1463 * our private iterator.
1464 */
1467{
1468 TBMSharedIterator *iterator;
1469 TBMSharedIteratorState *istate;
1470
1471 /*
1472 * Create the TBMSharedIterator struct, with enough trailing space to
1473 * serve the needs of the TBMIterateResult sub-struct.
1474 */
1475 iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
1476
1477 istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1478
1479 iterator->state = istate;
1480
1481 iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1482
1483 if (istate->npages)
1484 iterator->ptpages = dsa_get_address(dsa, istate->spages);
1485 if (istate->nchunks)
1486 iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1487
1488 return iterator;
1489}
1490
1491/*
1492 * pagetable_allocate
1493 *
1494 * Callback function for allocating the memory for hashtable elements.
1495 * Allocate memory for hashtable elements, using DSA if available.
1496 */
1497static inline void *
1498pagetable_allocate(pagetable_hash *pagetable, Size size)
1499{
1500 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1501 PTEntryArray *ptbase;
1502
1503 if (tbm->dsa == NULL)
1504 return MemoryContextAllocExtended(pagetable->ctx, size,
1506
1507 /*
1508 * Save the dsapagetable reference in dsapagetableold before allocating
1509 * new memory so that pagetable_free can free the old entry.
1510 */
1511 tbm->dsapagetableold = tbm->dsapagetable;
1513 sizeof(PTEntryArray) + size,
1515 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1516
1517 return ptbase->ptentry;
1518}
1519
1520/*
1521 * pagetable_free
1522 *
1523 * Callback function for freeing hash table elements.
1524 */
1525static inline void
1526pagetable_free(pagetable_hash *pagetable, void *pointer)
1527{
1528 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1529
1530 /* pfree the input pointer if DSA is not available */
1531 if (tbm->dsa == NULL)
1532 pfree(pointer);
1533 else if (DsaPointerIsValid(tbm->dsapagetableold))
1534 {
1535 dsa_free(tbm->dsa, tbm->dsapagetableold);
1537 }
1538}
1539
1540/*
1541 * tbm_calculate_entries
1542 *
1543 * Estimate number of hashtable entries we can have within maxbytes.
1544 */
1545int
1547{
1548 Size nbuckets;
1549
1550 /*
1551 * Estimate number of hashtable entries we can have within maxbytes. This
1552 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1553 * for our purpose. Also count an extra Pointer per entry for the arrays
1554 * created during iteration readout.
1555 */
1556 nbuckets = maxbytes /
1557 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1558 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1559 nbuckets = Max(nbuckets, 16); /* sanity limit */
1560
1561 return (int) nbuckets;
1562}
1563
1564/*
1565 * Create a shared or private bitmap iterator and start iteration.
1566 *
1567 * `tbm` is only used to create the private iterator and dsa and dsp are only
1568 * used to create the shared iterator.
1569 *
1570 * Before invoking tbm_begin_iterate() to create a shared iterator, one
1571 * process must already have invoked tbm_prepare_shared_iterate() to create
1572 * and set up the TBMSharedIteratorState.
1573 */
1576{
1577 TBMIterator iterator = {0};
1578
1579 /* Allocate a private iterator and attach the shared state to it */
1580 if (DsaPointerIsValid(dsp))
1581 {
1582 iterator.shared = true;
1583 iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1584 }
1585 else
1586 {
1587 iterator.shared = false;
1589 }
1590
1591 return iterator;
1592}
1593
1594/*
1595 * Clean up shared or private bitmap iterator.
1596 */
1597void
1599{
1600 Assert(iterator && !tbm_exhausted(iterator));
1601
1602 if (iterator->shared)
1604 else
1606
1607 *iterator = (TBMIterator)
1608 {
1609 0
1610 };
1611}
1612
1613/*
1614 * Populate the next TBMIterateResult using the shared or private bitmap
1615 * iterator. Returns false when there is nothing more to scan.
1616 */
1617bool
1619{
1620 Assert(iterator);
1621 Assert(tbmres);
1622
1623 if (iterator->shared)
1624 return tbm_shared_iterate(iterator->i.shared_iterator, tbmres);
1625 else
1626 return tbm_private_iterate(iterator->i.private_iterator, tbmres);
1627}
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:262
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:437
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:219
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:422
uint32 bitmapword
Definition: bitmapset.h:44
#define BITS_PER_BITMAPWORD
Definition: bitmapset.h:43
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define Min(x, y)
Definition: c.h:1008
#define Max(x, y)
Definition: c.h:1002
char * Pointer
Definition: c.h:534
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:475
uint32_t uint32
Definition: c.h:543
#define MemSet(start, val, len)
Definition: c.h:1024
size_t Size
Definition: c.h:615
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:957
dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags)
Definition: dsa.c:686
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:841
#define dsa_allocate0(area, size)
Definition: dsa.h:113
uint64 dsa_pointer
Definition: dsa.h:62
#define dsa_allocate(area, size)
Definition: dsa.h:109
#define InvalidDsaPointer
Definition: dsa.h:78
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:28
Assert(PointerIsAligned(start, uint64))
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:652
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:698
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:1286
NodeTag
Definition: nodes.h:27
#define makeNode(_type_)
Definition: nodes.h:161
uint16 OffsetNumber
Definition: off.h:24
void * arg
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
#define qsort(a, b, c, d)
Definition: port.h:479
Definition: lwlock.h:42
pg_atomic_uint32 refcount
Definition: tidbitmap.c:106
PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:107
pg_atomic_uint32 refcount
Definition: tidbitmap.c:202
int index[FLEXIBLE_ARRAY_MEMBER]
Definition: tidbitmap.c:203
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]
Definition: tidbitmap.c:98
BlockNumber blockno
Definition: tidbitmap.c:94
void * internal_page
Definition: tidbitmap.h:78
BlockNumber blockno
Definition: tidbitmap.h:63
TBMSharedIterator * shared_iterator
Definition: tidbitmap.h:56
union TBMIterator::@111 i
TBMPrivateIterator * private_iterator
Definition: tidbitmap.h:55
bool shared
Definition: tidbitmap.h:52
TIDBitmap * tbm
Definition: tidbitmap.c:172
dsa_pointer pagetable
Definition: tidbitmap.c:188
dsa_pointer spages
Definition: tidbitmap.c:189
dsa_pointer schunks
Definition: tidbitmap.c:190
TBMSharedIteratorState * state
Definition: tidbitmap.c:212
PTEntryArray * ptbase
Definition: tidbitmap.c:213
PTIterationArray * ptchunks
Definition: tidbitmap.c:215
PTIterationArray * ptpages
Definition: tidbitmap.c:214
dsa_pointer ptpages
Definition: tidbitmap.c:159
TBMIteratingState iterating
Definition: tidbitmap.c:151
int nentries
Definition: tidbitmap.c:147
dsa_pointer dsapagetableold
Definition: tidbitmap.c:158
struct pagetable_hash * pagetable
Definition: tidbitmap.c:146
PagetableEntry ** schunks
Definition: tidbitmap.c:156
MemoryContext mcxt
Definition: tidbitmap.c:144
int npages
Definition: tidbitmap.c:149
int maxentries
Definition: tidbitmap.c:148
uint32 lossify_start
Definition: tidbitmap.c:152
int nchunks
Definition: tidbitmap.c:150
NodeTag type
Definition: tidbitmap.c:143
dsa_pointer ptchunks
Definition: tidbitmap.c:160
TBMStatus status
Definition: tidbitmap.c:145
PagetableEntry entry1
Definition: tidbitmap.c:153
dsa_area * dsa
Definition: tidbitmap.c:161
dsa_pointer dsapagetable
Definition: tidbitmap.c:157
PagetableEntry ** spages
Definition: tidbitmap.c:155
Definition: dsa.c:348
Definition: type.h:96
void tbm_free(TIDBitmap *tbm)
Definition: tidbitmap.c:312
struct PTIterationArray PTIterationArray
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1618
bool tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:1058
#define WORDNUM(x)
Definition: tidbitmap.c:70
static void tbm_lossify(TIDBitmap *tbm)
Definition: tidbitmap.c:1360
bool tbm_is_empty(const TIDBitmap *tbm)
Definition: tidbitmap.c:660
void tbm_end_iterate(TBMIterator *iterator)
Definition: tidbitmap.c:1598
#define WORDS_PER_PAGE
Definition: tidbitmap.c:74
TBMIteratingState
Definition: tidbitmap.c:132
@ TBM_ITERATING_SHARED
Definition: tidbitmap.c:135
@ TBM_NOT_ITERATING
Definition: tidbitmap.c:133
@ TBM_ITERATING_PRIVATE
Definition: tidbitmap.c:134
void tbm_end_shared_iterate(TBMSharedIterator *iterator)
Definition: tidbitmap.c:1163
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1254
static void * pagetable_allocate(pagetable_hash *pagetable, Size size)
Definition: tidbitmap.c:1498
dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:755
void tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:530
void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:331
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1288
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
Definition: tidbitmap.c:471
#define BITNUM(x)
Definition: tidbitmap.c:71
static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1174
bool tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres)
Definition: tidbitmap.c:978
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:433
static void tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
Definition: tidbitmap.c:939
TBMStatus
Definition: tidbitmap.c:122
@ TBM_EMPTY
Definition: tidbitmap.c:123
@ TBM_ONE_PAGE
Definition: tidbitmap.c:124
@ TBM_HASH
Definition: tidbitmap.c:125
TBMSharedIterator * tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
Definition: tidbitmap.c:1466
#define PAGES_PER_CHUNK
Definition: tidbitmap.c:66
TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
Definition: tidbitmap.c:1575
static int tbm_shared_comparator(const void *left, const void *right, void *arg)
Definition: tidbitmap.c:1443
static int tbm_comparator(const void *left, const void *right)
Definition: tidbitmap.c:1429
static void tbm_create_pagetable(TIDBitmap *tbm)
Definition: tidbitmap.c:282
int tbm_extract_page_tuple(TBMIterateResult *iteritem, OffsetNumber *offsets, uint32 max_offsets)
Definition: tidbitmap.c:902
#define WORDS_PER_CHUNK
Definition: tidbitmap.c:76
void tbm_union(TIDBitmap *a, const TIDBitmap *b)
Definition: tidbitmap.c:448
void tbm_end_private_iterate(TBMPrivateIterator *iterator)
Definition: tidbitmap.c:1151
TIDBitmap * tbm_create(Size maxbytes, dsa_area *dsa)
Definition: tidbitmap.c:256
struct PagetableEntry PagetableEntry
TBMPrivateIterator * tbm_begin_private_iterate(TIDBitmap *tbm)
Definition: tidbitmap.c:679
struct PTEntryArray PTEntryArray
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition: tidbitmap.c:367
int tbm_calculate_entries(Size maxbytes)
Definition: tidbitmap.c:1546
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
Definition: tidbitmap.c:579
struct TBMSharedIteratorState TBMSharedIteratorState
static void pagetable_free(pagetable_hash *pagetable, void *pointer)
Definition: tidbitmap.c:1526
static PagetableEntry * tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:1207
struct TBMIterator TBMIterator
#define TBM_MAX_TUPLES_PER_PAGE
Definition: tidbitmap.h:34
static bool tbm_exhausted(TBMIterator *iterator)
Definition: tidbitmap.h:118