@@ -1155,8 +1155,12 @@ typedef struct GinBuffer
11551155 int16 typlen ;
11561156 bool typbyval ;
11571157
1158+ /* Number of TIDs to collect before attempt to write some out. */
1159+ int maxitems ;
1160+
11581161 /* array of TID values */
11591162 int nitems ;
1163+ int nfrozen ;
11601164 SortSupport ssup ; /* for sorting/comparing keys */
11611165 ItemPointerData * items ;
11621166} GinBuffer ;
@@ -1229,6 +1233,13 @@ GinBufferInit(Relation index)
12291233 nKeys ;
12301234 TupleDesc desc = RelationGetDescr (index );
12311235
1236+ /*
1237+ * How many items can we fit into the memory limit? We don't want to end
1238+ * with too many TIDs. and 64kB seems more than enough. But maybe this
1239+ * should be tied to maintenance_work_mem or something like that?
1240+ */
1241+ buffer -> maxitems = (64 * 1024L ) / sizeof (ItemPointerData );
1242+
12321243 nKeys = IndexRelationGetNumberOfKeyAttributes (index );
12331244
12341245 buffer -> ssup = palloc0 (sizeof (SortSupportData ) * nKeys );
@@ -1336,6 +1347,48 @@ GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
13361347 return (r == 0 );
13371348}
13381349
1350+ /*
1351+ * GinBufferShouldTrim
1352+ * Should we trim the list of item pointers?
1353+ *
1354+ * By trimming we understand writing out and removing the tuple IDs that
1355+ * we know can't change by future merges. We can deduce the TID up to which
1356+ * this is guaranteed from the "first" TID in each GIN tuple, which provides
1357+ * a "horizon" (for a given key) thanks to the sort.
1358+ *
1359+ * We don't want to do this too often - compressing longer TID lists is more
1360+ * efficient. But we also don't want to accumulate too many TIDs, for two
1361+ * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1362+ * (or whatever limit applies), even if that's unlikely because TIDs are very
1363+ * small so we can fit a lot of them. Second, and more importantly, long TID
1364+ * lists are an issue if the scan wraps around, because a key may get a very
1365+ * wide list (with min/max TID for that key), forcing "full" mergesorts for
1366+ * every list merged into it (instead of the efficient append).
1367+ *
1368+ * So we look at two things when deciding if to trim - if the resulting list
1369+ * (after adding TIDs from the new tuple) would be too long, and if there is
1370+ * enough TIDs to trim (with values less than "first" TID from the new tuple),
1371+ * we do the trim. By enough we mean at least 128 TIDs (mostly an arbitrary
1372+ * number).
1373+ */
1374+ static bool
1375+ GinBufferShouldTrim (GinBuffer * buffer , GinTuple * tup )
1376+ {
1377+ /* not enough TIDs to trim (1024 is somewhat arbitrary number) */
1378+ if (buffer -> nfrozen < 1024 )
1379+ return false;
1380+
1381+ /* no need to trim if we have not hit the memory limit yet */
1382+ if ((buffer -> nitems + tup -> nitems ) < buffer -> maxitems )
1383+ return false;
1384+
1385+ /*
1386+ * OK, we have enough frozen TIDs to flush, and we have hit the memory
1387+ * limit, so it's time to write it out.
1388+ */
1389+ return true;
1390+ }
1391+
13391392/*
13401393 * GinBufferStoreTuple
13411394 * Add data (especially TID list) from a GIN tuple to the buffer.
@@ -1386,21 +1439,76 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
13861439 buffer -> key = (Datum ) 0 ;
13871440 }
13881441
1442+ /*
1443+ * Try freeze TIDs at the beginning of the list, i.e. exclude them from
1444+ * the mergesort. We can do that with TIDs before the first TID in the new
1445+ * tuple we're about to add into the buffer.
1446+ *
1447+ * We do this incrementally when adding data into the in-memory buffer,
1448+ * and not later (e.g. when hitting a memory limit), because it allows us
1449+ * to skip the frozen data during the mergesort, making it cheaper.
1450+ */
1451+
1452+ /*
1453+ * Check if the last TID in the current list is frozen. This is the case
1454+ * when merging non-overlapping lists, e.g. in each parallel worker.
1455+ */
1456+ if ((buffer -> nitems > 0 ) &&
1457+ (ItemPointerCompare (& buffer -> items [buffer -> nitems - 1 ],
1458+ GinTupleGetFirst (tup )) == 0 ))
1459+ buffer -> nfrozen = buffer -> nitems ;
1460+
1461+ /*
1462+ * Now find the last TID we know to be frozen, i.e. the last TID right
1463+ * before the new GIN tuple.
1464+ *
1465+ * Start with the first not-yet-frozen tuple, and walk until we find the
1466+ * first TID that's higher. If we already know the whole list is frozen
1467+ * (i.e. nfrozen == nitems), this does nothing.
1468+ *
1469+ * XXX This might do a binary search for sufficiently long lists, but it
1470+ * does not seem worth the complexity. Overlapping lists should be rare
1471+ * common, TID comparisons are cheap, and we should quickly freeze most of
1472+ * the list.
1473+ */
1474+ for (int i = buffer -> nfrozen ; i < buffer -> nitems ; i ++ )
1475+ {
1476+ /* Is the TID after the first TID of the new tuple? Can't freeze. */
1477+ if (ItemPointerCompare (& buffer -> items [i ],
1478+ GinTupleGetFirst (tup )) > 0 )
1479+ break ;
1480+
1481+ buffer -> nfrozen ++ ;
1482+ }
1483+
13891484 /* add the new TIDs into the buffer, combine using merge-sort */
13901485 {
13911486 int nnew ;
13921487 ItemPointer new ;
13931488
1394- new = ginMergeItemPointers (buffer -> items , buffer -> nitems ,
1489+ /*
1490+ * Resize the array - we do this first, because we'll dereference the
1491+ * first unfrozen TID, which would fail if the array is NULL. We'll
1492+ * still pass 0 as number of elements in that array though.
1493+ */
1494+ if (buffer -> items == NULL )
1495+ buffer -> items = palloc ((buffer -> nitems + tup -> nitems ) * sizeof (ItemPointerData ));
1496+ else
1497+ buffer -> items = repalloc (buffer -> items ,
1498+ (buffer -> nitems + tup -> nitems ) * sizeof (ItemPointerData ));
1499+
1500+ new = ginMergeItemPointers (& buffer -> items [buffer -> nfrozen ], /* first unfronzen */
1501+ (buffer -> nitems - buffer -> nfrozen ), /* num of unfrozen */
13951502 items , tup -> nitems , & nnew );
13961503
1397- Assert (nnew == buffer -> nitems + tup -> nitems );
1504+ Assert (nnew == ( tup -> nitems + ( buffer -> nitems - buffer -> nfrozen )) );
13981505
1399- if ( buffer -> items )
1400- pfree ( buffer -> items );
1506+ memcpy ( & buffer -> items [ buffer -> nfrozen ], new ,
1507+ nnew * sizeof ( ItemPointerData ) );
14011508
1402- buffer -> items = new ;
1403- buffer -> nitems = nnew ;
1509+ pfree (new );
1510+
1511+ buffer -> nitems += tup -> nitems ;
14041512
14051513 AssertCheckItemPointers (buffer );
14061514 }
@@ -1432,11 +1540,29 @@ GinBufferReset(GinBuffer *buffer)
14321540 buffer -> category = 0 ;
14331541 buffer -> keylen = 0 ;
14341542 buffer -> nitems = 0 ;
1543+ buffer -> nfrozen = 0 ;
14351544
14361545 buffer -> typlen = 0 ;
14371546 buffer -> typbyval = 0 ;
14381547}
14391548
1549+ /*
1550+ * GinBufferTrim
1551+ * Discard the "frozen" part of the TID list (which should have been
1552+ * written to disk/index before this call).
1553+ */
1554+ static void
1555+ GinBufferTrim (GinBuffer * buffer )
1556+ {
1557+ Assert ((buffer -> nfrozen > 0 ) && (buffer -> nfrozen <= buffer -> nitems ));
1558+
1559+ memmove (& buffer -> items [0 ], & buffer -> items [buffer -> nfrozen ],
1560+ sizeof (ItemPointerData ) * (buffer -> nitems - buffer -> nfrozen ));
1561+
1562+ buffer -> nitems -= buffer -> nfrozen ;
1563+ buffer -> nfrozen = 0 ;
1564+ }
1565+
14401566/*
14411567 * GinBufferFree
14421568 * Release memory associated with the GinBuffer (including TID array).
@@ -1504,7 +1630,12 @@ _gin_parallel_merge(GinBuildState *state)
15041630 /* do the actual sort in the leader */
15051631 tuplesort_performsort (state -> bs_sortstate );
15061632
1507- /* initialize buffer to combine entries for the same key */
1633+ /*
1634+ * Initialize buffer to combine entries for the same key.
1635+ *
1636+ * The leader is allowed to use the whole maintenance_work_mem buffer to
1637+ * combine data. The parallel workers already completed.
1638+ */
15081639 buffer = GinBufferInit (state -> ginstate .index );
15091640
15101641 /*
@@ -1562,6 +1693,32 @@ _gin_parallel_merge(GinBuildState *state)
15621693 GinBufferReset (buffer );
15631694 }
15641695
1696+ /*
1697+ * We're about to add a GIN tuple to the buffer - check the memory
1698+ * limit first, and maybe write out some of the data into the index
1699+ * first, if needed (and possible). We only flush the part of the TID
1700+ * list that we know won't change, and only if there's enough data for
1701+ * compression to work well.
1702+ */
1703+ if (GinBufferShouldTrim (buffer , tup ))
1704+ {
1705+ Assert (buffer -> nfrozen > 0 );
1706+
1707+ /*
1708+ * Buffer is not empty and it's storing a different key - flush
1709+ * the data into the insert, and start a new entry for current
1710+ * GinTuple.
1711+ */
1712+ AssertCheckItemPointers (buffer );
1713+
1714+ ginEntryInsert (& state -> ginstate ,
1715+ buffer -> attnum , buffer -> key , buffer -> category ,
1716+ buffer -> items , buffer -> nfrozen , & state -> buildStats );
1717+
1718+ /* truncate the data we've just discarded */
1719+ GinBufferTrim (buffer );
1720+ }
1721+
15651722 /*
15661723 * Remember data for the current tuple (either remember the new key,
15671724 * or append if to the existing data).
@@ -1655,7 +1812,13 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
16551812
16561813 GinBuffer * buffer ;
16571814
1658- /* initialize buffer to combine entries for the same key */
1815+ /*
1816+ * Initialize buffer to combine entries for the same key.
1817+ *
1818+ * The workers are limited to the same amount of memory as during the sort
1819+ * in ginBuildCallbackParallel. But this probably should be the 32MB used
1820+ * during planning, just like there.
1821+ */
16591822 buffer = GinBufferInit (state -> ginstate .index );
16601823
16611824 /* sort the raw per-worker data */
@@ -1711,6 +1874,39 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
17111874 GinBufferReset (buffer );
17121875 }
17131876
1877+ /*
1878+ * We're about to add a GIN tuple to the buffer - check the memory
1879+ * limit first, and maybe write out some of the data into the index
1880+ * first, if needed (and possible). We only flush the part of the TID
1881+ * list that we know won't change, and only if there's enough data for
1882+ * compression to work well.
1883+ */
1884+ if (GinBufferShouldTrim (buffer , tup ))
1885+ {
1886+ GinTuple * ntup ;
1887+ Size ntuplen ;
1888+
1889+ Assert (buffer -> nfrozen > 0 );
1890+
1891+ /*
1892+ * Buffer is not empty and it's storing a different key - flush
1893+ * the data into the insert, and start a new entry for current
1894+ * GinTuple.
1895+ */
1896+ AssertCheckItemPointers (buffer );
1897+
1898+ ntup = _gin_build_tuple (buffer -> attnum , buffer -> category ,
1899+ buffer -> key , buffer -> typlen , buffer -> typbyval ,
1900+ buffer -> items , buffer -> nfrozen , & ntuplen );
1901+
1902+ tuplesort_putgintuple (state -> bs_sortstate , ntup , ntuplen );
1903+
1904+ pfree (ntup );
1905+
1906+ /* truncate the data we've just discarded */
1907+ GinBufferTrim (buffer );
1908+ }
1909+
17141910 /*
17151911 * Remember data for the current tuple (either remember the new key,
17161912 * or append if to the existing data).
0 commit comments