@@ -110,6 +110,18 @@ typedef struct TapeBlockTrailer
110110#define TapeBlockSetNBytes (buf , nbytes ) \
111111 (TapeBlockGetTrailer(buf)->next = -(nbytes))
112112
/*
 * Per-tape block number preallocation.
 *
 * Several tapes may be written at the same time (HashAgg does this).  To
 * keep the blocks of one tape from being interleaved too finely with those
 * of the others, block numbers are reserved for a tape in batches.  The
 * first batch holds TAPE_WRITE_PREALLOC_MIN block numbers; every later batch
 * is twice as large as the one before, capped at TAPE_WRITE_PREALLOC_MAX.
 *
 * Reserving a block number touches no file: it is bookkeeping only.  As a
 * consequence blocks may be written out of order, leaving holes that
 * ltsWriteBlock() fills with zeros.
 */
#define TAPE_WRITE_PREALLOC_MIN		8
#define TAPE_WRITE_PREALLOC_MAX		128
113125
114126/*
115127 * This data structure represents a single "logical tape" within the set
@@ -151,6 +163,15 @@ typedef struct LogicalTape
151163 int max_size ; /* highest useful, safe buffer_size */
152164 int pos ; /* next read/write position in buffer */
153165 int nbytes ; /* total # of valid bytes in buffer */
166+
167+ /*
168+ * Preallocated block numbers are held in an array sorted in descending
169+ * order; blocks are consumed from the end of the array (lowest block
170+ * numbers first).
171+ */
172+ long * prealloc ;
173+ int nprealloc ; /* number of elements in list */
174+ int prealloc_size ; /* number of elements list can hold */
154175} LogicalTape ;
155176
156177/*
@@ -198,6 +219,7 @@ struct LogicalTapeSet
198219static void ltsWriteBlock (LogicalTapeSet * lts , long blocknum , void * buffer );
199220static void ltsReadBlock (LogicalTapeSet * lts , long blocknum , void * buffer );
200221static long ltsGetFreeBlock (LogicalTapeSet * lts );
222+ static long ltsGetPreallocBlock (LogicalTapeSet * lts , LogicalTape * lt );
201223static void ltsReleaseBlock (LogicalTapeSet * lts , long blocknum );
202224static void ltsConcatWorkerTapes (LogicalTapeSet * lts , TapeShare * shared ,
203225 SharedFileSet * fileset );
@@ -397,6 +419,45 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
397419 return blocknum ;
398420}
399421
422+ /*
423+ * Return the lowest free block number from the tape's preallocation list.
424+ * Refill the preallocation list if necessary.
425+ */
426+ static long
427+ ltsGetPreallocBlock (LogicalTapeSet * lts , LogicalTape * lt )
428+ {
429+ /* sorted in descending order, so return the last element */
430+ if (lt -> nprealloc > 0 )
431+ return lt -> prealloc [-- lt -> nprealloc ];
432+
433+ if (lt -> prealloc == NULL )
434+ {
435+ lt -> prealloc_size = TAPE_WRITE_PREALLOC_MIN ;
436+ lt -> prealloc = (long * ) palloc (sizeof (long ) * lt -> prealloc_size );
437+ }
438+ else if (lt -> prealloc_size < TAPE_WRITE_PREALLOC_MAX )
439+ {
440+ /* when the preallocation list runs out, double the size */
441+ lt -> prealloc_size *= 2 ;
442+ if (lt -> prealloc_size > TAPE_WRITE_PREALLOC_MAX )
443+ lt -> prealloc_size = TAPE_WRITE_PREALLOC_MAX ;
444+ lt -> prealloc = (long * ) repalloc (lt -> prealloc ,
445+ sizeof (long ) * lt -> prealloc_size );
446+ }
447+
448+ /* refill preallocation list */
449+ lt -> nprealloc = lt -> prealloc_size ;
450+ for (int i = lt -> nprealloc ; i > 0 ; i -- )
451+ {
452+ lt -> prealloc [i - 1 ] = ltsGetFreeBlock (lts );
453+
454+ /* verify descending order */
455+ Assert (i == lt -> nprealloc || lt -> prealloc [i - 1 ] > lt -> prealloc [i ]);
456+ }
457+
458+ return lt -> prealloc [-- lt -> nprealloc ];
459+ }
460+
400461/*
401462 * Return a block# to the freelist.
402463 */
@@ -557,6 +618,9 @@ ltsInitTape(LogicalTape *lt)
557618 lt -> max_size = MaxAllocSize ;
558619 lt -> pos = 0 ;
559620 lt -> nbytes = 0 ;
621+ lt -> prealloc = NULL ;
622+ lt -> nprealloc = 0 ;
623+ lt -> prealloc_size = 0 ;
560624}
561625
562626/*
@@ -709,7 +773,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
709773 Assert (lt -> firstBlockNumber == -1 );
710774 Assert (lt -> pos == 0 );
711775
712- lt -> curBlockNumber = ltsGetFreeBlock (lts );
776+ lt -> curBlockNumber = ltsGetPreallocBlock (lts , lt );
713777 lt -> firstBlockNumber = lt -> curBlockNumber ;
714778
715779 TapeBlockGetTrailer (lt -> buffer )-> prev = -1L ;
@@ -733,7 +797,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
733797 * First allocate the next block, so that we can store it in the
734798 * 'next' pointer of this block.
735799 */
736- nextBlockNumber = ltsGetFreeBlock (lts );
800+ nextBlockNumber = ltsGetPreallocBlock (lts , lt );
737801
738802 /* set the next-pointer and dump the current block. */
739803 TapeBlockGetTrailer (lt -> buffer )-> next = nextBlockNumber ;
@@ -835,13 +899,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
835899 Assert (lt -> frozen );
836900 }
837901
838- /* Allocate a read buffer (unless the tape is empty) */
839902 if (lt -> buffer )
840903 pfree (lt -> buffer );
841904
842905 /* the buffer is lazily allocated, but set the size here */
843906 lt -> buffer = NULL ;
844907 lt -> buffer_size = buffer_size ;
908+
909+ /* free the preallocation list, and return unused block numbers */
910+ if (lt -> prealloc != NULL )
911+ {
912+ for (int i = lt -> nprealloc ; i > 0 ; i -- )
913+ ltsReleaseBlock (lts , lt -> prealloc [i - 1 ]);
914+ pfree (lt -> prealloc );
915+ lt -> prealloc = NULL ;
916+ lt -> nprealloc = 0 ;
917+ lt -> prealloc_size = 0 ;
918+ }
845919}
846920
847921/*
0 commit comments