5252 * not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
5353 * policy for free blocks would be better?)
5454 *
55+ * To further make the I/Os more sequential, we can use a larger buffer
56+ * when reading, and read multiple blocks from the same tape in one go,
57+ * whenever the buffer becomes empty. LogicalTapeAssignReadBufferSize()
58+ * can be used to set the size of the read buffer.
59+ *
5560 * To support the above policy of writing to the lowest free block,
5661 * ltsGetFreeBlock sorts the list of free block numbers into decreasing
5762 * order each time it is asked for a block and the list isn't currently
5863 * sorted. This is an efficient way to handle it because we expect cycles
5964 * of releasing many blocks followed by re-using many blocks, due to
60- * tuplesort.c's "preread" behavior .
65+ * the larger read buffer .
6166 *
6267 * Since all the bookkeeping and buffer memory is allocated with palloc(),
6368 * and the underlying file(s) are made with OpenTemporaryFile, all resources
7984
8085#include "storage/buffile.h"
8186#include "utils/logtape.h"
87+ #include "utils/memutils.h"
8288
8389/*
8490 * Block indexes are "long"s, so we can fit this many per indirect block.
@@ -131,9 +137,18 @@ typedef struct LogicalTape
131137 * reading.
132138 */
133139 char * buffer ; /* physical buffer (separately palloc'd) */
140+ int buffer_size ; /* allocated size of the buffer */
134141 long curBlockNumber ; /* this block's logical blk# within tape */
135142 int pos ; /* next read/write position in buffer */
136143 int nbytes ; /* total # of valid bytes in buffer */
144+
145+ /*
146+ * Desired buffer size to use when reading. To keep things simple, we use
147+ * a single-block buffer when writing, or when reading a frozen tape. But
148+ * when we are reading and will only read forwards, we allocate a larger
149+ * buffer, determined by read_buffer_size.
150+ */
151+ int read_buffer_size ;
137152} LogicalTape ;
138153
139154/*
@@ -227,6 +242,53 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
227242 blocknum )));
228243}
229244
245+ /*
246+ * Read as many blocks as we can into the per-tape buffer.
247+ *
248+ * The caller can specify the next physical block number to read, in
249+ * datablocknum, or -1 to fetch the next block number from the internal block.
250+ * If datablocknum == -1, the caller must've already set curBlockNumber.
251+ *
252+ * Returns true if anything was read, 'false' on EOF.
253+ */
254+ static bool
255+ ltsReadFillBuffer (LogicalTapeSet * lts , LogicalTape * lt , long datablocknum )
256+ {
257+ lt -> pos = 0 ;
258+ lt -> nbytes = 0 ;
259+
260+ do
261+ {
262+ /* Fetch next block number (unless provided by caller) */
263+ if (datablocknum == -1 )
264+ {
265+ datablocknum = ltsRecallNextBlockNum (lts , lt -> indirect , lt -> frozen );
266+ if (datablocknum == -1L )
267+ break ; /* EOF */
268+ lt -> curBlockNumber ++ ;
269+ }
270+
271+ /* Read the block */
272+ ltsReadBlock (lts , datablocknum , (void * ) (lt -> buffer + lt -> nbytes ));
273+ if (!lt -> frozen )
274+ ltsReleaseBlock (lts , datablocknum );
275+
276+ if (lt -> curBlockNumber < lt -> numFullBlocks )
277+ lt -> nbytes += BLCKSZ ;
278+ else
279+ {
280+ /* EOF */
281+ lt -> nbytes += lt -> lastBlockBytes ;
282+ break ;
283+ }
284+
285+ /* Advance to next block, if we have buffer space left */
286+ datablocknum = -1 ;
287+ } while (lt -> nbytes < lt -> buffer_size );
288+
289+ return (lt -> nbytes > 0 );
290+ }
291+
230292/*
231293 * qsort comparator for sorting freeBlocks[] into decreasing order.
232294 */
@@ -546,6 +608,8 @@ LogicalTapeSetCreate(int ntapes)
546608 lt -> numFullBlocks = 0L ;
547609 lt -> lastBlockBytes = 0 ;
548610 lt -> buffer = NULL ;
611+ lt -> buffer_size = 0 ;
612+ lt -> read_buffer_size = BLCKSZ ;
549613 lt -> curBlockNumber = 0L ;
550614 lt -> pos = 0 ;
551615 lt -> nbytes = 0 ;
@@ -628,14 +692,18 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
628692
629693 /* Allocate data buffer and first indirect block on first write */
630694 if (lt -> buffer == NULL )
695+ {
631696 lt -> buffer = (char * ) palloc (BLCKSZ );
697+ lt -> buffer_size = BLCKSZ ;
698+ }
632699 if (lt -> indirect == NULL )
633700 {
634701 lt -> indirect = (IndirectBlock * ) palloc (sizeof (IndirectBlock ));
635702 lt -> indirect -> nextSlot = 0 ;
636703 lt -> indirect -> nextup = NULL ;
637704 }
638705
706+ Assert (lt -> buffer_size == BLCKSZ );
639707 while (size > 0 )
640708 {
641709 if (lt -> pos >= BLCKSZ )
@@ -709,18 +777,19 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
709777 Assert (lt -> frozen );
710778 datablocknum = ltsRewindFrozenIndirectBlock (lts , lt -> indirect );
711779 }
780+
781+ /* Allocate a read buffer */
782+ if (lt -> buffer )
783+ pfree (lt -> buffer );
784+ lt -> buffer = palloc (lt -> read_buffer_size );
785+ lt -> buffer_size = lt -> read_buffer_size ;
786+
712787 /* Read the first block, or reset if tape is empty */
713788 lt -> curBlockNumber = 0L ;
714789 lt -> pos = 0 ;
715790 lt -> nbytes = 0 ;
716791 if (datablocknum != -1L )
717- {
718- ltsReadBlock (lts , datablocknum , (void * ) lt -> buffer );
719- if (!lt -> frozen )
720- ltsReleaseBlock (lts , datablocknum );
721- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
722- BLCKSZ : lt -> lastBlockBytes ;
723- }
792+ ltsReadFillBuffer (lts , lt , datablocknum );
724793 }
725794 else
726795 {
@@ -754,6 +823,13 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
754823 lt -> curBlockNumber = 0L ;
755824 lt -> pos = 0 ;
756825 lt -> nbytes = 0 ;
826+
827+ if (lt -> buffer )
828+ {
829+ pfree (lt -> buffer );
830+ lt -> buffer = NULL ;
831+ lt -> buffer_size = 0 ;
832+ }
757833 }
758834}
759835
@@ -779,20 +855,8 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
779855 if (lt -> pos >= lt -> nbytes )
780856 {
781857 /* Try to load more data into buffer. */
782- long datablocknum = ltsRecallNextBlockNum (lts , lt -> indirect ,
783- lt -> frozen );
784-
785- if (datablocknum == -1L )
858+ if (!ltsReadFillBuffer (lts , lt , -1 ))
786859 break ; /* EOF */
787- lt -> curBlockNumber ++ ;
788- lt -> pos = 0 ;
789- ltsReadBlock (lts , datablocknum , (void * ) lt -> buffer );
790- if (!lt -> frozen )
791- ltsReleaseBlock (lts , datablocknum );
792- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
793- BLCKSZ : lt -> lastBlockBytes ;
794- if (lt -> nbytes <= 0 )
795- break ; /* EOF (possible here?) */
796860 }
797861
798862 nthistime = lt -> nbytes - lt -> pos ;
@@ -842,6 +906,22 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
842906 lt -> writing = false;
843907 lt -> frozen = true;
844908 datablocknum = ltsRewindIndirectBlock (lts , lt -> indirect , true);
909+
910+ /*
911+ * The seek and backspace functions assume a single block read buffer.
912+ * That's OK with current usage. A larger buffer is helpful to make the
913+ * read pattern of the backing file look more sequential to the OS, when
914+ * we're reading from multiple tapes. But at the end of a sort, when a
915+ * tape is frozen, we only read from a single tape anyway.
916+ */
917+ if (!lt -> buffer || lt -> buffer_size != BLCKSZ )
918+ {
919+ if (lt -> buffer )
920+ pfree (lt -> buffer );
921+ lt -> buffer = palloc (BLCKSZ );
922+ lt -> buffer_size = BLCKSZ ;
923+ }
924+
845925 /* Read the first block, or reset if tape is empty */
846926 lt -> curBlockNumber = 0L ;
847927 lt -> pos = 0 ;
@@ -875,6 +955,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
875955 Assert (tapenum >= 0 && tapenum < lts -> nTapes );
876956 lt = & lts -> tapes [tapenum ];
877957 Assert (lt -> frozen );
958+ Assert (lt -> buffer_size == BLCKSZ );
878959
879960 /*
880961 * Easy case for seek within current block.
@@ -941,6 +1022,7 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
9411022 lt = & lts -> tapes [tapenum ];
9421023 Assert (lt -> frozen );
9431024 Assert (offset >= 0 && offset <= BLCKSZ );
1025+ Assert (lt -> buffer_size == BLCKSZ );
9441026
9451027 /*
9461028 * Easy case for seek within current block.
@@ -1002,6 +1084,10 @@ LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
10021084
10031085 Assert (tapenum >= 0 && tapenum < lts -> nTapes );
10041086 lt = & lts -> tapes [tapenum ];
1087+
1088+ /* With a larger buffer, 'pos' wouldn't be the same as offset within page */
1089+ Assert (lt -> buffer_size == BLCKSZ );
1090+
10051091 * blocknum = lt -> curBlockNumber ;
10061092 * offset = lt -> pos ;
10071093}
@@ -1014,3 +1100,28 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
10141100{
10151101 return lts -> nFileBlocks ;
10161102}
1103+
1104+ /*
1105+ * Set buffer size to use, when reading from given tape.
1106+ */
1107+ void
1108+ LogicalTapeAssignReadBufferSize (LogicalTapeSet * lts , int tapenum , size_t avail_mem )
1109+ {
1110+ LogicalTape * lt ;
1111+
1112+ Assert (tapenum >= 0 && tapenum < lts -> nTapes );
1113+ lt = & lts -> tapes [tapenum ];
1114+
1115+ /*
1116+ * The buffer size must be a multiple of BLCKSZ in size, so round the
1117+ * given value down to nearest BLCKSZ. Make sure we have at least one
1118+ * page. Also, don't go above MaxAllocSize, to avoid erroring out. A
1119+ * multi-gigabyte buffer is unlikely to be helpful, anyway.
1120+ */
1121+ if (avail_mem < BLCKSZ )
1122+ avail_mem = BLCKSZ ;
1123+ if (avail_mem > MaxAllocSize )
1124+ avail_mem = MaxAllocSize ;
1125+ avail_mem -= avail_mem % BLCKSZ ;
1126+ lt -> read_buffer_size = avail_mem ;
1127+ }
0 commit comments