1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
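/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: a typical
 * caller-side pattern built from the entry points listed above. "rel" and
 * "blkno" stand for a Relation and BlockNumber supplied by the caller; WAL
 * logging and critical sections are omitted for brevity.
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		... modify the page returned by BufferGetPage(buf) ...
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);		(drops both the lock and the pin)
 */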
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
47#include "catalog/storage_xlog.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
55#include "storage/buf_internals.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/read_stream.h"
62#include "storage/smgr.h"
63#include "storage/standby.h"
64#include "utils/memdebug.h"
65#include "utils/ps_status.h"
66#include "utils/rel.h"
67#include "utils/resowner.h"
68#include "utils/timestamp.h"
69
70
71/* Note: these two macros only work on shared buffers, not local ones! */
72#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74
75/* Note: this macro only works on local buffers, not shared ones! */
76#define LocalBufHdrGetBlock(bufHdr) \
77 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78
79/* Bits in SyncOneBuffer's return value */
80#define BUF_WRITTEN 0x01
81#define BUF_REUSABLE 0x02
82
83#define RELS_BSEARCH_THRESHOLD 20
84
85/*
86 * This is the size (in the number of blocks) above which we scan the
87 * entire buffer pool to remove the buffers for all the pages of relation
88 * being dropped. For the relations with size below this threshold, we find
89 * the buffers by doing lookups in BufMapping table.
90 */
91#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
92
93typedef struct PrivateRefCountEntry
94{
95 Buffer buffer;
96 int32 refcount;
97} PrivateRefCountEntry;
98
99/* 64 bytes, about the size of a cache line on common systems */
100#define REFCOUNT_ARRAY_ENTRIES 8
101
102/*
103 * Status of buffers to checkpoint for a particular tablespace, used
104 * internally in BufferSync.
105 */
106typedef struct CkptTsStatus
107{
108 /* oid of the tablespace */
109 Oid tsId;
110
111 /*
112 * Checkpoint progress for this tablespace. To make progress comparable
113 * between tablespaces the progress is, for each tablespace, measured as a
114 * number between 0 and the total number of to-be-checkpointed pages. Each
115 * page checkpointed in this tablespace increments this space's progress
116 * by progress_slice.
117 */
118 float8 progress;
119 float8 progress_slice;
120
121 /* number of to-be-checkpointed pages in this tablespace */
122 int num_to_scan;
123 /* already processed pages in this tablespace */
124 int num_scanned;
125
126 /* current offset in CkptBufferIds for this tablespace */
127 int index;
128} CkptTsStatus;
129
130/*
131 * Type for array used to sort SMgrRelations
132 *
133 * FlushRelationsAllBuffers shares the same comparator function with
134 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
135 * compatible.
136 */
137typedef struct SMgrSortArray
138{
139 RelFileLocator rlocator; /* This must be the first member */
140 SMgrRelation srel;
141} SMgrSortArray;
142
143/* GUC variables */
144bool zero_damaged_pages = false;
145int bgwriter_lru_maxpages = 100;
146double bgwriter_lru_multiplier = 2.0;
147bool track_io_timing = false;
148
149/*
150 * How many buffers PrefetchBuffer callers should try to stay ahead of their
151 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
152 * for buffers not belonging to tablespaces that have their
153 * effective_io_concurrency parameter set.
154 */
155int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
156
157/*
158 * Like effective_io_concurrency, but used by maintenance code paths that might
159 * benefit from a higher setting because they work on behalf of many sessions.
160 * Overridden by the tablespace setting of the same name.
161 */
162int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
163
164/*
165 * Limit on how many blocks should be handled in single I/O operations.
166 * StartReadBuffers() callers should respect it, as should other operations
167 * that call smgr APIs directly. It is computed as the minimum of underlying
168 * GUCs io_combine_limit_guc and io_max_combine_limit.
169 */
173
174/*
175 * GUC variables about triggering kernel writeback for buffers written; OS
176 * dependent defaults are set via the GUC mechanism.
177 */
181
182/* local state for LockBufferForCleanup */
183static BufferDesc *PinCountWaitBuf = NULL;
184
185/*
186 * Backend-Private refcount management:
187 *
188 * Each buffer also has a private refcount that keeps track of the number of
189 * times the buffer is pinned in the current process. This is so that the
190 * shared refcount needs to be modified only once if a buffer is pinned more
191 * than once by an individual backend. It's also used to check that no buffers
192 * are still pinned at the end of transactions and when exiting.
193 *
194 *
195 * To avoid - as we used to - requiring an array with NBuffers entries to keep
196 * track of local buffers, we use a small sequentially searched array
197 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
198 * keep track of backend local pins.
199 *
200 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
201 * refcounts are kept track of in the array; after that, new array entries
202 * displace old ones into the hash table. That way a frequently used entry
203 * can't get "stuck" in the hashtable while infrequent ones clog the array.
204 *
205 * Note that in most scenarios the number of pinned buffers will not exceed
206 * REFCOUNT_ARRAY_ENTRIES.
207 *
208 *
209 * To enter a buffer into the refcount tracking mechanism first reserve a free
210 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
211 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
212 * memory allocations in NewPrivateRefCountEntry() which can be important
213 * because in some scenarios it's called with a spinlock held...
214 */
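/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: the
 * reserve-then-fill protocol described above, loosely modeled on
 * PinBuffer_Locked(). "buf" stands for a BufferDesc the caller is about to
 * pin while holding its header spinlock.
 *
 *		ReservePrivateRefCountEntry();		(may allocate, so done first)
 *		buf_state = LockBufHdr(buf);		(no allocation allowed from here)
 *		... decide to pin, bump the shared refcount in buf_state ...
 *		UnlockBufHdr(buf, buf_state);
 *		ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
 *		ref->refcount++;
 */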
220
222
223static void ReservePrivateRefCountEntry(void);
228
229/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
230static void ResOwnerReleaseBufferIO(Datum res);
231static char *ResOwnerPrintBufferIO(Datum res);
232static void ResOwnerReleaseBufferPin(Datum res);
233static char *ResOwnerPrintBufferPin(Datum res);
234
236{
237 .name = "buffer io",
238 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239 .release_priority = RELEASE_PRIO_BUFFER_IOS,
240 .ReleaseResource = ResOwnerReleaseBufferIO,
241 .DebugPrint = ResOwnerPrintBufferIO
242};
243
245{
246 .name = "buffer pin",
247 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
248 .release_priority = RELEASE_PRIO_BUFFER_PINS,
249 .ReleaseResource = ResOwnerReleaseBufferPin,
250 .DebugPrint = ResOwnerPrintBufferPin
251};
252
253/*
254 * Ensure that the PrivateRefCountArray has sufficient space to store one more
255 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
256 * a new entry - but it's perfectly fine to not use a reserved entry.
257 */
258static void
259ReservePrivateRefCountEntry(void)
260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
266 * First search for a free entry in the array; that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
281 return;
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
301
302 /* Better be used, otherwise we shouldn't get here. */
304
305 /* enter victim array entry into hashtable */
309 &found);
310 Assert(!found);
312
313 /* clear the now free array slot */
316
318 }
319}
320
321/*
322 * Fill a previously reserved refcount entry.
323 */
324static PrivateRefCountEntry *
325NewPrivateRefCountEntry(Buffer buffer)
326{
328
329 /* only allowed to be called when a reservation has been made */
331
332 /* use up the reserved entry */
335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}
342
343/*
344 * Return the PrivateRefCount entry for the passed buffer.
345 *
346 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
347 * do_move is true and the entry resides in the hashtable, the entry is
348 * moved into the array to optimize it for frequent access.
349 */
350static PrivateRefCountEntry *
351GetPrivateRefCountEntry(Buffer buffer, bool do_move)
352{
354 int i;
355
358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
379 return NULL;
380
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
395
396 /* Ensure there's a free array slot */
398
399 /* Use up the reserved slot */
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
411 Assert(found);
414
415 return free;
416 }
417}
418
419/*
420 * Returns how many times the passed buffer is pinned by this backend.
421 *
422 * Only works for shared memory buffers!
423 */
424static inline int32
425GetPrivateRefCount(Buffer buffer)
426{
428
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
442
443/*
444 * Release resources used to track the reference count of a buffer which we no
445 * longer have pinned and don't want to pin again immediately.
446 */
447static void
448ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
470 Assert(found);
473 }
474}
475
476/*
477 * BufferIsPinned
478 * True iff the buffer is pinned (also checks for valid buffer number).
479 *
480 * NOTE: what we check here is that *this* backend holds a pin on
481 * the buffer. We do not care whether some other backend does.
482 */
483#define BufferIsPinned(bufnum) \
484( \
485 !BufferIsValid(bufnum) ? \
486 false \
487 : \
488 BufferIsLocal(bufnum) ? \
489 (LocalRefCount[-(bufnum) - 1] > 0) \
490 : \
491 (GetPrivateRefCount(bufnum) > 0) \
492)
493
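/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: BufferIsPinned()
 * is meant for assertions guarding access to a page this backend believes it
 * has already pinned, e.g.
 *
 *		Assert(BufferIsPinned(buffer));
 *		page = BufferGetPage(buffer);
 */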
494
496 SMgrRelation smgr, char smgr_persistence,
497 ForkNumber forkNum, BlockNumber blockNum,
500 ForkNumber fork,
501 BufferAccessStrategy strategy,
502 uint32 flags,
503 uint32 extend_by,
504 BlockNumber extend_upto,
505 Buffer *buffers,
506 uint32 *extended_by);
508 ForkNumber fork,
509 BufferAccessStrategy strategy,
510 uint32 flags,
511 uint32 extend_by,
512 BlockNumber extend_upto,
513 Buffer *buffers,
514 uint32 *extended_by);
515static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
516 bool skip_if_not_valid);
517static void PinBuffer_Locked(BufferDesc *buf);
518static void UnpinBuffer(BufferDesc *buf);
519static void UnpinBufferNoOwner(BufferDesc *buf);
520static void BufferSync(int flags);
521static int SyncOneBuffer(int buf_id, bool skip_recently_used,
522 WritebackContext *wb_context);
523static void WaitIO(BufferDesc *buf);
524static void AbortBufferIO(Buffer buffer);
525static void shared_buffer_write_error_callback(void *arg);
526static void local_buffer_write_error_callback(void *arg);
527static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
528 char relpersistence,
529 ForkNumber forkNum,
530 BlockNumber blockNum,
531 BufferAccessStrategy strategy,
532 bool *foundPtr, IOContext io_context);
533static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
534static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
535static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
537 IOObject io_object, IOContext io_context);
538static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
539 IOObject io_object, IOContext io_context);
540static void FindAndDropRelationBuffers(RelFileLocator rlocator,
541 ForkNumber forkNum,
542 BlockNumber nForkBlock,
543 BlockNumber firstDelBlock);
545 RelFileLocator dstlocator,
546 ForkNumber forkNum, bool permanent);
547static void AtProcExit_Buffers(int code, Datum arg);
548static void CheckForBufferLeaks(void);
549#ifdef USE_ASSERT_CHECKING
550static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
551 void *unused_context);
552#endif
553static int rlocator_comparator(const void *p1, const void *p2);
554static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
555static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
556static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
557
558
559/*
560 * Implementation of PrefetchBuffer() for shared buffers.
561 */
562PrefetchBufferResult
563PrefetchSharedBuffer(SMgrRelation smgr_reln,
564 ForkNumber forkNum,
565 BlockNumber blockNum)
566{
567 PrefetchBufferResult result = {InvalidBuffer, false};
568 BufferTag newTag; /* identity of requested block */
569 uint32 newHash; /* hash value for newTag */
570 LWLock *newPartitionLock; /* buffer partition lock for it */
571 int buf_id;
572
573 Assert(BlockNumberIsValid(blockNum));
574
575 /* create a tag so we can lookup the buffer */
576 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
577 forkNum, blockNum);
578
579 /* determine its hash code and partition lock ID */
580 newHash = BufTableHashCode(&newTag);
581 newPartitionLock = BufMappingPartitionLock(newHash);
582
583 /* see if the block is in the buffer pool already */
584 LWLockAcquire(newPartitionLock, LW_SHARED);
585 buf_id = BufTableLookup(&newTag, newHash);
586 LWLockRelease(newPartitionLock);
587
588 /* If not in buffers, initiate prefetch */
589 if (buf_id < 0)
590 {
591#ifdef USE_PREFETCH
592 /*
593 * Try to initiate an asynchronous read. This returns false in
594 * recovery if the relation file doesn't exist.
595 */
596 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
597 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
598 {
599 result.initiated_io = true;
600 }
601#endif /* USE_PREFETCH */
602 }
603 else
604 {
605 /*
606 * Report the buffer it was in at that time. The caller may be able
607 * to avoid a buffer table lookup, but it's not pinned and it must be
608 * rechecked!
609 */
610 result.recent_buffer = buf_id + 1;
611 }
612
613 /*
614 * If the block *is* in buffers, we do nothing. This is not really ideal:
615 * the block might be just about to be evicted, which would be stupid
616 * since we know we are going to need it soon. But the only easy answer
617 * is to bump the usage_count, which does not seem like a great solution:
618 * when the caller does ultimately touch the block, usage_count would get
619 * bumped again, resulting in too much favoritism for blocks that are
620 * involved in a prefetch sequence. A real fix would involve some
621 * additional per-buffer state, and it's not clear that there's enough of
622 * a problem to justify that.
623 */
624
625 return result;
626}
627
628/*
629 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
630 *
631 * This is named by analogy to ReadBuffer but doesn't actually allocate a
632 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
633 * block will not be delayed by the I/O. Prefetching is optional.
634 *
635 * There are three possible outcomes:
636 *
637 * 1. If the block is already cached, the result includes a valid buffer that
638 * could be used by the caller to avoid the need for a later buffer lookup, but
639 * it's not pinned, so the caller must recheck it.
640 *
641 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
642 * true. Currently there is no way to know if the data was already cached by
643 * the kernel and therefore didn't really initiate I/O, and no way to know when
644 * the I/O completes other than using synchronous ReadBuffer().
645 *
646 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
647 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
648 * lack of a kernel facility), direct I/O is enabled, or the underlying
649 * relation file wasn't found and we are in recovery. (If the relation file
650 * wasn't found and we are not in recovery, an error is raised).
651 */
652PrefetchBufferResult
653PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
654{
655 Assert(RelationIsValid(reln));
656 Assert(BlockNumberIsValid(blockNum));
657
658 if (RelationUsesLocalBuffers(reln))
659 {
660 /* see comments in ReadBufferExtended */
661 if (RELATION_IS_OTHER_TEMP(reln))
663 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
664 errmsg("cannot access temporary tables of other sessions")));
665
666 /* pass it off to localbuf.c */
667 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
668 }
669 else
670 {
671 /* pass it to the shared buffer version */
672 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
673 }
674}
675
676/*
677 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
678 *
679 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
680 * successful. Return true if the buffer is valid and still has the expected
681 * tag. In that case, the buffer is pinned and the usage count is bumped.
682 */
683bool
684ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
685 Buffer recent_buffer)
686{
687 BufferDesc *bufHdr;
688 BufferTag tag;
689 uint32 buf_state;
690
691 Assert(BufferIsValid(recent_buffer));
692
695 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
696
697 if (BufferIsLocal(recent_buffer))
698 {
699 int b = -recent_buffer - 1;
700
701 bufHdr = GetLocalBufferDescriptor(b);
702 buf_state = pg_atomic_read_u32(&bufHdr->state);
703
704 /* Is it still valid and holding the right tag? */
705 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
706 {
707 PinLocalBuffer(bufHdr, true);
708
710
711 return true;
712 }
713 }
714 else
715 {
716 bufHdr = GetBufferDescriptor(recent_buffer - 1);
717
718 /*
719 * Is it still valid and holding the right tag? We do an unlocked tag
720 * comparison first, to make it unlikely that we'll increment the
721 * usage counter of the wrong buffer, if someone calls us with a very
722 * out of date recent_buffer. Then we'll check it again if we get the
723 * pin.
724 */
725 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
726 PinBuffer(bufHdr, NULL, true))
727 {
728 if (BufferTagsEqual(&tag, &bufHdr->tag))
729 {
731 return true;
732 }
733 UnpinBuffer(bufHdr);
734 }
735 }
736
737 return false;
738}
739
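/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: combining
 * PrefetchBuffer() and ReadRecentBuffer(). "rel", "fork" and "blkno" are
 * assumed to be supplied by the caller; the recent_buffer hint may be stale,
 * so the regular read path is kept as a fallback.
 *
 *		PrefetchBufferResult pf = PrefetchBuffer(rel, fork, blkno);
 *		Buffer		buf;
 *
 *		... do other work while the kernel (maybe) reads the block ...
 *
 *		if (BufferIsValid(pf.recent_buffer) &&
 *			ReadRecentBuffer(rel->rd_locator, fork, blkno, pf.recent_buffer))
 *			buf = pf.recent_buffer;		(pinned, usage count bumped)
 *		else
 *			buf = ReadBufferExtended(rel, fork, blkno, RBM_NORMAL, NULL);
 */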
740/*
741 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
742 * fork with RBM_NORMAL mode and default strategy.
743 */
744Buffer
745ReadBuffer(Relation reln, BlockNumber blockNum)
746{
747 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
748}
749
750/*
751 * ReadBufferExtended -- returns a buffer containing the requested
752 * block of the requested relation. If the blknum
753 * requested is P_NEW, extend the relation file and
754 * allocate a new block. (Caller is responsible for
755 * ensuring that only one backend tries to extend a
756 * relation at the same time!)
757 *
758 * Returns: the buffer number for the buffer containing
759 * the block read. The returned buffer has been pinned.
760 * Does not return on error --- elog's instead.
761 *
762 * Assume when this function is called, that reln has been opened already.
763 *
764 * In RBM_NORMAL mode, the page is read from disk, and the page header is
765 * validated. An error is thrown if the page header is not valid. (But
766 * note that an all-zero page is considered "valid"; see
767 * PageIsVerified().)
768 *
769 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
770 * valid, the page is zeroed instead of throwing an error. This is intended
771 * for non-critical data, where the caller is prepared to repair errors.
772 *
773 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
774 * filled with zeros instead of reading it from disk. Useful when the caller
775 * is going to fill the page from scratch, since this saves I/O and avoids
776 * unnecessary failure if the page-on-disk has corrupt page headers.
777 * The page is returned locked to ensure that the caller has a chance to
778 * initialize the page before it's made visible to others.
779 * Caution: do not use this mode to read a page that is beyond the relation's
780 * current physical EOF; that is likely to cause problems in md.c when
781 * the page is modified and written out. P_NEW is OK, though.
782 *
783 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
784 * a cleanup-strength lock on the page.
785 *
786 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
787 *
788 * If strategy is not NULL, a nondefault buffer access strategy is used.
789 * See buffer/README for details.
790 */
791inline Buffer
792ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
793 ReadBufferMode mode, BufferAccessStrategy strategy)
794{
795 Buffer buf;
796
797 /*
798 * Reject attempts to read non-local temporary relations; we would be
799 * likely to get wrong data since we have no visibility into the owning
800 * session's local buffers.
801 */
802 if (RELATION_IS_OTHER_TEMP(reln))
804 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
805 errmsg("cannot access temporary tables of other sessions")));
806
807 /*
808 * Read the buffer, and update pgstat counters to reflect a cache hit or
809 * miss.
810 */
811 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
812 forkNum, blockNum, mode, strategy);
813
814 return buf;
815}
816
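/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: reading
 * non-critical data with RBM_ZERO_ON_ERROR, as described above. A page with a
 * corrupt header comes back zeroed rather than raising an error, so the
 * caller must be prepared to see an all-zero page. "rel" and "blkno" are
 * assumed.
 *
 *		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *											 RBM_ZERO_ON_ERROR, NULL);
 *		Page		page = BufferGetPage(buf);
 *
 *		if (PageIsNew(page))
 *			... repair, rebuild or skip the page ...
 */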
817
818/*
819 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
820 * a relcache entry for the relation.
821 *
822 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
823 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
824 * cannot be used for temporary relations (and making that work might be
825 * difficult, unless we only want to read temporary relations for our own
826 * ProcNumber).
827 */
828Buffer
829ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
830 BlockNumber blockNum, ReadBufferMode mode,
831 BufferAccessStrategy strategy, bool permanent)
832{
833 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
834
835 return ReadBuffer_common(NULL, smgr,
836 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
837 forkNum, blockNum,
838 mode, strategy);
839}
840
841/*
842 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
843 */
844Buffer
845ExtendBufferedRel(BufferManagerRelation bmr,
846 ForkNumber forkNum,
847 BufferAccessStrategy strategy,
848 uint32 flags)
849{
850 Buffer buf;
851 uint32 extend_by = 1;
852
853 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854 &buf, &extend_by);
855
856 return buf;
857}
858
859/*
860 * Extend relation by multiple blocks.
861 *
862 * Tries to extend the relation by extend_by blocks. Depending on the
863 * availability of resources the relation may end up being extended by a
864 * smaller number of pages (unless an error is thrown, always by at least one
865 * page). *extended_by is updated to the number of pages by which the
866 * relation has been extended.
867 *
868 * buffers needs to be an array that is at least extend_by long. Upon
869 * completion, the first extend_by array elements will point to a pinned
870 * buffer.
871 *
872 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
873 * locked. This is useful for callers that want a buffer that is guaranteed to
874 * be empty.
875 */
876BlockNumber
877ExtendBufferedRelBy(BufferManagerRelation bmr,
878 ForkNumber fork,
879 BufferAccessStrategy strategy,
880 uint32 flags,
881 uint32 extend_by,
882 Buffer *buffers,
883 uint32 *extended_by)
884{
885 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
887 Assert(extend_by > 0);
888
889 if (bmr.relpersistence == '\0')
890 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
891
892 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
893 extend_by, InvalidBlockNumber,
894 buffers, extended_by);
895}
896
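/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: bulk extension
 * as described above. The relation is grown by up to eight blocks in one
 * call; only as many buffers as were actually added come back pinned. "rel"
 * is assumed.
 *
 *		Buffer		bufs[8];
 *		uint32		extended_by = 0;
 *		BlockNumber first_new_block;
 *
 *		first_new_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
 *											  NULL, 0, lengthof(bufs),
 *											  bufs, &extended_by);
 *		for (uint32 i = 0; i < extended_by; i++)
 *			ReleaseBuffer(bufs[i]);
 */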
897/*
898 * Extend the relation so it is at least extend_to blocks large, return buffer
899 * (extend_to - 1).
900 *
901 * This is useful for callers that want to write a specific page, regardless
902 * of the current size of the relation (e.g. useful for visibilitymap and for
903 * crash recovery).
904 */
905Buffer
907 ForkNumber fork,
908 BufferAccessStrategy strategy,
909 uint32 flags,
910 BlockNumber extend_to,
911 ReadBufferMode mode)
912{
913 BlockNumber current_size;
914 uint32 extended_by = 0;
915 Buffer buffer = InvalidBuffer;
916 Buffer buffers[64];
917
918 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
919 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
920 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
921
922 if (bmr.relpersistence == '\0')
923 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
924
925 /*
926 * If desired, create the file if it doesn't exist. If
927 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
928 * an smgrexists call.
929 */
930 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
931 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
932 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
933 !smgrexists(BMR_GET_SMGR(bmr), fork))
934 {
936
937 /* recheck, fork might have been created concurrently */
938 if (!smgrexists(BMR_GET_SMGR(bmr), fork))
940
942 }
943
944 /*
945 * If requested, invalidate size cache, so that smgrnblocks asks the
946 * kernel.
947 */
948 if (flags & EB_CLEAR_SIZE_CACHE)
949 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
950
951 /*
952 * Estimate how many pages we'll need to extend by. This avoids acquiring
953 * unnecessarily many victim buffers.
954 */
956
957 /*
958 * Since no-one else can be looking at the page contents yet, there is no
959 * difference between an exclusive lock and a cleanup-strength lock. Note
960 * that we pass the original mode to ReadBuffer_common() below, when
961 * falling back to reading the buffer to a concurrent relation extension.
962 */
964 flags |= EB_LOCK_TARGET;
965
966 while (current_size < extend_to)
967 {
968 uint32 num_pages = lengthof(buffers);
969 BlockNumber first_block;
970
971 if ((uint64) current_size + num_pages > extend_to)
972 num_pages = extend_to - current_size;
973
974 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
975 num_pages, extend_to,
976 buffers, &extended_by);
977
978 current_size = first_block + extended_by;
979 Assert(num_pages != 0 || current_size >= extend_to);
980
981 for (uint32 i = 0; i < extended_by; i++)
982 {
983 if (first_block + i != extend_to - 1)
984 ReleaseBuffer(buffers[i]);
985 else
986 buffer = buffers[i];
987 }
988 }
989
990 /*
991 * It's possible that another backend concurrently extended the relation.
992 * In that case read the buffer.
993 *
994 * XXX: Should we control this via a flag?
995 */
996 if (buffer == InvalidBuffer)
997 {
998 Assert(extended_by == 0);
1000 fork, extend_to - 1, mode, strategy);
1001 }
1002
1003 return buffer;
1004}
1005
1006/*
1007 * Lock and optionally zero a buffer, as part of the implementation of
1008 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1009 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1010 */
1011static void
1012ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1013{
1014 BufferDesc *bufHdr;
1015 bool need_to_zero;
1016 bool isLocalBuf = BufferIsLocal(buffer);
1017
1019
1020 if (already_valid)
1021 {
1022 /*
1023 * If the caller already knew the buffer was valid, we can skip some
1024 * header interaction. The caller just wants to lock the buffer.
1025 */
1026 need_to_zero = false;
1027 }
1028 else if (isLocalBuf)
1029 {
1030 /* Simple case for non-shared buffers. */
1031 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1032 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1033 }
1034 else
1035 {
1036 /*
1037 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1038 * concurrently. Even though we aren't doing I/O, that ensures that
1039 * we don't zero a page that someone else has pinned. An exclusive
1040 * content lock wouldn't be enough, because readers are allowed to
1041 * drop the content lock after determining that a tuple is visible
1042 * (see buffer access rules in README).
1043 */
1044 bufHdr = GetBufferDescriptor(buffer - 1);
1045 need_to_zero = StartBufferIO(bufHdr, true, false);
1046 }
1047
1048 if (need_to_zero)
1049 {
1050 memset(BufferGetPage(buffer), 0, BLCKSZ);
1051
1052 /*
1053 * Grab the buffer content lock before marking the page as valid, to
1054 * make sure that no other backend sees the zeroed page before the
1055 * caller has had a chance to initialize it.
1056 *
1057 * Since no-one else can be looking at the page contents yet, there is
1058 * no difference between an exclusive lock and a cleanup-strength
1059 * lock. (Note that we cannot use LockBuffer() or
1060 * LockBufferForCleanup() here, because they assert that the buffer is
1061 * already valid.)
1062 */
1063 if (!isLocalBuf)
1065
1066 /* Set BM_VALID, terminate IO, and wake up any waiters */
1067 if (isLocalBuf)
1068 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1069 else
1070 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1071 }
1072 else if (!isLocalBuf)
1073 {
1074 /*
1075 * The buffer is valid, so we can't zero it. The caller still expects
1076 * the page to be locked on return.
1077 */
1078 if (mode == RBM_ZERO_AND_LOCK)
1080 else
1082 }
1083}
1084
1085/*
1086 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1087 * already present, or false if more work is required to either read it in or
1088 * zero it.
1089 */
1090static pg_attribute_always_inline Buffer
1091PinBufferForBlock(Relation rel,
1092 SMgrRelation smgr,
1093 char persistence,
1094 ForkNumber forkNum,
1095 BlockNumber blockNum,
1096 BufferAccessStrategy strategy,
1097 bool *foundPtr)
1098{
1099 BufferDesc *bufHdr;
1100 IOContext io_context;
1101 IOObject io_object;
1102
1103 Assert(blockNum != P_NEW);
1104
1105 /* Persistence should be set before */
1106 Assert((persistence == RELPERSISTENCE_TEMP ||
1107 persistence == RELPERSISTENCE_PERMANENT ||
1108 persistence == RELPERSISTENCE_UNLOGGED));
1109
1110 if (persistence == RELPERSISTENCE_TEMP)
1111 {
1112 io_context = IOCONTEXT_NORMAL;
1113 io_object = IOOBJECT_TEMP_RELATION;
1114 }
1115 else
1116 {
1117 io_context = IOContextForStrategy(strategy);
1118 io_object = IOOBJECT_RELATION;
1119 }
1120
1121 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1125 smgr->smgr_rlocator.backend);
1126
1127 if (persistence == RELPERSISTENCE_TEMP)
1128 {
1129 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1130 if (*foundPtr)
1132 }
1133 else
1134 {
1135 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1136 strategy, foundPtr, io_context);
1137 if (*foundPtr)
1139 }
1140 if (rel)
1141 {
1142 /*
1143 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1144 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1145 * zeroed instead), the per-relation stats always count them.
1146 */
1148 if (*foundPtr)
1150 }
1151 if (*foundPtr)
1152 {
1153 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1154 if (VacuumCostActive)
1156
1157 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1161 smgr->smgr_rlocator.backend,
1162 true);
1163 }
1164
1165 return BufferDescriptorGetBuffer(bufHdr);
1166}
1167
1168/*
1169 * ReadBuffer_common -- common logic for all ReadBuffer variants
1170 *
1171 * smgr is required, rel is optional unless using P_NEW.
1172 */
1173static pg_attribute_always_inline Buffer
1174ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1175 ForkNumber forkNum,
1176 BlockNumber blockNum, ReadBufferMode mode,
1177 BufferAccessStrategy strategy)
1178{
1179 ReadBuffersOperation operation;
1180 Buffer buffer;
1181 int flags;
1182 char persistence;
1183
1184 /*
1185 * Backward compatibility path, most code should use ExtendBufferedRel()
1186 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1187 * scales a lot better.
1188 */
1189 if (unlikely(blockNum == P_NEW))
1190 {
1192
1193 /*
1194 * Since no-one else can be looking at the page contents yet, there is
1195 * no difference between an exclusive lock and a cleanup-strength
1196 * lock.
1197 */
1199 flags |= EB_LOCK_FIRST;
1200
1201 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1202 }
1203
1204 if (rel)
1205 persistence = rel->rd_rel->relpersistence;
1206 else
1207 persistence = smgr_persistence;
1208
1211 {
1212 bool found;
1213
1214 buffer = PinBufferForBlock(rel, smgr, persistence,
1215 forkNum, blockNum, strategy, &found);
1216 ZeroAndLockBuffer(buffer, mode, found);
1217 return buffer;
1218 }
1219
1220 /*
1221 * Signal that we are going to immediately wait. If we're immediately
1222 * waiting, there is no benefit in actually executing the IO
1223 * asynchronously, it would just add dispatch overhead.
1224 */
1226 if (mode == RBM_ZERO_ON_ERROR)
1228 operation.smgr = smgr;
1229 operation.rel = rel;
1230 operation.persistence = persistence;
1231 operation.forknum = forkNum;
1232 operation.strategy = strategy;
1233 if (StartReadBuffer(&operation,
1234 &buffer,
1235 blockNum,
1236 flags))
1237 WaitReadBuffers(&operation);
1238
1239 return buffer;
1240}
1241
1242static pg_attribute_always_inline bool
1243StartReadBuffersImpl(ReadBuffersOperation *operation,
1244 Buffer *buffers,
1245 BlockNumber blockNum,
1246 int *nblocks,
1247 int flags,
1248 bool allow_forwarding)
1249{
1250 int actual_nblocks = *nblocks;
1251 int maxcombine = 0;
1252 bool did_start_io;
1253
1254 Assert(*nblocks == 1 || allow_forwarding);
1255 Assert(*nblocks > 0);
1256 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1257
1258 for (int i = 0; i < actual_nblocks; ++i)
1259 {
1260 bool found;
1261
1262 if (allow_forwarding && buffers[i] != InvalidBuffer)
1263 {
1264 BufferDesc *bufHdr;
1265
1266 /*
1267 * This is a buffer that was pinned by an earlier call to
1268 * StartReadBuffers(), but couldn't be handled in one operation at
1269 * that time. The operation was split, and the caller has passed
1270 * an already pinned buffer back to us to handle the rest of the
1271 * operation. It must continue at the expected block number.
1272 */
1273 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1274
1275 /*
1276 * It might be an already valid buffer (a hit) that followed the
1277 * final contiguous block of an earlier I/O (a miss) marking the
1278 * end of it, or a buffer that some other backend has since made
1279 * valid by performing the I/O for us, in which case we can handle
1280 * it as a hit now. It is safe to check for a BM_VALID flag with
1281 * a relaxed load, because we got a fresh view of it while pinning
1282 * it in the previous call.
1283 *
1284 * On the other hand if we don't see BM_VALID yet, it must be an
1285 * I/O that was split by the previous call and we need to try to
1286 * start a new I/O from this block. We're also racing against any
1287 * other backend that might start the I/O or even manage to mark
1288 * it BM_VALID after this check, but StartBufferIO() will handle
1289 * those cases.
1290 */
1291 if (BufferIsLocal(buffers[i]))
1292 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1293 else
1294 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1296 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1297 }
1298 else
1299 {
1300 buffers[i] = PinBufferForBlock(operation->rel,
1301 operation->smgr,
1302 operation->persistence,
1303 operation->forknum,
1304 blockNum + i,
1305 operation->strategy,
1306 &found);
1307 }
1308
1309 if (found)
1310 {
1311 /*
1312 * We have a hit. If it's the first block in the requested range,
1313 * we can return it immediately and report that WaitReadBuffers()
1314 * does not need to be called. If the initial value of *nblocks
1315 * was larger, the caller will have to call again for the rest.
1316 */
1317 if (i == 0)
1318 {
1319 *nblocks = 1;
1320
1321#ifdef USE_ASSERT_CHECKING
1322
1323 /*
1324 * Initialize enough of ReadBuffersOperation to make
1325 * CheckReadBuffersOperation() work. Outside of assertions
1326 * that's not necessary when no IO is issued.
1327 */
1328 operation->buffers = buffers;
1329 operation->blocknum = blockNum;
1330 operation->nblocks = 1;
1331 operation->nblocks_done = 1;
1332 CheckReadBuffersOperation(operation, true);
1333#endif
1334 return false;
1335 }
1336
1337 /*
1338 * Otherwise we already have an I/O to perform, but this block
1339 * can't be included as it is already valid. Split the I/O here.
1340 * There may or may not be more blocks requiring I/O after this
1341 * one, we haven't checked, but they can't be contiguous with this
1342 * one in the way. We'll leave this buffer pinned, forwarding it
1343 * to the next call, avoiding the need to unpin it here and re-pin
1344 * it in the next call.
1345 */
1346 actual_nblocks = i;
1347 break;
1348 }
1349 else
1350 {
1351 /*
1352 * Check how many blocks we can cover with the same IO. The smgr
1353 * implementation might e.g. be limited due to a segment boundary.
1354 */
1355 if (i == 0 && actual_nblocks > 1)
1356 {
1357 maxcombine = smgrmaxcombine(operation->smgr,
1358 operation->forknum,
1359 blockNum);
1360 if (unlikely(maxcombine < actual_nblocks))
1361 {
1362 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1363 blockNum, actual_nblocks, maxcombine);
1364 actual_nblocks = maxcombine;
1365 }
1366 }
1367 }
1368 }
1369 *nblocks = actual_nblocks;
1370
1371 /* Populate information needed for I/O. */
1372 operation->buffers = buffers;
1373 operation->blocknum = blockNum;
1374 operation->flags = flags;
1375 operation->nblocks = actual_nblocks;
1376 operation->nblocks_done = 0;
1377 pgaio_wref_clear(&operation->io_wref);
1378
1379 /*
1380 * When using AIO, start the IO in the background. If not, issue prefetch
1381 * requests if desired by the caller.
1382 *
1383 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1384 * de-risk the introduction of AIO somewhat. It's a large architectural
1385 * change, with lots of chances for unanticipated performance effects.
1386 *
1387 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1388 * asynchronously, but without the check here we'd execute IO earlier than
1389 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1390 */
1391 if (io_method != IOMETHOD_SYNC)
1392 {
1393 /*
1394 * Try to start IO asynchronously. It's possible that no IO needs to
1395 * be started, if another backend already performed the IO.
1396 *
1397 * Note that if an IO is started, it might not cover the entire
1398 * requested range, e.g. because an intermediary block has been read
1399 * in by another backend. In that case any "trailing" buffers we
1400 * already pinned above will be "forwarded" by read_stream.c to the
1401 * next call to StartReadBuffers().
1402 *
1403 * This is signalled to the caller by decrementing *nblocks *and*
1404 * reducing operation->nblocks. The latter is done here, but not below
1405 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1406 * overall read size anymore, we need to retry until done in its
1407 * entirety or until failed.
1408 */
1409 did_start_io = AsyncReadBuffers(operation, nblocks);
1410
1411 operation->nblocks = *nblocks;
1412 }
1413 else
1414 {
1415 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1416
1417 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1418 {
1419 /*
1420 * In theory we should only do this if PinBufferForBlock() had to
1421 * allocate new buffers above. That way, if two calls to
1422 * StartReadBuffers() were made for the same blocks before
1423 * WaitReadBuffers(), only the first would issue the advice.
1424 * That'd be a better simulation of true asynchronous I/O, which
1425 * would only start the I/O once, but isn't done here for
1426 * simplicity.
1427 */
1428 smgrprefetch(operation->smgr,
1429 operation->forknum,
1430 blockNum,
1431 actual_nblocks);
1432 }
1433
1434 /*
1435 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1436 * will initiate the necessary IO.
1437 */
1438 did_start_io = true;
1439 }
1440
1441 CheckReadBuffersOperation(operation, !did_start_io);
1442
1443 return did_start_io;
1444}
1445
1446/*
1447 * Begin reading a range of blocks beginning at blockNum and extending for
1448 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1449 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1450 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1451 * and is now being continued. On return, *nblocks holds the number of blocks
1452 * accepted by this operation. If it is less than the original number then
1453 * this operation has been split, but buffer elements up to the original
1454 * requested size may hold forwarded buffers to be used for a continuing
1455 * operation. The caller must either start a new I/O beginning at the block
1456 * immediately following the blocks accepted by this call and pass those
1457 * buffers back in, or release them if it chooses not to. It shouldn't make
1458 * any other use of or assumptions about forwarded buffers.
1459 *
1460 * If false is returned, no I/O is necessary and the buffers covered by
1461 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1462 * an I/O has been started, and WaitReadBuffers() must be called with the same
1463 * operation object before the buffers covered by *nblocks on exit can be
1464 * accessed. Along with the operation object, the caller-supplied array of
1465 * buffers must remain valid until WaitReadBuffers() is called, and any
1466 * forwarded buffers must also be preserved for a continuing call unless
1467 * they are explicitly released.
1468 */
1469bool
1470StartReadBuffers(ReadBuffersOperation *operation,
1471 Buffer *buffers,
1472 BlockNumber blockNum,
1473 int *nblocks,
1474 int flags)
1475{
1476 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1477 true /* expect forwarded buffers */ );
1478}
1479
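/*
 * Editor's note -- illustrative sketch, not part of bufmgr.c: the two-step
 * protocol described above, for one run of blocks and without forwarding
 * buffers across calls. "rel", "blkno" and "nblocks" are assumed; most
 * callers should use read_stream.c rather than this interface directly.
 *
 *		ReadBuffersOperation op = {0};
 *		Buffer		bufs[MAX_IO_COMBINE_LIMIT];
 *		int			nread = Min(nblocks, MAX_IO_COMBINE_LIMIT);
 *
 *		for (int i = 0; i < nread; i++)
 *			bufs[i] = InvalidBuffer;	(no forwarded buffers on entry)
 *
 *		op.rel = rel;
 *		op.smgr = RelationGetSmgr(rel);
 *		op.persistence = rel->rd_rel->relpersistence;
 *		op.forknum = MAIN_FORKNUM;
 *		op.strategy = NULL;
 *
 *		if (StartReadBuffers(&op, bufs, blkno, &nread, 0))
 *			WaitReadBuffers(&op);
 *
 *		bufs[0 .. nread - 1] are now valid and pinned; a smaller-than-requested
 *		nread means the operation was split and must be continued by the caller.
 */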
1480/*
1481 * Single block version of the StartReadBuffers(). This might save a few
1482 * instructions when called from another translation unit, because it is
1483 * specialized for nblocks == 1.
1484 *
1485 * This version does not support "forwarded" buffers: they cannot be created
1486 * by reading only one block and *buffer is ignored on entry.
1487 */
1488bool
1489StartReadBuffer(ReadBuffersOperation *operation,
1490 Buffer *buffer,
1491 BlockNumber blocknum,
1492 int flags)
1493{
1494 int nblocks = 1;
1495 bool result;
1496
1497 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1498 false /* single block, no forwarding */ );
1499 Assert(nblocks == 1); /* single block can't be short */
1500
1501 return result;
1502}
1503
1504/*
1505 * Perform sanity checks on the ReadBuffersOperation.
1506 */
1507static void
1508CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1509{
1510#ifdef USE_ASSERT_CHECKING
1511 Assert(operation->nblocks_done <= operation->nblocks);
1512 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1513
1514 for (int i = 0; i < operation->nblocks; i++)
1515 {
1516 Buffer buffer = operation->buffers[i];
1517 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1520
1521 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1523
1524 if (i < operation->nblocks_done)
1526 }
1527#endif
1528}
1529
1530/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1531static inline bool
1532ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1533{
1534 if (BufferIsLocal(buffer))
1536 true, nowait);
1537 else
1538 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1539}
1540
1541/*
1542 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1543 */
1544static inline bool
1545ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1546{
1547 /*
1548 * If this backend currently has staged IO, we need to submit the pending
1549 * IO before waiting for the right to issue IO, to avoid the potential for
1550 * deadlocks (and, more commonly, unnecessary delays for other backends).
1551 */
1552 if (!nowait && pgaio_have_staged())
1553 {
1555 return true;
1556
1557 /*
1558 * Unfortunately StartBufferIO() returning false doesn't let us
1559 * distinguish between the buffer already being valid and IO already
1560 * being in progress. Since IO already being in progress is quite
1561 * rare, this approach seems fine.
1562 */
1564 }
1565
1566 return ReadBuffersCanStartIOOnce(buffer, nowait);
1567}
1568
1569/*
1570 * Helper for WaitReadBuffers() that processes the results of a readv
1571 * operation, raising an error if necessary.
1572 */
1573static void
1574ProcessReadBuffersResult(ReadBuffersOperation *operation)
1575{
1576 PgAioReturn *aio_ret = &operation->io_return;
1577 PgAioResultStatus rs = aio_ret->result.status;
1578 int newly_read_blocks = 0;
1579
1580 Assert(pgaio_wref_valid(&operation->io_wref));
1581 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1582
1583 /*
1584 * SMGR reports the number of blocks successfully read as the result of
1585 * the IO operation. Thus we can simply add that to ->nblocks_done.
1586 */
1587
1588 if (likely(rs != PGAIO_RS_ERROR))
1589 newly_read_blocks = aio_ret->result.result;
1590
1591 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1592 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1593 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1594 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1595 {
1596 /*
1597 * We'll retry, so we just emit a debug message to the server log (or
1598 * not even that in prod scenarios).
1599 */
1600 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1601 elog(DEBUG3, "partial read, will retry");
1602 }
1603
1604 Assert(newly_read_blocks > 0);
1605 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1606
1607 operation->nblocks_done += newly_read_blocks;
1608
1609 Assert(operation->nblocks_done <= operation->nblocks);
1610}
1611
1612void
1613WaitReadBuffers(ReadBuffersOperation *operation)
1614{
1615 PgAioReturn *aio_ret = &operation->io_return;
1616 IOContext io_context;
1617 IOObject io_object;
1618
1619 if (operation->persistence == RELPERSISTENCE_TEMP)
1620 {
1621 io_context = IOCONTEXT_NORMAL;
1622 io_object = IOOBJECT_TEMP_RELATION;
1623 }
1624 else
1625 {
1626 io_context = IOContextForStrategy(operation->strategy);
1627 io_object = IOOBJECT_RELATION;
1628 }
1629
1630 /*
1631 * If we get here without an IO operation having been issued, the
1632 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1633 * caller should not have called WaitReadBuffers().
1634 *
1635 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1636 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1637 * of the retry logic below, no extra code is required.
1638 *
1639 * This path is expected to eventually go away.
1640 */
1641 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1642 elog(ERROR, "waiting for read operation that didn't read");
1643
1644 /*
1645 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1646 * done. We may need multiple retries, not just because we could get
1647 * multiple partial reads, but also because some of the remaining
1648 * to-be-read buffers may have been read in by other backends, limiting
1649 * the IO size.
1650 */
1651 while (true)
1652 {
1653 int ignored_nblocks_progress;
1654
1655 CheckReadBuffersOperation(operation, false);
1656
1657 /*
1658 * If there is an IO associated with the operation, we may need to
1659 * wait for it.
1660 */
1661 if (pgaio_wref_valid(&operation->io_wref))
1662 {
1663 /*
1664 * Track the time spent waiting for the IO to complete. As
1665 * tracking a wait even if we don't actually need to wait
1666 *
1667 * a) is not cheap, due to the timestamping overhead
1668 *
1669 * b) reports some time as waiting, even if we never waited
1670 *
1671 * we first check if we already know the IO is complete.
1672 */
1673 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1674 !pgaio_wref_check_done(&operation->io_wref))
1675 {
1677
1678 pgaio_wref_wait(&operation->io_wref);
1679
1680 /*
1681 * The IO operation itself was already counted earlier, in
1682 * AsyncReadBuffers(), this just accounts for the wait time.
1683 */
1684 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1685 io_start, 0, 0);
1686 }
1687 else
1688 {
1689 Assert(pgaio_wref_check_done(&operation->io_wref));
1690 }
1691
1692 /*
1693 * We now are sure the IO completed. Check the results. This
1694 * includes reporting on errors if there were any.
1695 */
1696 ProcessReadBuffersResult(operation);
1697 }
1698
1699 /*
1700 * Most of the time, the one IO we already started will read in
1701 * everything. But we need to deal with partial reads and buffers not
1702 * needing IO anymore.
1703 */
1704 if (operation->nblocks_done == operation->nblocks)
1705 break;
1706
1708
1709 /*
1710 * This may only complete the IO partially, either because some
1711 * buffers were already valid, or because of a partial read.
1712 *
1713 * NB: In contrast to after the AsyncReadBuffers() call in
1714 * StartReadBuffers(), we do *not* reduce
1715 * ReadBuffersOperation->nblocks here, callers expect the full
1716 * operation to be completed at this point (as more operations may
1717 * have been queued).
1718 */
1719 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1720 }
1721
1722 CheckReadBuffersOperation(operation, true);
1723
1724 /* NB: READ_DONE tracepoint was already executed in completion callback */
1725}
1726
1727/*
1728 * Initiate IO for the ReadBuffersOperation
1729 *
1730 * This function only starts a single IO at a time. The IO may cover fewer
1731 * blocks than requested, if one of the buffers has concurrently been read
1732 * in by another backend. If the first to-be-read buffer is already valid,
1733 * no IO will be issued.
1734 *
1735 * To support retries after partial reads, the first operation->nblocks_done
1736 * buffers are skipped.
1737 *
1738 * On return *nblocks_progress is updated to reflect the number of buffers
1739 * affected by the call. If the first buffer is valid, *nblocks_progress is
1740 * set to 1 and operation->nblocks_done is incremented.
1741 *
1742 * Returns true if IO was initiated, false if no IO was necessary.
1743 */
1744static bool
1745AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1746{
1747 Buffer *buffers = &operation->buffers[0];
1748 int flags = operation->flags;
1749 BlockNumber blocknum = operation->blocknum;
1750 ForkNumber forknum = operation->forknum;
1751 char persistence = operation->persistence;
1752 int16 nblocks_done = operation->nblocks_done;
1753 Buffer *io_buffers = &operation->buffers[nblocks_done];
1754 int io_buffers_len = 0;
1755 PgAioHandle *ioh;
1756 uint32 ioh_flags = 0;
1757 void *io_pages[MAX_IO_COMBINE_LIMIT];
1758 IOContext io_context;
1759 IOObject io_object;
1760 bool did_start_io;
1761
1762 /*
1763 * When this IO is executed synchronously, either because the caller will
1764 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1765 * the AIO subsystem needs to know.
1766 */
1767 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1768 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1769
1770 if (persistence == RELPERSISTENCE_TEMP)
1771 {
1772 io_context = IOCONTEXT_NORMAL;
1773 io_object = IOOBJECT_TEMP_RELATION;
1774 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1775 }
1776 else
1777 {
1778 io_context = IOContextForStrategy(operation->strategy);
1779 io_object = IOOBJECT_RELATION;
1780 }
1781
1782 /*
1783 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1784 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1785 * set globally, but on a per-session basis. The completion callback,
1786 * which may be run in other processes, e.g. in IO workers, may have a
1787 * different value of the zero_damaged_pages GUC.
1788 *
1789 * XXX: We probably should eventually use a different flag for
1790 * zero_damaged_pages, so we can report different log levels / error codes
1791 * for zero_damaged_pages and ZERO_ON_ERROR.
1792 */
1795
1796 /*
1797 * For the same reason as with zero_damaged_pages we need to use this
1798 * backend's ignore_checksum_failure value.
1799 */
1802
1803
1804 /*
1805 * To be allowed to report stats in the local completion callback we need
1806 * to prepare to report stats now. This ensures we can safely report the
1807 * checksum failure even in a critical section.
1808 */
1810
1811 /*
1812 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1813 * might block, which we don't want after setting IO_IN_PROGRESS.
1814 *
1815 * If we need to wait for IO before we can get a handle, submit
1816 * already-staged IO first, so that other backends don't need to wait.
1817 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1818 * wait for already submitted IO, which doesn't require additional locks,
1819 * but it could still cause undesirable waits.
1820 *
1821 * A secondary benefit is that this would allow us to measure the time in
1822 * pgaio_io_acquire() without causing undue timer overhead in the common,
1823 * non-blocking, case. However, currently the pgstats infrastructure
1824 * doesn't really allow that, as it a) asserts that an operation can't
1825 * have time without operations b) doesn't have an API to report
1826 * "accumulated" time.
1827 */
1829 if (unlikely(!ioh))
1830 {
1832
1834 }
1835
1836 /*
1837 * Check if we can start IO on the first to-be-read buffer.
1838 *
1839 * If an I/O is already in progress in another backend, we want to wait
1840 * for the outcome: either done, or something went wrong and we will
1841 * retry.
1842 */
1843 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1844 {
1845 /*
1846 * Someone else has already completed this block, we're done.
1847 *
1848 * When IO is necessary, ->nblocks_done is updated in
1849 * ProcessReadBuffersResult(), but that is not called if no IO is
1850 * necessary. Thus update here.
1851 */
1852 operation->nblocks_done += 1;
1853 *nblocks_progress = 1;
1854
1855 pgaio_io_release(ioh);
1856 pgaio_wref_clear(&operation->io_wref);
1857 did_start_io = false;
1858
1859 /*
1860 * Report and track this as a 'hit' for this backend, even though it
1861 * must have started out as a miss in PinBufferForBlock(). The other
1862 * backend will track this as a 'read'.
1863 */
1864 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1865 operation->smgr->smgr_rlocator.locator.spcOid,
1866 operation->smgr->smgr_rlocator.locator.dbOid,
1867 operation->smgr->smgr_rlocator.locator.relNumber,
1868 operation->smgr->smgr_rlocator.backend,
1869 true);
1870
1871 if (persistence == RELPERSISTENCE_TEMP)
1873 else
1875
1876 if (operation->rel)
1877 pgstat_count_buffer_hit(operation->rel);
1878
1879 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1880
1881 if (VacuumCostActive)
1883 }
1884 else
1885 {
1886 instr_time io_start;
1887
1888 /* We found a buffer that we need to read in. */
1889 Assert(io_buffers[0] == buffers[nblocks_done]);
1890 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1891 io_buffers_len = 1;
1892
1893 /*
1894 * How many neighboring-on-disk blocks can we scatter-read into other
1895 * buffers at the same time? In this case we don't wait if we see an
1896 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1897 * head block, so we should get on with that I/O as soon as possible.
1898 */
1899 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1900 {
1901 if (!ReadBuffersCanStartIO(buffers[i], true))
1902 break;
1903 /* Must be consecutive block numbers. */
1904 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1905 BufferGetBlockNumber(buffers[i]) - 1);
1906 Assert(io_buffers[io_buffers_len] == buffers[i]);
1907
1908 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1909 }
1910
1911 /* get a reference to wait for in WaitReadBuffers() */
1912 pgaio_io_get_wref(ioh, &operation->io_wref);
1913
1914 /* provide the list of buffers to the completion callbacks */
1915 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1916
1918 persistence == RELPERSISTENCE_TEMP ?
1921 flags);
1922
1923 pgaio_io_set_flag(ioh, ioh_flags);
1924
1925 /* ---
1926 * Even though we're trying to issue IO asynchronously, track the time
1927 * in smgrstartreadv():
1928 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1929 * immediately
1930 * - the io method might not support the IO (e.g. worker IO for a temp
1931 * table)
1932 * ---
1933 */
1935 smgrstartreadv(ioh, operation->smgr, forknum,
1936 blocknum + nblocks_done,
1937 io_pages, io_buffers_len);
1938 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1939 io_start, 1, io_buffers_len * BLCKSZ);
1940
1941 if (persistence == RELPERSISTENCE_TEMP)
1942 pgBufferUsage.local_blks_read += io_buffers_len;
1943 else
1944 pgBufferUsage.shared_blks_read += io_buffers_len;
1945
1946 /*
1947 * Track vacuum cost when issuing IO, not after waiting for it.
1948 * Otherwise we could end up issuing a lot of IO in a short timespan,
1949 * despite a low cost limit.
1950 */
1951 if (VacuumCostActive)
1952 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1953
1954 *nblocks_progress = io_buffers_len;
1955 did_start_io = true;
1956 }
1957
1958 return did_start_io;
1959}
1960
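/*
 * Illustrative sketch, not part of bufmgr.c: the routine above issues the
 * asynchronous portion of the split read protocol referred to in the
 * comments (StartReadBuffers() followed by WaitReadBuffers()).  A caller's
 * control flow has roughly the shape below; the struct initialization,
 * array bound and return-value convention are assumptions here, so consult
 * bufmgr.h for the authoritative interface.
 *
 *		ReadBuffersOperation op = {0};	// caller fills in rel, smgr, etc.
 *		Buffer	bufs[MAX_IO_COMBINE_LIMIT];
 *		int		nblocks = ...;
 *
 *		if (StartReadBuffers(&op, bufs, blocknum, &nblocks, flags))
 *			WaitReadBuffers(&op);		// only needed if IO was started
 *		// bufs[0 .. nblocks - 1] are now pinned and valid
 */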
1961/*
1962 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1963 * buffer. If no buffer exists already, selects a replacement victim and
1964 * evicts the old page, but does NOT read in the new page.
1965 *
1966 * "strategy" can be a buffer replacement strategy object, or NULL for
1967 * the default strategy. The selected buffer's usage_count is advanced when
1968 * using the default strategy, but otherwise possibly not (see PinBuffer).
1969 *
1970 * The returned buffer is pinned and is already marked as holding the
1971 * desired page. If it already did have the desired page, *foundPtr is
1972 * set true. Otherwise, *foundPtr is set false.
1973 *
1974 * io_context is passed as an output parameter to avoid calling
1975 * IOContextForStrategy() when there is a shared buffers hit and no IO
1976 * statistics need be captured.
1977 *
1978 * No locks are held either at entry or exit.
1979 */
1981BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1982 BlockNumber blockNum,
1983 BufferAccessStrategy strategy,
1984 bool *foundPtr, IOContext io_context)
1985{
1986 BufferTag newTag; /* identity of requested block */
1987 uint32 newHash; /* hash value for newTag */
1988 LWLock *newPartitionLock; /* buffer partition lock for it */
1989 int existing_buf_id;
1990 Buffer victim_buffer;
1991 BufferDesc *victim_buf_hdr;
1992 uint32 victim_buf_state;
1993 uint32 set_bits = 0;
1994
1995 /* Make sure we will have room to remember the buffer pin */
1998
1999 /* create a tag so we can lookup the buffer */
2000 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2001
2002 /* determine its hash code and partition lock ID */
2003 newHash = BufTableHashCode(&newTag);
2004 newPartitionLock = BufMappingPartitionLock(newHash);
2005
2006 /* see if the block is in the buffer pool already */
2007 LWLockAcquire(newPartitionLock, LW_SHARED);
2008 existing_buf_id = BufTableLookup(&newTag, newHash);
2009 if (existing_buf_id >= 0)
2010 {
2011 BufferDesc *buf;
2012 bool valid;
2013
2014 /*
2015 * Found it. Now, pin the buffer so no one can steal it from the
2016 * buffer pool, and check to see if the correct data has been loaded
2017 * into the buffer.
2018 */
2019 buf = GetBufferDescriptor(existing_buf_id);
2020
2021 valid = PinBuffer(buf, strategy, false);
2022
2023 /* Can release the mapping lock as soon as we've pinned it */
2024 LWLockRelease(newPartitionLock);
2025
2026 *foundPtr = true;
2027
2028 if (!valid)
2029 {
2030 /*
2031 * We can only get here if (a) someone else is still reading in
2032 * the page, (b) a previous read attempt failed, or (c) someone
2033 * called StartReadBuffers() but not yet WaitReadBuffers().
2034 */
2035 *foundPtr = false;
2036 }
2037
2038 return buf;
2039 }
2040
2041 /*
2042 * Didn't find it in the buffer pool. We'll have to initialize a new
2043 * buffer. Remember to unlock the mapping lock while doing the work.
2044 */
2045 LWLockRelease(newPartitionLock);
2046
2047 /*
2048 * Acquire a victim buffer. Somebody else might try to do the same, as we
2049 * don't hold any conflicting locks. If so we'll have to undo our work
2050 * later.
2051 */
2052 victim_buffer = GetVictimBuffer(strategy, io_context);
2053 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2054
2055 /*
2056 * Try to make a hashtable entry for the buffer under its new tag. If
2057 * somebody else inserted another buffer for the tag, we'll release the
2058 * victim buffer we acquired and use the already inserted one.
2059 */
2060 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2061 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2062 if (existing_buf_id >= 0)
2063 {
2064 BufferDesc *existing_buf_hdr;
2065 bool valid;
2066
2067 /*
2068 * Got a collision. Someone has already done what we were about to do.
2069 * We'll just handle this as if it were found in the buffer pool in
2070 * the first place. First, give up the buffer we were planning to
2071 * use.
2072 *
2073 * We could do this after releasing the partition lock, but then we'd
2074 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2075 * before acquiring the lock, for the rare case of such a collision.
2076 */
2077 UnpinBuffer(victim_buf_hdr);
2078
2079 /* remaining code should match code at top of routine */
2080
2081 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2082
2083 valid = PinBuffer(existing_buf_hdr, strategy, false);
2084
2085 /* Can release the mapping lock as soon as we've pinned it */
2086 LWLockRelease(newPartitionLock);
2087
2088 *foundPtr = true;
2089
2090 if (!valid)
2091 {
2092 /*
2093 * We can only get here if (a) someone else is still reading in
2094 * the page, (b) a previous read attempt failed, or (c) someone
2095 * called StartReadBuffers() but not yet WaitReadBuffers().
2096 */
2097 *foundPtr = false;
2098 }
2099
2100 return existing_buf_hdr;
2101 }
2102
2103 /*
2104 * Need to lock the buffer header too in order to change its tag.
2105 */
2106 victim_buf_state = LockBufHdr(victim_buf_hdr);
2107
2108 /* some sanity checks while we hold the buffer header lock */
2109 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2110 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2111
2112 victim_buf_hdr->tag = newTag;
2113
2114 /*
2115 * Make sure BM_PERMANENT is set for buffers that must be written at every
2116 * checkpoint. Unlogged buffers only need to be written at shutdown
2117 * checkpoints, except for their "init" forks, which need to be treated
2118 * just like permanent relations.
2119 */
2120 set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2121 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2122 set_bits |= BM_PERMANENT;
2123
2124 UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
2125 set_bits, 0, 0);
2126
2127 LWLockRelease(newPartitionLock);
2128
2129 /*
2130 * Buffer contents are currently invalid.
2131 */
2132 *foundPtr = false;
2133
2134 return victim_buf_hdr;
2135}
2136
2137/*
2138 * InvalidateBuffer -- mark a shared buffer invalid.
2139 *
2140 * The buffer header spinlock must be held at entry. We drop it before
2141 * returning. (This is sane because the caller must have locked the
2142 * buffer in order to be sure it should be dropped.)
2143 *
2144 * This is used only in contexts such as dropping a relation. We assume
2145 * that no other backend could possibly be interested in using the page,
2146 * so the only reason the buffer might be pinned is if someone else is
2147 * trying to write it out. We have to let them finish before we can
2148 * reclaim the buffer.
2149 *
2150 * The buffer could get reclaimed by someone else while we are waiting
2151 * to acquire the necessary locks; if so, don't mess it up.
2152 */
2153static void
2155{
2156 BufferTag oldTag;
2157 uint32 oldHash; /* hash value for oldTag */
2158 LWLock *oldPartitionLock; /* buffer partition lock for it */
2159 uint32 oldFlags;
2160 uint32 buf_state;
2161
2162 /* Save the original buffer tag before dropping the spinlock */
2163 oldTag = buf->tag;
2164
2166
2167 /*
2168 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2169 * worth storing the hashcode in BufferDesc so we need not recompute it
2170 * here? Probably not.
2171 */
2172 oldHash = BufTableHashCode(&oldTag);
2173 oldPartitionLock = BufMappingPartitionLock(oldHash);
2174
2175retry:
2176
2177 /*
2178 * Acquire exclusive mapping lock in preparation for changing the buffer's
2179 * association.
2180 */
2181 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2182
2183 /* Re-lock the buffer header */
2184 buf_state = LockBufHdr(buf);
2185
2186 /* If it's changed while we were waiting for lock, do nothing */
2187 if (!BufferTagsEqual(&buf->tag, &oldTag))
2188 {
2190 LWLockRelease(oldPartitionLock);
2191 return;
2192 }
2193
2194 /*
2195 * We assume the reason for it to be pinned is that either we were
2196 * asynchronously reading the page in before erroring out or someone else
2197 * is flushing the page out. Wait for the IO to finish. (This could be
2198 * an infinite loop if the refcount is messed up... it would be nice to
2199 * time out after a while, but there seems no way to be sure how many loops
2200 * may be needed. Note that if the other guy has pinned the buffer but
2201 * not yet done StartBufferIO, WaitIO will fall through and we'll
2202 * effectively be busy-looping here.)
2203 */
2204 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2205 {
2207 LWLockRelease(oldPartitionLock);
2208 /* safety check: should definitely not be our *own* pin */
2210 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2211 WaitIO(buf);
2212 goto retry;
2213 }
2214
2215 /*
2216 * Clear out the buffer's tag and flags. We must do this to ensure that
2217 * linear scans of the buffer array don't think the buffer is valid.
2218 */
2219 oldFlags = buf_state & BUF_FLAG_MASK;
2220 ClearBufferTag(&buf->tag);
2221
2222 UnlockBufHdrExt(buf, buf_state,
2223 0,
2225 0);
2226
2227 /*
2228 * Remove the buffer from the lookup hashtable, if it was in there.
2229 */
2230 if (oldFlags & BM_TAG_VALID)
2231 BufTableDelete(&oldTag, oldHash);
2232
2233 /*
2234 * Done with mapping lock.
2235 */
2236 LWLockRelease(oldPartitionLock);
2237}
2238
2239/*
2240 * Helper routine for GetVictimBuffer()
2241 *
2242 * Needs to be called on a buffer with a valid tag, pinned, but without the
2243 * buffer header spinlock held.
2244 *
2245 * Returns true if the buffer can be reused, in which case the buffer is only
2246 * pinned by this backend and marked as invalid, false otherwise.
2247 */
2248static bool
2250{
2251 uint32 buf_state;
2252 uint32 hash;
2253 LWLock *partition_lock;
2254 BufferTag tag;
2255
2257
2258 /* have buffer pinned, so it's safe to read tag without lock */
2259 tag = buf_hdr->tag;
2260
2261 hash = BufTableHashCode(&tag);
2262 partition_lock = BufMappingPartitionLock(hash);
2263
2264 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2265
2266 /* lock the buffer header */
2267 buf_state = LockBufHdr(buf_hdr);
2268
2269 /*
2270 * We have the buffer pinned, so nobody else should have been able to unset
2271 * this concurrently.
2272 */
2273 Assert(buf_state & BM_TAG_VALID);
2274 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2275 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2276
2277 /*
2278 * If somebody else pinned the buffer since, or even worse, dirtied it,
2279 * give up on this buffer: It's clearly in use.
2280 */
2281 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2282 {
2283 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2284
2285 UnlockBufHdr(buf_hdr);
2286 LWLockRelease(partition_lock);
2287
2288 return false;
2289 }
2290
2291 /*
2292 * Clear out the buffer's tag and flags and usagecount. This is not
2293 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2294 * doing anything with the buffer. But currently it's beneficial, as the
2295 * cheaper pre-checks performed by several linear scans of shared buffers use the
2296 * tag (see e.g. FlushDatabaseBuffers()).
2297 */
2298 ClearBufferTag(&buf_hdr->tag);
2299 UnlockBufHdrExt(buf_hdr, buf_state,
2300 0,
2302 0);
2303
2304 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2305
2306 /* finally delete buffer from the buffer mapping table */
2307 BufTableDelete(&tag, hash);
2308
2309 LWLockRelease(partition_lock);
2310
2311 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2312 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2313 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2315
2316 return true;
2317}
2318
2319static Buffer
2321{
2322 BufferDesc *buf_hdr;
2323 Buffer buf;
2324 uint32 buf_state;
2325 bool from_ring;
2326
2327 /*
2328 * Ensure, before we pin a victim buffer, that there's a free refcount
2329 * entry and resource owner slot for the pin.
2330 */
2333
2334 /* we return here if a prospective victim buffer gets used concurrently */
2335again:
2336
2337 /*
2338 * Select a victim buffer. The buffer is returned pinned and owned by
2339 * this backend.
2340 */
2341 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2342 buf = BufferDescriptorGetBuffer(buf_hdr);
2343
2344 /*
2345 * We shouldn't have any other pins for this buffer.
2346 */
2348
2349 /*
2350 * If the buffer was dirty, try to write it out. There is a race
2351 * condition here, in that someone might dirty it after we released the
2352 * buffer header lock above, or even while we are writing it out (since
2353 * our share-lock won't prevent hint-bit updates). We will recheck the
2354 * dirty bit after re-locking the buffer header.
2355 */
2356 if (buf_state & BM_DIRTY)
2357 {
2358 LWLock *content_lock;
2359
2360 Assert(buf_state & BM_TAG_VALID);
2361 Assert(buf_state & BM_VALID);
2362
2363 /*
2364 * We need a share-lock on the buffer contents to write it out (else
2365 * we might write invalid data, eg because someone else is compacting
2366 * the page contents while we write). We must use a conditional lock
2367 * acquisition here to avoid deadlock. Even though the buffer was not
2368 * pinned (and therefore surely not locked) when StrategyGetBuffer
2369 * returned it, someone else could have pinned and exclusive-locked it
2370 * by the time we get here. If we try to get the lock unconditionally,
2371 * we'd block waiting for them; if they later block waiting for us,
2372 * deadlock ensues. (This has been observed to happen when two
2373 * backends are both trying to split btree index pages, and the second
2374 * one just happens to be trying to split the page the first one got
2375 * from StrategyGetBuffer.)
2376 */
2377 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2378 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2379 {
2380 /*
2381 * Someone else has locked the buffer, so give it up and loop back
2382 * to get another one.
2383 */
2384 UnpinBuffer(buf_hdr);
2385 goto again;
2386 }
2387
2388 /*
2389 * If using a nondefault strategy, and writing the buffer would
2390 * require a WAL flush, let the strategy decide whether to go ahead
2391 * and write/reuse the buffer or to choose another victim. We need a
2392 * lock to inspect the page LSN, so this can't be done inside
2393 * StrategyGetBuffer.
2394 */
2395 if (strategy != NULL)
2396 {
2397 XLogRecPtr lsn;
2398
2399 /* Read the LSN while holding buffer header lock */
2400 buf_state = LockBufHdr(buf_hdr);
2401 lsn = BufferGetLSN(buf_hdr);
2402 UnlockBufHdr(buf_hdr);
2403
2404 if (XLogNeedsFlush(lsn)
2405 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2406 {
2407 LWLockRelease(content_lock);
2408 UnpinBuffer(buf_hdr);
2409 goto again;
2410 }
2411 }
2412
2413 /* OK, do the I/O */
2414 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2415 LWLockRelease(content_lock);
2416
2418 &buf_hdr->tag);
2419 }
2420
2421
2422 if (buf_state & BM_VALID)
2423 {
2424 /*
2425 * When a BufferAccessStrategy is in use, blocks evicted from shared
2426 * buffers are counted as IOOP_EVICT in the corresponding context
2427 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2428 * strategy in two cases: 1) while initially claiming buffers for the
2429 * strategy ring, and 2) to replace an existing strategy ring buffer
2430 * because it is pinned or in use and cannot be reused.
2431 *
2432 * Blocks evicted from buffers already in the strategy ring are
2433 * counted as IOOP_REUSE in the corresponding strategy context.
2434 *
2435 * At this point, we can accurately count evictions and reuses,
2436 * because we have successfully claimed the valid buffer. Previously,
2437 * we may have been forced to release the buffer due to concurrent
2438 * pinners or erroring out.
2439 */
2441 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2442 }
2443
2444 /*
2445 * If the buffer has an entry in the buffer mapping table, delete it. This
2446 * can fail because another backend could have pinned or dirtied the
2447 * buffer.
2448 */
2449 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2450 {
2451 UnpinBuffer(buf_hdr);
2452 goto again;
2453 }
2454
2455 /* a final set of sanity checks */
2456#ifdef USE_ASSERT_CHECKING
2457 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2458
2459 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2460 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2461
2463#endif
2464
2465 return buf;
2466}
2467
2468/*
2469 * Return the maximum number of buffers that a backend should try to pin at once,
2470 * to avoid exceeding its fair share. This is the highest value that
2471 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2472 * system with a very small buffer pool relative to max_connections.
2473 */
2474uint32
2476{
2477 return MaxProportionalPins;
2478}
2479
2480/*
2481 * Return the maximum number of additional buffers that this backend should
2482 * pin if it wants to stay under the per-backend limit, considering the number
2483 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2484 * returned by this function can be zero.
2485 */
2486uint32
2488{
2489 uint32 estimated_pins_held;
2490
2491 /*
2492 * We get the number of "overflowed" pins for free, but don't know the
2493 * number of pins in PrivateRefCountArray. The cost of calculating that
2494 * exactly doesn't seem worth it, so just assume the max.
2495 */
2496 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2497
2498 /* Is this backend already holding more than its fair share? */
2499 if (estimated_pins_held > MaxProportionalPins)
2500 return 0;
2501
2502 return MaxProportionalPins - estimated_pins_held;
2503}
2504
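/*
 * Illustrative sketch, not part of bufmgr.c: a prefetching or batching
 * caller might cap its lookahead by the remaining per-backend pin headroom
 * reported above, while still allowing at least one buffer so it can make
 * progress.  The helper name is hypothetical; only GetAdditionalPinLimit()
 * is taken from this file.
 */
static inline uint32
sketch_lookahead_target(uint32 wanted)
{
	uint32		headroom = GetAdditionalPinLimit();

	return Min(wanted, Max(headroom, 1));
}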
2505/*
2506 * Limit the number of pins a batch operation may additionally acquire, to
2507 * avoid running out of pinnable buffers.
2508 *
2509 * One additional pin is always allowed, on the assumption that the operation
2510 * requires at least one to make progress.
2511 */
2512void
2514{
2515 uint32 limit;
2516
2517 if (*additional_pins <= 1)
2518 return;
2519
2520 limit = GetAdditionalPinLimit();
2521 limit = Max(limit, 1);
2522 if (limit < *additional_pins)
2523 *additional_pins = limit;
2524}
2525
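/*
 * Illustrative sketch, not part of bufmgr.c: a bulk operation that would
 * like to pin "desired" additional buffers clamps that number as shown; the
 * clamp never reduces the count below one, mirroring the guarantee
 * documented above.  The helper name is hypothetical.
 */
static inline uint32
sketch_clamp_batch_pins(uint32 desired)
{
	uint32		npins = desired;

	LimitAdditionalPins(&npins);
	return npins;
}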
2526/*
2527 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2528 * avoid duplicating the tracing and relpersistence related logic.
2529 */
2530static BlockNumber
2532 ForkNumber fork,
2533 BufferAccessStrategy strategy,
2534 uint32 flags,
2535 uint32 extend_by,
2536 BlockNumber extend_upto,
2537 Buffer *buffers,
2538 uint32 *extended_by)
2539{
2540 BlockNumber first_block;
2541
2542 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2543 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2544 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2545 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2546 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2547 extend_by);
2548
2549 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2550 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2551 extend_by, extend_upto,
2552 buffers, &extend_by);
2553 else
2554 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2555 extend_by, extend_upto,
2556 buffers, &extend_by);
2557 *extended_by = extend_by;
2558
2559 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2560 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2561 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2562 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2563 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2564 *extended_by,
2565 first_block);
2566
2567 return first_block;
2568}
2569
2570/*
2571 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2572 * shared buffers.
2573 */
2574static BlockNumber
2576 ForkNumber fork,
2577 BufferAccessStrategy strategy,
2578 uint32 flags,
2579 uint32 extend_by,
2580 BlockNumber extend_upto,
2581 Buffer *buffers,
2582 uint32 *extended_by)
2583{
2584 BlockNumber first_block;
2585 IOContext io_context = IOContextForStrategy(strategy);
2586 instr_time io_start;
2587
2588 LimitAdditionalPins(&extend_by);
2589
2590 /*
2591 * Acquire victim buffers for extension without holding extension lock.
2592 * Writing out victim buffers is the most expensive part of extending the
2593 * relation, particularly when doing so requires WAL flushes. Zeroing out
2594 * the buffers is also quite expensive, so do that before holding the
2595 * extension lock as well.
2596 *
2597 * These pages are pinned by us and not valid. While we hold the pin they
2598 * can't be acquired as victim buffers by another backend.
2599 */
2600 for (uint32 i = 0; i < extend_by; i++)
2601 {
2602 Block buf_block;
2603
2604 buffers[i] = GetVictimBuffer(strategy, io_context);
2605 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2606
2607 /* new buffers are zero-filled */
2608 MemSet(buf_block, 0, BLCKSZ);
2609 }
2610
2611 /*
2612 * Lock relation against concurrent extensions, unless requested not to.
2613 *
2614 * We use the same extension lock for all forks. That's unnecessarily
2615 * restrictive, but currently extensions for forks don't happen often
2616 * enough to make it worth locking more granularly.
2617 *
2618 * Note that another backend might have extended the relation by the time
2619 * we get the lock.
2620 */
2621 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2623
2624 /*
2625 * If requested, invalidate size cache, so that smgrnblocks asks the
2626 * kernel.
2627 */
2628 if (flags & EB_CLEAR_SIZE_CACHE)
2629 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2630
2631 first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
2632
2633 /*
2634 * Now that we have the accurate relation size, check if the caller wants
2635 * us to extend only up to a specific size. If there were concurrent
2636 * extensions, we might have acquired too many buffers and need to release
2637 * them.
2638 */
2639 if (extend_upto != InvalidBlockNumber)
2640 {
2641 uint32 orig_extend_by = extend_by;
2642
2643 if (first_block > extend_upto)
2644 extend_by = 0;
2645 else if ((uint64) first_block + extend_by > extend_upto)
2646 extend_by = extend_upto - first_block;
2647
2648 for (uint32 i = extend_by; i < orig_extend_by; i++)
2649 {
2650 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2651
2652 UnpinBuffer(buf_hdr);
2653 }
2654
2655 if (extend_by == 0)
2656 {
2657 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2659 *extended_by = extend_by;
2660 return first_block;
2661 }
2662 }
2663
2664 /* Fail if relation is already at maximum possible length */
2665 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2666 ereport(ERROR,
2667 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2668 errmsg("cannot extend relation %s beyond %u blocks",
2669 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2670 MaxBlockNumber)));
2671
2672 /*
2673 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2674 *
2675 * This needs to happen before we extend the relation, because as soon as
2676 * we do, other backends can start to read in those pages.
2677 */
2678 for (uint32 i = 0; i < extend_by; i++)
2679 {
2680 Buffer victim_buf = buffers[i];
2681 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2682 BufferTag tag;
2683 uint32 hash;
2684 LWLock *partition_lock;
2685 int existing_id;
2686
2687 /* in case we need to pin an existing buffer below */
2690
2691 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2692 first_block + i);
2693 hash = BufTableHashCode(&tag);
2694 partition_lock = BufMappingPartitionLock(hash);
2695
2696 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2697
2698 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2699
2700 /*
2701 * We get here only in the corner case where we are trying to extend
2702 * the relation but we found a pre-existing buffer. This can happen
2703 * because a prior attempt at extending the relation failed, and
2704 * because mdread doesn't complain about reads beyond EOF (when
2705 * zero_damaged_pages is ON) and so a previous attempt to read a block
2706 * beyond EOF could have left a "valid" zero-filled buffer.
2707 *
2708 * This has also been observed when a relation was overwritten by an
2709 * external process. Since the legitimate cases should always have
2710 * left a zero-filled buffer, complain if not PageIsNew.
2711 */
2712 if (existing_id >= 0)
2713 {
2714 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2715 Block buf_block;
2716 bool valid;
2717
2718 /*
2719 * Pin the existing buffer before releasing the partition lock,
2720 * preventing it from being evicted.
2721 */
2722 valid = PinBuffer(existing_hdr, strategy, false);
2723
2724 LWLockRelease(partition_lock);
2725 UnpinBuffer(victim_buf_hdr);
2726
2727 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2728 buf_block = BufHdrGetBlock(existing_hdr);
2729
2730 if (valid && !PageIsNew((Page) buf_block))
2731 ereport(ERROR,
2732 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2733 existing_hdr->tag.blockNum,
2734 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2735
2736 /*
2737 * We *must* do smgr[zero]extend before succeeding, else the page
2738 * will not be reserved by the kernel, and the next P_NEW call
2739 * will decide to return the same page. Clear the BM_VALID bit,
2740 * do StartBufferIO() and proceed.
2741 *
2742 * Loop to handle the very small possibility that someone re-sets
2743 * BM_VALID between our clearing it and StartBufferIO inspecting
2744 * it.
2745 */
2746 do
2747 {
2748 pg_atomic_fetch_and_u32(&existing_hdr->state, ~BM_VALID);
2749 } while (!StartBufferIO(existing_hdr, true, false));
2750 }
2751 else
2752 {
2753 uint32 buf_state;
2754 uint32 set_bits = 0;
2755
2756 buf_state = LockBufHdr(victim_buf_hdr);
2757
2758 /* some sanity checks while we hold the buffer header lock */
2759 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2760 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2761
2762 victim_buf_hdr->tag = tag;
2763
2764 set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2765 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2766 set_bits |= BM_PERMANENT;
2767
2768 UnlockBufHdrExt(victim_buf_hdr, buf_state,
2769 set_bits, 0,
2770 0);
2771
2772 LWLockRelease(partition_lock);
2773
2774 /* XXX: could combine the locked operations in it with the above */
2775 StartBufferIO(victim_buf_hdr, true, false);
2776 }
2777 }
2778
2780
2781 /*
2782 * Note: if smgrzeroextend fails, we will end up with buffers that are
2783 * allocated but not marked BM_VALID. The next relation extension will
2784 * still select the same block number (because the relation didn't get any
2785 * longer on disk) and so future attempts to extend the relation will find
2786 * the same buffers (if they have not been recycled) but come right back
2787 * here to try smgrzeroextend again.
2788 *
2789 * We don't need to set checksum for all-zero pages.
2790 */
2791 smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
2792
2793 /*
2794 * Release the file-extension lock; it's now OK for someone else to extend
2795 * the relation some more.
2796 *
2797 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2798 * take noticeable time.
2799 */
2800 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2802
2804 io_start, 1, extend_by * BLCKSZ);
2805
2806 /* Set BM_VALID, terminate IO, and wake up any waiters */
2807 for (uint32 i = 0; i < extend_by; i++)
2808 {
2809 Buffer buf = buffers[i];
2810 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2811 bool lock = false;
2812
2813 if (flags & EB_LOCK_FIRST && i == 0)
2814 lock = true;
2815 else if (flags & EB_LOCK_TARGET)
2816 {
2817 Assert(extend_upto != InvalidBlockNumber);
2818 if (first_block + i + 1 == extend_upto)
2819 lock = true;
2820 }
2821
2822 if (lock)
2824
2825 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2826 }
2827
2829
2830 *extended_by = extend_by;
2831
2832 return first_block;
2833}
2834
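/*
 * Illustrative sketch, not part of bufmgr.c: the shared-buffer extension
 * path above is normally reached through the public wrappers.  Assuming the
 * ExtendBufferedRelBy()/BMR_REL() interface, a caller adding a few blocks to
 * a relation's main fork could look like this; the function name is
 * hypothetical and error handling is omitted.
 */
static BlockNumber
sketch_extend_main_fork(Relation rel, uint32 nblocks)
{
	Buffer		buffers[64];
	uint32		extended_by = 0;
	BlockNumber first_new_block;

	Assert(nblocks <= lengthof(buffers));

	first_new_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
										  NULL /* default strategy */,
										  EB_LOCK_FIRST,
										  nblocks, buffers, &extended_by);

	/*
	 * buffers[0 .. extended_by - 1] are pinned; with EB_LOCK_FIRST the first
	 * one is also exclusively locked, as in the loop above.
	 */
	return first_new_block;
}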
2835/*
2836 * BufferIsLockedByMe
2837 *
2838 * Checks if this backend has the buffer locked in any mode.
2839 *
2840 * Buffer must be pinned.
2841 */
2842bool
2844{
2845 BufferDesc *bufHdr;
2846
2848
2849 if (BufferIsLocal(buffer))
2850 {
2851 /* Content locks are not maintained for local buffers. */
2852 return true;
2853 }
2854 else
2855 {
2856 bufHdr = GetBufferDescriptor(buffer - 1);
2858 }
2859}
2860
2861/*
2862 * BufferIsLockedByMeInMode
2863 *
2864 * Checks if this backend has the buffer locked in the specified mode.
2865 *
2866 * Buffer must be pinned.
2867 */
2868bool
2870{
2871 BufferDesc *bufHdr;
2872
2874
2875 if (BufferIsLocal(buffer))
2876 {
2877 /* Content locks are not maintained for local buffers. */
2878 return true;
2879 }
2880 else
2881 {
2882 LWLockMode lw_mode;
2883
2884 switch (mode)
2885 {
2887 lw_mode = LW_EXCLUSIVE;
2888 break;
2889 case BUFFER_LOCK_SHARE:
2890 lw_mode = LW_SHARED;
2891 break;
2892 default:
2894 }
2895
2896 bufHdr = GetBufferDescriptor(buffer - 1);
2898 lw_mode);
2899 }
2900}
2901
2902/*
2903 * BufferIsDirty
2904 *
2905 * Checks if buffer is already dirty.
2906 *
2907 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2908 * the result may be stale before it's returned.)
2909 */
2910bool
2912{
2913 BufferDesc *bufHdr;
2914
2916
2917 if (BufferIsLocal(buffer))
2918 {
2919 int bufid = -buffer - 1;
2920
2921 bufHdr = GetLocalBufferDescriptor(bufid);
2922 /* Content locks are not maintained for local buffers. */
2923 }
2924 else
2925 {
2926 bufHdr = GetBufferDescriptor(buffer - 1);
2928 }
2929
2930 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2931}
2932
2933/*
2934 * MarkBufferDirty
2935 *
2936 * Marks buffer contents as dirty (actual write happens later).
2937 *
2938 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2939 * exclusive lock, then somebody could be in process of writing the buffer,
2940 * leading to risk of bad data written to disk.)
2941 */
2942void
2944{
2945 BufferDesc *bufHdr;
2946 uint32 buf_state;
2947 uint32 old_buf_state;
2948
2949 if (!BufferIsValid(buffer))
2950 elog(ERROR, "bad buffer ID: %d", buffer);
2951
2952 if (BufferIsLocal(buffer))
2953 {
2955 return;
2956 }
2957
2958 bufHdr = GetBufferDescriptor(buffer - 1);
2959
2962
2963 /*
2964 * NB: We have to wait until the buffer header spinlock is no longer held, as
2965 * TerminateBufferIO() relies on the spinlock.
2966 */
2967 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2968 for (;;)
2969 {
2970 if (old_buf_state & BM_LOCKED)
2971 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2972
2973 buf_state = old_buf_state;
2974
2975 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2976 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2977
2978 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2979 buf_state))
2980 break;
2981 }
2982
2983 /*
2984 * If the buffer was not dirty already, do vacuum accounting.
2985 */
2986 if (!(old_buf_state & BM_DIRTY))
2987 {
2989 if (VacuumCostActive)
2991 }
2992}
2993
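/*
 * Illustrative sketch, not part of bufmgr.c: the canonical order for a
 * caller that modifies a shared buffer is pin, lock exclusively, modify,
 * MarkBufferDirty(), then unlock and unpin.  WAL logging, which real
 * callers normally perform around the modification, is omitted; the
 * function name is hypothetical.
 */
static void
sketch_modify_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* ... change the page contents via BufferGetPage(buf) ... */

	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}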
2994/*
2995 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2996 *
2997 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2998 * compared to calling the two routines separately. Now it's mainly just
2999 * a convenience function. However, if the passed buffer is valid and
3000 * already contains the desired block, we just return it as-is; and that
3001 * does save considerable work compared to a full release and reacquire.
3002 *
3003 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3004 * buffer actually needs to be released. This case is the same as ReadBuffer,
3005 * but can save some tests in the caller.
3006 */
3007Buffer
3009 Relation relation,
3010 BlockNumber blockNum)
3011{
3012 ForkNumber forkNum = MAIN_FORKNUM;
3013 BufferDesc *bufHdr;
3014
3015 if (BufferIsValid(buffer))
3016 {
3018 if (BufferIsLocal(buffer))
3019 {
3020 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3021 if (bufHdr->tag.blockNum == blockNum &&
3022 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3023 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3024 return buffer;
3026 }
3027 else
3028 {
3029 bufHdr = GetBufferDescriptor(buffer - 1);
3030 /* we have pin, so it's ok to examine tag without spinlock */
3031 if (bufHdr->tag.blockNum == blockNum &&
3032 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3033 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3034 return buffer;
3035 UnpinBuffer(bufHdr);
3036 }
3037 }
3038
3039 return ReadBuffer(relation, blockNum);
3040}
3041
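/*
 * Illustrative sketch, not part of bufmgr.c: ReleaseAndReadBuffer() is handy
 * when stepping through a series of blocks, since it replaces a separate
 * ReleaseBuffer()/ReadBuffer() pair per step and returns the same buffer
 * untouched when the block number repeats.  The function name is
 * hypothetical; per-page content locking is omitted.
 */
static void
sketch_walk_main_fork(Relation rel, BlockNumber start, BlockNumber end)
{
	Buffer		buf = InvalidBuffer;

	for (BlockNumber blkno = start; blkno <= end; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... examine the page under a content lock as needed ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}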
3042/*
3043 * PinBuffer -- make buffer unavailable for replacement.
3044 *
3045 * For the default access strategy, the buffer's usage_count is incremented
3046 * when we first pin it; for other strategies we just make sure the usage_count
3047 * isn't zero. (The idea of the latter is that we don't want synchronized
3048 * heap scans to inflate the count, but we need it to not be zero to discourage
3049 * other backends from stealing buffers from our ring. As long as we cycle
3050 * through the ring faster than the global clock-sweep cycles, buffers in
3051 * our ring won't be chosen as victims for replacement by other backends.)
3052 *
3053 * This should be applied only to shared buffers, never local ones.
3054 *
3055 * Since buffers are pinned/unpinned very frequently, pin buffers without
3056 * taking the buffer header lock; instead update the state variable in a loop of
3057 * CAS operations. Hopefully it's just a single CAS.
3058 *
3059 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3060 * must have been done already.
3061 *
3062 * Returns true if buffer is BM_VALID, else false. This provision allows
3063 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3064 * true, then a false return value also indicates that the buffer was
3065 * (recently) invalid and has not been pinned.
3066 */
3067static bool
3069 bool skip_if_not_valid)
3070{
3072 bool result;
3074
3077
3078 ref = GetPrivateRefCountEntry(b, true);
3079
3080 if (ref == NULL)
3081 {
3082 uint32 buf_state;
3083 uint32 old_buf_state;
3084
3085 old_buf_state = pg_atomic_read_u32(&buf->state);
3086 for (;;)
3087 {
3088 if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
3089 return false;
3090
3091 /*
3092 * We're not allowed to increase the refcount while the buffer
3093 * header spinlock is held. Wait for the lock to be released.
3094 */
3095 if (old_buf_state & BM_LOCKED)
3096 old_buf_state = WaitBufHdrUnlocked(buf);
3097
3098 buf_state = old_buf_state;
3099
3100 /* increase refcount */
3101 buf_state += BUF_REFCOUNT_ONE;
3102
3103 if (strategy == NULL)
3104 {
3105 /* Default case: increase usagecount unless already max. */
3107 buf_state += BUF_USAGECOUNT_ONE;
3108 }
3109 else
3110 {
3111 /*
3112 * Ring buffers shouldn't evict others from the pool. Thus we
3113 * don't make usagecount more than 1.
3114 */
3115 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3116 buf_state += BUF_USAGECOUNT_ONE;
3117 }
3118
3119 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3120 buf_state))
3121 {
3122 result = (buf_state & BM_VALID) != 0;
3123
3125 break;
3126 }
3127 }
3128 }
3129 else
3130 {
3131 /*
3132 * If we previously pinned the buffer, it is likely to be valid, but
3133 * it may not be if StartReadBuffers() was called and
3134 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3135 * the flags without locking. This is racy, but it's OK to return
3136 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3137 * it'll see that it's now valid.
3138 *
3139 * Note: We deliberately avoid a Valgrind client request here.
3140 * Individual access methods can optionally superimpose buffer page
3141 * client requests on top of our client requests to enforce that
3142 * buffers are only accessed while locked (and pinned). It's possible
3143 * that the buffer page is legitimately non-accessible here. We
3144 * cannot meddle with that.
3145 */
3146 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3147
3148 Assert(ref->refcount > 0);
3149 ref->refcount++;
3151 }
3152
3153 return result;
3154}
3155
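/*
 * Aside on the retry loop above (descriptive only): it relies on
 * pg_atomic_compare_exchange_u32() writing the currently observed value back
 * into old_buf_state when the exchange fails, so every iteration recomputes
 * the desired state from a fresh snapshot and the loop ends as soon as one
 * CAS succeeds.
 */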
3156/*
3157 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3158 * The spinlock is released before return.
3159 *
3160 * As this function is called with the spinlock held, the caller has to
3161 * previously call ReservePrivateRefCountEntry() and
3162 * ResourceOwnerEnlarge(CurrentResourceOwner);
3163 *
3164 * Currently, no callers of this function want to modify the buffer's
3165 * usage_count at all, so there's no need for a strategy parameter.
3166 * Also we don't bother with a BM_VALID test (the caller could check that for
3167 * itself).
3168 *
3169 * Also all callers only ever use this function when it's known that the
3170 * buffer can't have a preexisting pin by this backend. That allows us to skip
3171 * searching the private refcount array & hash, which is a boon, because the
3172 * spinlock is still held.
3173 *
3174 * Note: use of this routine is frequently mandatory, not just an optimization
3175 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3176 * its state can change under us.
3177 */
3178static void
3180{
3181 uint32 old_buf_state;
3182
3183 /*
3184 * As explained, we don't expect any preexisting pins. That allows us to
3185 * manipulate the PrivateRefCount after releasing the spinlock.
3186 */
3188
3189 /*
3190 * Since we hold the buffer spinlock, we can update the buffer state and
3191 * release the lock in one operation.
3192 */
3193 old_buf_state = pg_atomic_read_u32(&buf->state);
3194
3195 UnlockBufHdrExt(buf, old_buf_state,
3196 0, 0, 1);
3197
3199}
3200
3201/*
3202 * Support for waking up another backend that is waiting for the cleanup lock
3203 * to be released using BM_PIN_COUNT_WAITER.
3204 *
3205 * See LockBufferForCleanup().
3206 *
3207 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3208 * not just reducing the backend-local pincount for the buffer).
3209 */
3210static void
3212{
3213 /*
3214 * Acquire the buffer header lock, re-check that there's a waiter. Another
3215 * backend could have unpinned this buffer, and already woken up the
3216 * waiter.
3217 *
3218 * There's no danger of the buffer being replaced after we unpinned it
3219 * above, as it's pinned by the waiter. The waiter removes
3220 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3221 * backend waking it up.
3222 */
3223 uint32 buf_state = LockBufHdr(buf);
3224
3225 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3226 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3227 {
3228 /* we just released the last pin other than the waiter's */
3229 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3230
3231 UnlockBufHdrExt(buf, buf_state,
3233 0);
3234 ProcSendSignal(wait_backend_pgprocno);
3235 }
3236 else
3238}
3239
3240/*
3241 * UnpinBuffer -- make buffer available for replacement.
3242 *
3243 * This should be applied only to shared buffers, never local ones. This
3244 * always adjusts CurrentResourceOwner.
3245 */
3246static void
3248{
3250
3253}
3254
3255static void
3257{
3260
3262
3263 /* not moving as we're likely deleting it soon anyway */
3264 ref = GetPrivateRefCountEntry(b, false);
3265 Assert(ref != NULL);
3266 Assert(ref->refcount > 0);
3267 ref->refcount--;
3268 if (ref->refcount == 0)
3269 {
3270 uint32 old_buf_state;
3271
3272 /*
3273 * Mark buffer non-accessible to Valgrind.
3274 *
3275 * Note that the buffer may have already been marked non-accessible
3276 * within access method code that enforces that buffers are only
3277 * accessed while a buffer lock is held.
3278 */
3280
3281 /*
3282 * I'd better not still hold the buffer content lock. Can't use
3283 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3284 */
3286
3287 /* decrement the shared reference count */
3288 old_buf_state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
3289
3290 /* Support LockBufferForCleanup() */
3291 if (old_buf_state & BM_PIN_COUNT_WAITER)
3293
3295 }
3296}
3297
3298/*
3299 * Set up backend-local tracking of a buffer pinned the first time by this
3300 * backend.
3301 */
3302inline void
3304{
3306
3308 ref->refcount++;
3309
3311
3312 /*
3313 * This is the first pin for this page by this backend, mark its page as
3314 * defined to valgrind. While the page contents might not actually be
3315 * valid yet, we don't currently guarantee that such pages are marked
3316 * undefined or non-accessible.
3317 *
3318 * It's not necessarily the prettiest to do this here, but otherwise we'd
3319 * need this block of code in multiple places.
3320 */
3322 BLCKSZ);
3323}
3324
3325#define ST_SORT sort_checkpoint_bufferids
3326#define ST_ELEMENT_TYPE CkptSortItem
3327#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3328#define ST_SCOPE static
3329#define ST_DEFINE
3330#include "lib/sort_template.h"
3331
3332/*
3333 * BufferSync -- Write out all dirty buffers in the pool.
3334 *
3335 * This is called at checkpoint time to write out all dirty shared buffers.
3336 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3337 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3338 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3339 * even unlogged buffers, which are otherwise skipped. The remaining flags
3340 * currently have no effect here.
3341 */
3342static void
3343BufferSync(int flags)
3344{
3345 uint32 buf_state;
3346 int buf_id;
3347 int num_to_scan;
3348 int num_spaces;
3349 int num_processed;
3350 int num_written;
3351 CkptTsStatus *per_ts_stat = NULL;
3352 Oid last_tsid;
3353 binaryheap *ts_heap;
3354 int i;
3355 uint32 mask = BM_DIRTY;
3356 WritebackContext wb_context;
3357
3358 /*
3359 * Unless this is a shutdown checkpoint or we have been explicitly told otherwise,
3360 * we write only permanent, dirty buffers. But at shutdown or end of
3361 * recovery, we write all dirty buffers.
3362 */
3365 mask |= BM_PERMANENT;
3366
3367 /*
3368 * Loop over all buffers, and mark the ones that need to be written with
3369 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3370 * can estimate how much work needs to be done.
3371 *
3372 * This allows us to write only those pages that were dirty when the
3373 * checkpoint began, and not those that get dirtied while it proceeds.
3374 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3375 * later in this function, or by normal backends or the bgwriter cleaning
3376 * scan, the flag is cleared. Any buffer dirtied after this point won't
3377 * have the flag set.
3378 *
3379 * Note that if we fail to write some buffer, we may leave buffers with
3380 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3381 * certainly need to be written for the next checkpoint attempt, too.
3382 */
3383 num_to_scan = 0;
3384 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3385 {
3386 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3387 uint32 set_bits = 0;
3388
3389 /*
3390 * Header spinlock is enough to examine BM_DIRTY, see comment in
3391 * SyncOneBuffer.
3392 */
3393 buf_state = LockBufHdr(bufHdr);
3394
3395 if ((buf_state & mask) == mask)
3396 {
3397 CkptSortItem *item;
3398
3399 set_bits = BM_CHECKPOINT_NEEDED;
3400
3401 item = &CkptBufferIds[num_to_scan++];
3402 item->buf_id = buf_id;
3403 item->tsId = bufHdr->tag.spcOid;
3404 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3405 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3406 item->blockNum = bufHdr->tag.blockNum;
3407 }
3408
3409 UnlockBufHdrExt(bufHdr, buf_state,
3410 set_bits, 0,
3411 0);
3412
3413 /* Check for barrier events in case NBuffers is large. */
3416 }
3417
3418 if (num_to_scan == 0)
3419 return; /* nothing to do */
3420
3422
3423 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3424
3425 /*
3426 * Sort buffers that need to be written to reduce the likelihood of random
3427 * IO. The sorting is also important for the implementation of balancing
3428 * writes between tablespaces. Without balancing writes we'd potentially
3429 * end up writing to the tablespaces one-by-one, possibly overloading the
3430 * underlying system.
3431 */
3432 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3433
3434 num_spaces = 0;
3435
3436 /*
3437 * Allocate progress status for each tablespace with buffers that need to
3438 * be flushed. This requires the to-be-flushed array to be sorted.
3439 */
3440 last_tsid = InvalidOid;
3441 for (i = 0; i < num_to_scan; i++)
3442 {
3443 CkptTsStatus *s;
3444 Oid cur_tsid;
3445
3446 cur_tsid = CkptBufferIds[i].tsId;
3447
3448 /*
3449 * Grow array of per-tablespace status structs, every time a new
3450 * tablespace is found.
3451 */
3452 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3453 {
3454 Size sz;
3455
3456 num_spaces++;
3457
3458 /*
3459 * Not worth adding grow-by-power-of-2 logic here - even with a
3460 * few hundred tablespaces this should be fine.
3461 */
3462 sz = sizeof(CkptTsStatus) * num_spaces;
3463
3464 if (per_ts_stat == NULL)
3465 per_ts_stat = (CkptTsStatus *) palloc(sz);
3466 else
3467 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3468
3469 s = &per_ts_stat[num_spaces - 1];
3470 memset(s, 0, sizeof(*s));
3471 s->tsId = cur_tsid;
3472
3473 /*
3474 * The first buffer in this tablespace. As CkptBufferIds is sorted
3475 * by tablespace all (s->num_to_scan) buffers in this tablespace
3476 * will follow afterwards.
3477 */
3478 s->index = i;
3479
3480 /*
3481 * progress_slice will be determined once we know how many buffers
3482 * are in each tablespace, i.e. after this loop.
3483 */
3484
3485 last_tsid = cur_tsid;
3486 }
3487 else
3488 {
3489 s = &per_ts_stat[num_spaces - 1];
3490 }
3491
3492 s->num_to_scan++;
3493
3494 /* Check for barrier events. */
3497 }
3498
3499 Assert(num_spaces > 0);
3500
3501 /*
3502 * Build a min-heap over the write-progress in the individual tablespaces,
3503 * and compute how large a portion of the total progress a single
3504 * processed buffer is.
3505 */
3506 ts_heap = binaryheap_allocate(num_spaces,
3508 NULL);
3509
3510 for (i = 0; i < num_spaces; i++)
3511 {
3512 CkptTsStatus *ts_stat = &per_ts_stat[i];
3513
3514 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3515
3516 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3517 }
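	/*
	 * Worked example of the slice arithmetic above (illustrative): with
	 * num_to_scan = 1000 buffers overall, a tablespace holding 250 of them
	 * gets a progress_slice of 4.0, so each buffer it processes advances its
	 * progress by 4 units and every tablespace runs from 0 to 1000 on the
	 * same scale regardless of how many buffers it contains.
	 */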
3518
3519 binaryheap_build(ts_heap);
3520
3521 /*
3522 * Iterate through to-be-checkpointed buffers and write the ones (still)
3523 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3524 * tablespaces; otherwise the sorting would lead to only one tablespace
3525 * receiving writes at a time, making inefficient use of the hardware.
3526 */
3527 num_processed = 0;
3528 num_written = 0;
3529 while (!binaryheap_empty(ts_heap))
3530 {
3531 BufferDesc *bufHdr = NULL;
3532 CkptTsStatus *ts_stat = (CkptTsStatus *)
3534
3535 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3536 Assert(buf_id != -1);
3537
3538 bufHdr = GetBufferDescriptor(buf_id);
3539
3540 num_processed++;
3541
3542 /*
3543 * We don't need to acquire the lock here, because we're only looking
3544 * at a single bit. It's possible that someone else writes the buffer
3545 * and clears the flag right after we check, but that doesn't matter
3546 * since SyncOneBuffer will then do nothing. However, there is a
3547 * further race condition: it's conceivable that between the time we
3548 * examine the bit here and the time SyncOneBuffer acquires the lock,
3549 * someone else not only wrote the buffer but replaced it with another
3550 * page and dirtied it. In that improbable case, SyncOneBuffer will
3551 * write the buffer though we didn't need to. It doesn't seem worth
3552 * guarding against this, though.
3553 */
3555 {
3556 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3557 {
3558 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3560 num_written++;
3561 }
3562 }
3563
3564 /*
3565 * Measure progress independently of actually having to flush the buffer
3566 * - otherwise writing becomes unbalanced.
3567 */
3568 ts_stat->progress += ts_stat->progress_slice;
3569 ts_stat->num_scanned++;
3570 ts_stat->index++;
3571
3572 /* Have all the buffers from the tablespace been processed? */
3573 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3574 {
3575 binaryheap_remove_first(ts_heap);
3576 }
3577 else
3578 {
3579 /* update heap with the new progress */
3580 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3581 }
3582
3583 /*
3584 * Sleep to throttle our I/O rate.
3585 *
3586 * (This will check for barrier events even if it doesn't sleep.)
3587 */
3588 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3589 }
3590
3591 /*
3592 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3593 * IOContext will always be IOCONTEXT_NORMAL.
3594 */
3596
3597 pfree(per_ts_stat);
3598 per_ts_stat = NULL;
3599 binaryheap_free(ts_heap);
3600
3601 /*
3602 * Update checkpoint statistics. As noted above, this doesn't include
3603 * buffers written by other backends or bgwriter scan.
3604 */
3605 CheckpointStats.ckpt_bufs_written += num_written;
3606
3607 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3608}
3609
3610/*
3611 * BgBufferSync -- Write out some dirty buffers in the pool.
3612 *
3613 * This is called periodically by the background writer process.
3614 *
3615 * Returns true if it's appropriate for the bgwriter process to go into
3616 * low-power hibernation mode. (This happens if the strategy clock-sweep
3617 * has been "lapped" and no buffer allocations have occurred recently,
3618 * or if the bgwriter has been effectively disabled by setting
3619 * bgwriter_lru_maxpages to 0.)
3620 */
3621bool
3623{
3624 /* info obtained from freelist.c */
3625 int strategy_buf_id;
3626 uint32 strategy_passes;
3627 uint32 recent_alloc;
3628
3629 /*
3630 * Information saved between calls so we can determine the strategy
3631 * point's advance rate and avoid scanning already-cleaned buffers.
3632 */
3633 static bool saved_info_valid = false;
3634 static int prev_strategy_buf_id;
3635 static uint32 prev_strategy_passes;
3636 static int next_to_clean;
3637 static uint32 next_passes;
3638
3639 /* Moving averages of allocation rate and clean-buffer density */
3640 static float smoothed_alloc = 0;
3641 static float smoothed_density = 10.0;
3642
3643 /* Potentially these could be tunables, but for now, not */
3644 float smoothing_samples = 16;
3645 float scan_whole_pool_milliseconds = 120000.0;
3646
3647 /* Used to compute how far we scan ahead */
3648 long strategy_delta;
3649 int bufs_to_lap;
3650 int bufs_ahead;
3651 float scans_per_alloc;
3652 int reusable_buffers_est;
3653 int upcoming_alloc_est;
3654 int min_scan_buffers;
3655
3656 /* Variables for the scanning loop proper */
3657 int num_to_scan;
3658 int num_written;
3659 int reusable_buffers;
3660
3661 /* Variables for final smoothed_density update */
3662 long new_strategy_delta;
3663 uint32 new_recent_alloc;
3664
3665 /*
3666 * Find out where the clock-sweep currently is, and how many buffer
3667 * allocations have happened since our last call.
3668 */
3669 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3670
3671 /* Report buffer alloc counts to pgstat */
3672 PendingBgWriterStats.buf_alloc += recent_alloc;
3673
3674 /*
3675 * If we're not running the LRU scan, just stop after doing the stats
3676 * stuff. We mark the saved state invalid so that we can recover sanely
3677 * if LRU scan is turned back on later.
3678 */
3679 if (bgwriter_lru_maxpages <= 0)
3680 {
3681 saved_info_valid = false;
3682 return true;
3683 }
3684
3685 /*
3686 * Compute strategy_delta = how many buffers have been scanned by the
3687 * clock-sweep since last time. If first time through, assume none. Then
3688 * see if we are still ahead of the clock-sweep, and if so, how many
3689 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3690 * the weird-looking coding of xxx_passes comparisons is to avoid bogus
3691 * behavior when the passes counts wrap around.
3692 */
3693 if (saved_info_valid)
3694 {
3695 int32 passes_delta = strategy_passes - prev_strategy_passes;
3696
3697 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3698 strategy_delta += (long) passes_delta * NBuffers;
3699
3700 Assert(strategy_delta >= 0);
3701
3702 if ((int32) (next_passes - strategy_passes) > 0)
3703 {
3704 /* we're one pass ahead of the strategy point */
3705 bufs_to_lap = strategy_buf_id - next_to_clean;
3706#ifdef BGW_DEBUG
3707 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3708 next_passes, next_to_clean,
3709 strategy_passes, strategy_buf_id,
3710 strategy_delta, bufs_to_lap);
3711#endif
3712 }
3713 else if (next_passes == strategy_passes &&
3714 next_to_clean >= strategy_buf_id)
3715 {
3716 /* on same pass, but ahead or at least not behind */
3717 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3718#ifdef BGW_DEBUG
3719 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3720 next_passes, next_to_clean,
3721 strategy_passes, strategy_buf_id,
3722 strategy_delta, bufs_to_lap);
3723#endif
3724 }
3725 else
3726 {
3727 /*
3728 * We're behind, so skip forward to the strategy point and start
3729 * cleaning from there.
3730 */
3731#ifdef BGW_DEBUG
3732 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3733 next_passes, next_to_clean,
3734 strategy_passes, strategy_buf_id,
3735 strategy_delta);
3736#endif
3737 next_to_clean = strategy_buf_id;
3738 next_passes = strategy_passes;
3739 bufs_to_lap = NBuffers;
3740 }
3741 }
3742 else
3743 {
3744 /*
3745 * Initializing at startup or after LRU scanning had been off. Always
3746 * start at the strategy point.
3747 */
3748#ifdef BGW_DEBUG
3749 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3750 strategy_passes, strategy_buf_id);
3751#endif
3752 strategy_delta = 0;
3753 next_to_clean = strategy_buf_id;
3754 next_passes = strategy_passes;
3755 bufs_to_lap = NBuffers;
3756 }
3757
3758 /* Update saved info for next time */
3759 prev_strategy_buf_id = strategy_buf_id;
3760 prev_strategy_passes = strategy_passes;
3761 saved_info_valid = true;
3762
3763 /*
3764 * Compute how many buffers had to be scanned for each new allocation, ie,
3765 * 1/density of reusable buffers, and track a moving average of that.
3766 *
3767 * If the strategy point didn't move, we don't update the density estimate.
3768 */
3769 if (strategy_delta > 0 && recent_alloc > 0)
3770 {
3771 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3772 smoothed_density += (scans_per_alloc - smoothed_density) /
3773 smoothing_samples;
3774 }
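	/*
	 * Illustrative arithmetic for the smoothing above: with
	 * smoothing_samples = 16, a current smoothed_density of 10.0 and a new
	 * scans_per_alloc sample of 26.0 yield 10.0 + (26.0 - 10.0) / 16 = 11.0,
	 * i.e. each call folds in one sixteenth of the difference between the
	 * latest sample and the running average.
	 */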
3775
3776 /*
3777 * Estimate how many reusable buffers there are between the current
3778 * strategy point and where we've scanned ahead to, based on the smoothed
3779 * density estimate.
3780 */
3781 bufs_ahead = NBuffers - bufs_to_lap;
3782 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3783
3784 /*
3785 * Track a moving average of recent buffer allocations. Here, rather than
3786 * a true average we want a fast-attack, slow-decline behavior: we
3787 * immediately follow any increase.
3788 */
3789 if (smoothed_alloc <= (float) recent_alloc)
3790 smoothed_alloc = recent_alloc;
3791 else
3792 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3793 smoothing_samples;
3794
3795 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3796 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3797
3798 /*
3799 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3800 * eventually underflow to zero, and the underflows produce annoying
3801 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3802 * zero, there's no point in tracking smaller and smaller values of
3803 * smoothed_alloc, so just reset it to exactly zero to avoid this
3804 * syndrome. It will pop back up as soon as recent_alloc increases.
3805 */
3806 if (upcoming_alloc_est == 0)
3807 smoothed_alloc = 0;
3808
3809 /*
3810 * Even in cases where there's been little or no buffer allocation
3811 * activity, we want to make a small amount of progress through the buffer
3812 * cache so that as many reusable buffers as possible are clean after an
3813 * idle period.
3814 *
3815 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3816 * the BGW will be called during the scan_whole_pool time; slice the
3817 * buffer pool into that many sections.
3818 */
3819 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3820
3821 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3822 {
3823#ifdef BGW_DEBUG
3824 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3825 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3826#endif
3827 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3828 }
3829
3830 /*
3831 * Now write out dirty reusable buffers, working forward from the
3832 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3833 * enough buffers to match our estimate of the next cycle's allocation
3834 * requirements, or hit the bgwriter_lru_maxpages limit.
3835 */
3836
3837 num_to_scan = bufs_to_lap;
3838 num_written = 0;
3839 reusable_buffers = reusable_buffers_est;
3840
3841 /* Execute the LRU scan */
3842 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3843 {
3844 int sync_state = SyncOneBuffer(next_to_clean, true,
3845 wb_context);
3846
3847 if (++next_to_clean >= NBuffers)
3848 {
3849 next_to_clean = 0;
3850 next_passes++;
3851 }
3852 num_to_scan--;
3853
3854 if (sync_state & BUF_WRITTEN)
3855 {
3856 reusable_buffers++;
3857 if (++num_written >= bgwriter_lru_maxpages)
3858 {
3860 break;
3861 }
3862 }
3863 else if (sync_state & BUF_REUSABLE)
3864 reusable_buffers++;
3865 }
3866
3868
3869#ifdef BGW_DEBUG
3870 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3871 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3872 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3873 bufs_to_lap - num_to_scan,
3874 num_written,
3875 reusable_buffers - reusable_buffers_est);
3876#endif
3877
3878 /*
3879 * Consider the above scan as being like a new allocation scan.
3880 * Characterize its density and update the smoothed one based on it. This
3881 * effectively halves the moving average period in cases where both the
3882 * strategy and the background writer are doing some useful scanning,
3883 * which is helpful because a long memory isn't as desirable on the
3884 * density estimates.
3885 */
3886 new_strategy_delta = bufs_to_lap - num_to_scan;
3887 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3888 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3889 {
3890 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3891 smoothed_density += (scans_per_alloc - smoothed_density) /
3892 smoothing_samples;
3893
3894#ifdef BGW_DEBUG
3895 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3896 new_recent_alloc, new_strategy_delta,
3897 scans_per_alloc, smoothed_density);
3898#endif
3899 }
3900
3901 /* Return true if OK to hibernate */
3902 return (bufs_to_lap == 0 && recent_alloc == 0);
3903}
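/*
 * Editor's note (illustrative only, not part of bufmgr.c): a small worked
 * example of the pacing arithmetic above, under assumed settings.  With
 * NBuffers = 16384, BgWriterDelay = 200 ms and a whole-pool scan target of
 * 120000 ms (two minutes), the idle-progress floor is
 *
 *		min_scan_buffers = 16384 / (120000 / 200) = 27 buffers per round.
 *
 * If the smoothed allocation rate works out to 500 buffers per round and
 * bgwriter_lru_multiplier is 2.0, then upcoming_alloc_est = 1000, so the
 * LRU scan keeps going until it believes roughly 1000 reusable buffers lie
 * between the cleaning point and the strategy point, or it laps the
 * strategy point, or it hits bgwriter_lru_maxpages.
 */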
3904
3905/*
3906 * SyncOneBuffer -- process a single buffer during syncing.
3907 *
3908 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3909 * buffers marked recently used, as these are not replacement candidates.
3910 *
3911 * Returns a bitmask containing the following flag bits:
3912 * BUF_WRITTEN: we wrote the buffer.
3913 * BUF_REUSABLE: buffer is available for replacement, ie, it has
3914 * pin count 0 and usage count 0.
3915 *
3916 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3917 * after locking it, but we don't care all that much.)
3918 */
3919static int
3920SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3921{
3922 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3923 int result = 0;
3924 uint32 buf_state;
3925 BufferTag tag;
3926
3927 /* Make sure we can handle the pin */
3930
3931 /*
3932 * Check whether buffer needs writing.
3933 *
3934 * We can make this check without taking the buffer content lock so long
3935 * as we mark pages dirty in access methods *before* logging changes with
3936 * XLogInsert(): if someone marks the buffer dirty just after our check we
3937 * don't worry, because our checkpoint.redo points before the log record
3938 * for the upcoming changes, so we are not required to write such a dirty buffer.
3939 */
3940 buf_state = LockBufHdr(bufHdr);
3941
3942 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3943 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3944 {
3945 result |= BUF_REUSABLE;
3946 }
3947 else if (skip_recently_used)
3948 {
3949 /* Caller told us not to write recently-used buffers */
3950 UnlockBufHdr(bufHdr);
3951 return result;
3952 }
3953
3954 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3955 {
3956 /* It's clean, so nothing to do */
3957 UnlockBufHdr(bufHdr);
3958 return result;
3959 }
3960
3961 /*
3962 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3963 * buffer is clean by the time we've locked it.)
3964 */
3965 PinBuffer_Locked(bufHdr);
3966
3968
3969 tag = bufHdr->tag;
3970
3971 UnpinBuffer(bufHdr);
3972
3973 /*
3974 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3975 * IOContext will always be IOCONTEXT_NORMAL.
3976 */
3978
3979 return result | BUF_WRITTEN;
3980}
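/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): how a caller
 * such as the LRU scan in BgBufferSync() can interpret SyncOneBuffer()'s
 * result bits.  The counter names here are hypothetical.
 *
 *		int		flags = SyncOneBuffer(buf_id, true, wb_context);
 *
 *		if (flags & BUF_WRITTEN)
 *			writes_this_round++;		(a dirty page was handed to the kernel)
 *		if (flags & BUF_REUSABLE)
 *			reusable_this_round++;		(pin count 0 and usage count 0)
 *
 * With skip_recently_used = true, a result of zero means the buffer was
 * skipped because it is pinned or marked recently used.
 */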
3981
3982/*
3983 * AtEOXact_Buffers - clean up at end of transaction.
3984 *
3985 * As of PostgreSQL 8.0, buffer pins should get released by the
3986 * ResourceOwner mechanism. This routine is just a debugging
3987 * cross-check that no pins remain.
3988 */
3989void
3990AtEOXact_Buffers(bool isCommit)
3991{
3993
3994 AtEOXact_LocalBuffers(isCommit);
3995
3997}
3998
3999/*
4000 * Initialize access to shared buffer pool
4001 *
4002 * This is called during backend startup (whether standalone or under the
4003 * postmaster). It sets up for this backend's access to the already-existing
4004 * buffer pool.
4005 */
4006void
4008{
4009 HASHCTL hash_ctl;
4010
4011 /*
4012 * An advisory limit on the number of pins each backend should hold, based
4013 * on shared_buffers and the maximum number of connections possible.
4014 * That's very pessimistic, but outside toy-sized shared_buffers it should
4015 * allow plenty of pins. LimitAdditionalPins() and
4016 * GetAdditionalPinLimit() can be used to check the remaining balance.
4017 */
4019
4020 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4021
4022 hash_ctl.keysize = sizeof(int32);
4023 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4024
4025 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4027
4028 /*
4029 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4030 * the corresponding phase of backend shutdown.
4031 */
4032 Assert(MyProc != NULL);
4034}
4035
4036/*
4037 * During backend exit, ensure that we released all shared-buffer locks and
4038 * assert that we have no remaining pins.
4039 */
4040static void
4042{
4043 UnlockBuffers();
4044
4046
4047 /* localbuf.c needs a chance too */
4049}
4050
4051/*
4052 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4053 *
4054 * As of PostgreSQL 8.0, buffer pins should get released by the
4055 * ResourceOwner mechanism. This routine is just a debugging
4056 * cross-check that no pins remain.
4057 */
4058static void
4060{
4061#ifdef USE_ASSERT_CHECKING
4062 int RefCountErrors = 0;
4064 int i;
4065 char *s;
4066
4067 /* check the array */
4068 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4069 {
4070 res = &PrivateRefCountArray[i];
4071
4072 if (res->buffer != InvalidBuffer)
4073 {
4075 elog(WARNING, "buffer refcount leak: %s", s);
4076 pfree(s);
4077
4078 RefCountErrors++;
4079 }
4080 }
4081
4082 /* if necessary search the hash */
4084 {
4085 HASH_SEQ_STATUS hstat;
4086
4088 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4089 {
4091 elog(WARNING, "buffer refcount leak: %s", s);
4092 pfree(s);
4093 RefCountErrors++;
4094 }
4095 }
4096
4097 Assert(RefCountErrors == 0);
4098#endif
4099}
4100
4101#ifdef USE_ASSERT_CHECKING
4102/*
4103 * Check for exclusive-locked catalog buffers. This is the core of
4104 * AssertCouldGetRelation().
4105 *
4106 * A backend would self-deadlock on LWLocks if the catalog scan read the
4107 * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4108 * catalogs used in relcache, because a catcache search on any catalog may
4109 * build that catalog's relcache entry. We don't have an inventory of
4110 * catalogs relcache uses, so just check buffers of most catalogs.
4111 *
4112 * It's better to minimize waits while holding an exclusive buffer lock, so it
4113 * would be nice to broaden this check not to be catalog-specific. However,
4114 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4115 * read tables. That is deadlock-free as long as there's no loop in the
4116 * dependency graph: modifying table A may cause an opclass to read table B,
4117 * but it must not cause a read of table A.
4118 */
4119void
4120AssertBufferLocksPermitCatalogRead(void)
4121{
4122 ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4123}
4124
4125static void
4126AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4127 void *unused_context)
4128{
4129 BufferDesc *bufHdr;
4130 BufferTag tag;
4131 Oid relid;
4132
4133 if (mode != LW_EXCLUSIVE)
4134 return;
4135
4136 if (!((BufferDescPadded *) lock > BufferDescriptors &&
4138 return; /* not a buffer lock */
4139
4140 bufHdr = (BufferDesc *)
4141 ((char *) lock - offsetof(BufferDesc, content_lock));
4142 tag = bufHdr->tag;
4143
4144 /*
4145 * This relNumber==relid assumption holds until a catalog experiences
4146 * VACUUM FULL or similar. After a command like that, relNumber will be
4147 * in the normal (non-catalog) range, and we lose the ability to detect
4148 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4149 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4150 * held lock.
4151 */
4152 relid = tag.relNumber;
4153
4154 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4155 return;
4156
4158}
4159#endif
4160
4161
4162/*
4163 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4164 */
4165char *
4167{
4168 BufferDesc *buf;
4169 int32 loccount;
4170 char *result;
4171 ProcNumber backend;
4172 uint32 buf_state;
4173
4175 if (BufferIsLocal(buffer))
4176 {
4178 loccount = LocalRefCount[-buffer - 1];
4179 backend = MyProcNumber;
4180 }
4181 else
4182 {
4184 loccount = GetPrivateRefCount(buffer);
4185 backend = INVALID_PROC_NUMBER;
4186 }
4187
4188 /* theoretically we should lock the bufhdr here */
4189 buf_state = pg_atomic_read_u32(&buf->state);
4190
4191 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4192 buffer,
4194 BufTagGetForkNum(&buf->tag)).str,
4195 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4196 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4197 return result;
4198}
4199
4200/*
4201 * CheckPointBuffers
4202 *
4203 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4204 *
4205 * Note: temporary relations do not participate in checkpoints, so they don't
4206 * need to be flushed.
4207 */
4208void
4210{
4211 BufferSync(flags);
4212}
4213
4214/*
4215 * BufferGetBlockNumber
4216 * Returns the block number associated with a buffer.
4217 *
4218 * Note:
4219 * Assumes that the buffer is valid and pinned, else the
4220 * value may be obsolete immediately...
4221 */
4224{
4225 BufferDesc *bufHdr;
4226
4228
4229 if (BufferIsLocal(buffer))
4230 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4231 else
4232 bufHdr = GetBufferDescriptor(buffer - 1);
4233
4234 /* pinned, so OK to read tag without spinlock */
4235 return bufHdr->tag.blockNum;
4236}
4237
4238/*
4239 * BufferGetTag
4240 * Returns the relfilelocator, fork number and block number associated with
4241 * a buffer.
4242 */
4243void
4245 BlockNumber *blknum)
4246{
4247 BufferDesc *bufHdr;
4248
4249 /* Do the same checks as BufferGetBlockNumber. */
4251
4252 if (BufferIsLocal(buffer))
4253 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4254 else
4255 bufHdr = GetBufferDescriptor(buffer - 1);
4256
4257 /* pinned, so OK to read tag without spinlock */
4258 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4259 *forknum = BufTagGetForkNum(&bufHdr->tag);
4260 *blknum = bufHdr->tag.blockNum;
4261}
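/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): typical use of
 * the two lookup routines above on a buffer the caller already has pinned.
 *
 *		RelFileLocator rlocator;
 *		ForkNumber	forknum;
 *		BlockNumber blkno;
 *
 *		blkno = BufferGetBlockNumber(buf);					(block number only)
 *		BufferGetTag(buf, &rlocator, &forknum, &blkno);		(full identity)
 *
 * Both rely on the pin to keep the tag stable; without it the answers could
 * describe whatever page the buffer is recycled for next.
 */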
4262
4263/*
4264 * FlushBuffer
4265 * Physically write out a shared buffer.
4266 *
4267 * NOTE: this actually just passes the buffer contents to the kernel; the
4268 * real write to disk won't happen until the kernel feels like it. This
4269 * is okay from our point of view since we can redo the changes from WAL.
4270 * However, we will need to force the changes to disk via fsync before
4271 * we can checkpoint WAL.
4272 *
4273 * The caller must hold a pin on the buffer and have share-locked the
4274 * buffer contents. (Note: a share-lock does not prevent updates of
4275 * hint bits in the buffer, so the page could change while the write
4276 * is in progress, but we assume that that will not invalidate the data
4277 * written.)
4278 *
4279 * If the caller has an smgr reference for the buffer's relation, pass it
4280 * as the second parameter. If not, pass NULL.
4281 */
4282static void
4284 IOContext io_context)
4285{
4286 XLogRecPtr recptr;
4287 ErrorContextCallback errcallback;
4288 instr_time io_start;
4289 Block bufBlock;
4290 char *bufToWrite;
4291 uint32 buf_state;
4292
4293 /*
4294 * Try to start an I/O operation. If StartBufferIO returns false, then
4295 * someone else flushed the buffer before we could, so we need not do
4296 * anything.
4297 */
4298 if (!StartBufferIO(buf, false, false))
4299 return;
4300
4301 /* Setup error traceback support for ereport() */
4303 errcallback.arg = buf;
4304 errcallback.previous = error_context_stack;
4305 error_context_stack = &errcallback;
4306
4307 /* Find smgr relation for buffer */
4308 if (reln == NULL)
4310
4311 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4312 buf->tag.blockNum,
4316
4317 buf_state = LockBufHdr(buf);
4318
4319 /*
4320 * Run PageGetLSN while holding header lock, since we don't have the
4321 * buffer locked exclusively in all cases.
4322 */
4323 recptr = BufferGetLSN(buf);
4324
4325 /* To check if block content changes while flushing. - vadim 01/17/97 */
4326 UnlockBufHdrExt(buf, buf_state,
4327 0, BM_JUST_DIRTIED,
4328 0);
4329
4330 /*
4331 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4332 * rule that log updates must hit disk before any of the data-file changes
4333 * they describe do.
4334 *
4335 * However, this rule does not apply to unlogged relations, which will be
4336 * lost after a crash anyway. Most unlogged relation pages do not bear
4337 * LSNs since we never emit WAL records for them, and therefore flushing
4338 * up through the buffer LSN would be useless, but harmless. However,
4339 * GiST indexes use LSNs internally to track page-splits, and therefore
4340 * unlogged GiST pages bear "fake" LSNs generated by
4341 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4342 * LSN counter could advance past the WAL insertion point; and if it did
4343 * happen, attempting to flush WAL through that location would fail, with
4344 * disastrous system-wide consequences. To make sure that can't happen,
4345 * skip the flush if the buffer isn't permanent.
4346 */
4347 if (buf_state & BM_PERMANENT)
4348 XLogFlush(recptr);
4349
4350 /*
4351 * Now it's safe to write the buffer to disk. Note that no one else should
4352 * have been able to write it, while we were busy with log flushing,
4353 * because we got the exclusive right to perform I/O by setting the
4354 * BM_IO_IN_PROGRESS bit.
4355 */
4356 bufBlock = BufHdrGetBlock(buf);
4357
4358 /*
4359 * Update page checksum if desired. Since we have only shared lock on the
4360 * buffer, other processes might be updating hint bits in it, so we must
4361 * copy the page to private storage if we do checksumming.
4362 */
4363 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4364
4366
4367 /*
4368 * bufToWrite is either the shared buffer or a copy, as appropriate.
4369 */
4370 smgrwrite(reln,
4371 BufTagGetForkNum(&buf->tag),
4372 buf->tag.blockNum,
4373 bufToWrite,
4374 false);
4375
4376 /*
4377 * When a strategy is in use, only flushes of dirty buffers already in the
4378 * strategy ring are counted as strategy writes (IOCONTEXT
4379 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4380 * statistics tracking.
4381 *
4382 * If a shared buffer initially added to the ring must be flushed before
4383 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4384 *
4385 * If a shared buffer which was added to the ring later because the
4386 * current strategy buffer is pinned or in use or because all strategy
4387 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4388 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4389 * (from_ring will be false).
4390 *
4391 * When a strategy is not in use, the write can only be a "regular" write
4392 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4393 */
4395 IOOP_WRITE, io_start, 1, BLCKSZ);
4396
4398
4399 /*
4400 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4401 * end the BM_IO_IN_PROGRESS state.
4402 */
4403 TerminateBufferIO(buf, true, 0, true, false);
4404
4405 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4406 buf->tag.blockNum,
4410
4411 /* Pop the error context stack */
4412 error_context_stack = errcallback.previous;
4413}
4414
4415/*
4416 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4417 * before/after calling FlushBuffer().
4418 */
4419static void
4421 IOObject io_object, IOContext io_context)
4422{
4426}
4427
4428/*
4429 * RelationGetNumberOfBlocksInFork
4430 * Determines the current number of pages in the specified relation fork.
4431 *
4432 * Note that the accuracy of the result will depend on the details of the
4433 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4434 * it might not be.
4435 */
4438{
4439 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4440 {
4441 /*
4442 * Not every table AM uses BLCKSZ-wide fixed-size blocks. Therefore the
4443 * tableam API returns the size in bytes - but for the purpose of this
4444 * routine, we want the number of blocks, so divide, rounding
4445 * up.
4446 */
4447 uint64 szbytes;
4448
4449 szbytes = table_relation_size(relation, forkNum);
4450
4451 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4452 }
4453 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4454 {
4455 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4456 }
4457 else
4458 Assert(false);
4459
4460 return 0; /* keep compiler quiet */
4461}
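/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): callers
 * normally reach the routine above via the RelationGetNumberOfBlocks()
 * macro in bufmgr.h, which is simply the MAIN_FORKNUM case; 'rel' is
 * assumed to be an already-open Relation.
 *
 *		BlockNumber heap_blocks = RelationGetNumberOfBlocks(rel);
 *		BlockNumber vm_blocks = RelationGetNumberOfBlocksInFork(rel,
 *											VISIBILITYMAP_FORKNUM);
 *
 * The result is only a snapshot: another backend can extend the relation
 * immediately afterwards.
 */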
4462
4463/*
4464 * BufferIsPermanent
4465 * Determines whether a buffer will potentially still be around after
4466 * a crash. Caller must hold a buffer pin.
4467 */
4468bool
4470{
4471 BufferDesc *bufHdr;
4472
4473 /* Local buffers are used only for temp relations. */
4474 if (BufferIsLocal(buffer))
4475 return false;
4476
4477 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4480
4481 /*
4482 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4483 * need not bother with the buffer header spinlock. Even if someone else
4484 * changes the buffer header state while we're doing this, the state is
4485 * changed atomically, so we'll read the old value or the new value, but
4486 * not random garbage.
4487 */
4488 bufHdr = GetBufferDescriptor(buffer - 1);
4489 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4490}
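/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): one plausible
 * use of BufferIsPermanent() is choosing between a real WAL position and a
 * fake LSN for unlogged pages, the situation described in FlushBuffer()'s
 * comments above.  'lsn' and the real-WAL branch are placeholders.
 *
 *		XLogRecPtr	lsn;
 *
 *		if (BufferIsPermanent(buf))
 *			lsn = ...;					(LSN of a real WAL record)
 *		else
 *			lsn = GetFakeLSNForUnloggedRel();
 *
 * Either way the caller must hold a pin, per the header comment.
 */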
4491
4492/*
4493 * BufferGetLSNAtomic
4494 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4495 * This is necessary for some callers who may not have an exclusive lock
4496 * on the buffer.
4497 */
4500{
4501 char *page = BufferGetPage(buffer);
4502 BufferDesc *bufHdr;
4503 XLogRecPtr lsn;
4504
4505 /*
4506 * If we don't need locking for correctness, fastpath out.
4507 */
4509 return PageGetLSN(page);
4510
4511 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4514
4515 bufHdr = GetBufferDescriptor(buffer - 1);
4516 LockBufHdr(bufHdr);
4517 lsn = PageGetLSN(page);
4518 UnlockBufHdr(bufHdr);
4519
4520 return lsn;
4521}
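/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): a typical
 * consumer of BufferGetLSNAtomic() holds only a pin and a share lock but
 * still needs a trustworthy page LSN, for example to check whether WAL is
 * already flushed far enough before doing something irreversible:
 *
 *		XLogRecPtr	lsn = BufferGetLSNAtomic(buf);
 *
 *		if (XLogNeedsFlush(lsn))
 *			XLogFlush(lsn);
 *
 * With an exclusive content lock, plain PageGetLSN(BufferGetPage(buf)) is
 * sufficient and cheaper.
 */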
4522
4523/* ---------------------------------------------------------------------
4524 * DropRelationBuffers
4525 *
4526 * This function removes from the buffer pool all the pages of the
4527 * specified relation forks that have block numbers >= firstDelBlock.
4528 * (In particular, with firstDelBlock = 0, all pages are removed.)
4529 * Dirty pages are simply dropped, without bothering to write them
4530 * out first. Therefore, this is NOT rollback-able, and so should be
4531 * used only with extreme caution!
4532 *
4533 * Currently, this is called only from smgr.c when the underlying file
4534 * is about to be deleted or truncated (firstDelBlock is needed for
4535 * the truncation case). The data in the affected pages would therefore
4536 * be deleted momentarily anyway, and there is no point in writing it.
4537 * It is the responsibility of higher-level code to ensure that the
4538 * deletion or truncation does not lose any data that could be needed
4539 * later. It is also the responsibility of higher-level code to ensure
4540 * that no other process could be trying to load more pages of the
4541 * relation into buffers.
4542 * --------------------------------------------------------------------
4543 */
4544void
4546 int nforks, BlockNumber *firstDelBlock)
4547{
4548 int i;
4549 int j;
4550 RelFileLocatorBackend rlocator;
4551 BlockNumber nForkBlock[MAX_FORKNUM];
4552 uint64 nBlocksToInvalidate = 0;
4553
4554 rlocator = smgr_reln->smgr_rlocator;
4555
4556 /* If it's a local relation, it's localbuf.c's problem. */
4557 if (RelFileLocatorBackendIsTemp(rlocator))
4558 {
4559 if (rlocator.backend == MyProcNumber)
4560 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4561 firstDelBlock);
4562
4563 return;
4564 }
4565
4566 /*
4567 * To remove all the pages of the specified relation forks from the buffer
4568 * pool, we need to scan the entire buffer pool but we can optimize it by
4569 * finding the buffers from BufMapping table provided we know the exact
4570 * size of each fork of the relation. The exact size is required to ensure
4571 * that we don't leave behind any buffer for the relation being dropped;
4572 * otherwise the background writer or checkpointer could PANIC while
4573 * flushing buffers that correspond to files that no longer exist.
4574 *
4575 * To know the exact size, we rely on the size we cached for each fork
4576 * during recovery, which limits the optimization to recovery and to
4577 * standbys; it could easily be extended once we have a shared cache for
4578 * relation sizes.
4579 *
4580 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4581 * and future writes keep the cached value up-to-date. See
4582 * smgrextend. It is possible that the value of the first lseek is smaller
4583 * than the actual number of existing blocks in the file due to buggy
4584 * Linux kernels that might not have accounted for the recent write. But
4585 * that should be fine because there must not be any buffers after that
4586 * file size.
4587 */
4588 for (i = 0; i < nforks; i++)
4589 {
4590 /* Get the number of blocks for a relation's fork */
4591 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4592
4593 if (nForkBlock[i] == InvalidBlockNumber)
4594 {
4595 nBlocksToInvalidate = InvalidBlockNumber;
4596 break;
4597 }
4598
4599 /* calculate the number of blocks to be invalidated */
4600 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4601 }
4602
4603 /*
4604 * We apply the optimization iff the total number of blocks to invalidate
4605 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4606 */
4607 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4608 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4609 {
4610 for (j = 0; j < nforks; j++)
4611 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4612 nForkBlock[j], firstDelBlock[j]);
4613 return;
4614 }
4615
4616 for (i = 0; i < NBuffers; i++)
4617 {
4618 BufferDesc *bufHdr = GetBufferDescriptor(i);
4619
4620 /*
4621 * We can make this a tad faster by prechecking the buffer tag before
4622 * we attempt to lock the buffer; this saves a lot of lock
4623 * acquisitions in typical cases. It should be safe because the
4624 * caller must have AccessExclusiveLock on the relation, or some other
4625 * reason to be certain that no one is loading new pages of the rel
4626 * into the buffer pool. (Otherwise we might well miss such pages
4627 * entirely.) Therefore, while the tag might be changing while we
4628 * look at it, it can't be changing *to* a value we care about, only
4629 * *away* from such a value. So false negatives are impossible, and
4630 * false positives are safe because we'll recheck after getting the
4631 * buffer lock.
4632 *
4633 * We could check forkNum and blockNum as well as the rlocator, but
4634 * the incremental win from doing so seems small.
4635 */
4636 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4637 continue;
4638
4639 LockBufHdr(bufHdr);
4640
4641 for (j = 0; j < nforks; j++)
4642 {
4643 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4644 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4645 bufHdr->tag.blockNum >= firstDelBlock[j])
4646 {
4647 InvalidateBuffer(bufHdr); /* releases spinlock */
4648 break;
4649 }
4650 }
4651 if (j >= nforks)
4652 UnlockBufHdr(bufHdr);
4653 }
4654}
4655
4656/* ---------------------------------------------------------------------
4657 * DropRelationsAllBuffers
4658 *
4659 * This function removes from the buffer pool all the pages of all
4660 * forks of the specified relations. It's equivalent to calling
4661 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4662 * --------------------------------------------------------------------
4663 */
4664void
4665DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4666{
4667 int i;
4668 int n = 0;
4669 SMgrRelation *rels;
4670 BlockNumber (*block)[MAX_FORKNUM + 1];
4671 uint64 nBlocksToInvalidate = 0;
4672 RelFileLocator *locators;
4673 bool cached = true;
4674 bool use_bsearch;
4675
4676 if (nlocators == 0)
4677 return;
4678
4679 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4680
4681 /* If it's a local relation, it's localbuf.c's problem. */
4682 for (i = 0; i < nlocators; i++)
4683 {
4684 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4685 {
4686 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4687 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4688 }
4689 else
4690 rels[n++] = smgr_reln[i];
4691 }
4692
4693 /*
4694 * If there are no non-local relations, then we're done. Release the
4695 * memory and return.
4696 */
4697 if (n == 0)
4698 {
4699 pfree(rels);
4700 return;
4701 }
4702
4703 /*
4704 * This is used to remember the number of blocks for all forks of all
4705 * the relations.
4706 */
4707 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4708 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4709
4710 /*
4711 * We can avoid scanning the entire buffer pool if we know the exact size
4712 * of each of the given relation forks. See DropRelationBuffers.
4713 */
4714 for (i = 0; i < n && cached; i++)
4715 {
4716 for (int j = 0; j <= MAX_FORKNUM; j++)
4717 {
4718 /* Get the number of blocks for a relation's fork. */
4719 block[i][j] = smgrnblocks_cached(rels[i], j);
4720
4721 /* We only need to consider the relation forks that exist. */
4722 if (block[i][j] == InvalidBlockNumber)
4723 {
4724 if (!smgrexists(rels[i], j))
4725 continue;
4726 cached = false;
4727 break;
4728 }
4729
4730 /* calculate the total number of blocks to be invalidated */
4731 nBlocksToInvalidate += block[i][j];
4732 }
4733 }
4734
4735 /*
4736 * We apply the optimization iff the total number of blocks to invalidate
4737 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4738 */
4739 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4740 {
4741 for (i = 0; i < n; i++)
4742 {
4743 for (int j = 0; j <= MAX_FORKNUM; j++)
4744 {
4745 /* ignore relation forks that don't exist */
4746 if (!BlockNumberIsValid(block[i][j]))
4747 continue;
4748
4749 /* drop all the buffers for a particular relation fork */
4750 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4751 j, block[i][j], 0);
4752 }
4753 }
4754
4755 pfree(block);
4756 pfree(rels);
4757 return;
4758 }
4759
4760 pfree(block);
4761 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4762 for (i = 0; i < n; i++)
4763 locators[i] = rels[i]->smgr_rlocator.locator;
4764
4765 /*
4766 * For a small number of relations to drop, just use a simple walk-through
4767 * to save the bsearch overhead. The threshold is more a guess than an
4768 * exactly determined value, as it depends on many factors (CPU and RAM
4769 * speeds, amount of shared buffers, etc.).
4770 */
4771 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4772
4773 /* sort the list of rlocators if necessary */
4774 if (use_bsearch)
4775 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4776
4777 for (i = 0; i < NBuffers; i++)
4778 {
4779 RelFileLocator *rlocator = NULL;
4780 BufferDesc *bufHdr = GetBufferDescriptor(i);
4781
4782 /*
4783 * As in DropRelationBuffers, an unlocked precheck should be safe and
4784 * saves some cycles.
4785 */
4786
4787 if (!use_bsearch)
4788 {
4789 int j;
4790
4791 for (j = 0; j < n; j++)
4792 {
4793 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4794 {
4795 rlocator = &locators[j];
4796 break;
4797 }
4798 }
4799 }
4800 else
4801 {
4802 RelFileLocator locator;
4803
4804 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4805 rlocator = bsearch(&locator,
4806 locators, n, sizeof(RelFileLocator),
4808 }
4809
4810 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4811 if (rlocator == NULL)
4812 continue;
4813
4814 LockBufHdr(bufHdr);
4815 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4816 InvalidateBuffer(bufHdr); /* releases spinlock */
4817 else
4818 UnlockBufHdr(bufHdr);
4819 }
4820
4821 pfree(locators);
4822 pfree(rels);
4823}
4824
4825/* ---------------------------------------------------------------------
4826 * FindAndDropRelationBuffers
4827 *
4828 * This function performs lookups in the BufMapping table and removes from
4829 * the buffer pool all the pages of the specified relation fork that have
4830 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
4831 * all pages are removed.)
4832 * --------------------------------------------------------------------
4833 */
4834static void
4836 BlockNumber nForkBlock,
4837 BlockNumber firstDelBlock)
4838{
4839 BlockNumber curBlock;
4840
4841 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4842 {
4843 uint32 bufHash; /* hash value for tag */
4844 BufferTag bufTag; /* identity of requested block */
4845 LWLock *bufPartitionLock; /* buffer partition lock for it */
4846 int buf_id;
4847 BufferDesc *bufHdr;
4848
4849 /* create a tag so we can lookup the buffer */
4850 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4851
4852 /* determine its hash code and partition lock ID */
4853 bufHash = BufTableHashCode(&bufTag);
4854 bufPartitionLock = BufMappingPartitionLock(bufHash);
4855
4856 /* Check that it is in the buffer pool. If not, do nothing. */
4857 LWLockAcquire(bufPartitionLock, LW_SHARED);
4858 buf_id = BufTableLookup(&bufTag, bufHash);
4859 LWLockRelease(bufPartitionLock);
4860
4861 if (buf_id < 0)
4862 continue;
4863
4864 bufHdr = GetBufferDescriptor(buf_id);
4865
4866 /*
4867 * We need to lock the buffer header and recheck if the buffer is
4868 * still associated with the same block because the buffer could be
4869 * evicted by some other backend loading blocks for a different
4870 * relation after we release the lock on the BufMapping table.
4871 */
4872 LockBufHdr(bufHdr);
4873
4874 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4875 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4876 bufHdr->tag.blockNum >= firstDelBlock)
4877 InvalidateBuffer(bufHdr); /* releases spinlock */
4878 else
4879 UnlockBufHdr(bufHdr);
4880 }
4881}
4882
4883/* ---------------------------------------------------------------------
4884 * DropDatabaseBuffers
4885 *
4886 * This function removes all the buffers in the buffer cache for a
4887 * particular database. Dirty pages are simply dropped, without
4888 * bothering to write them out first. This is used when we destroy a
4889 * database, to avoid trying to flush data to disk when the directory
4890 * tree no longer exists. Implementation is pretty similar to
4891 * DropRelationBuffers() which is for destroying just one relation.
4892 * --------------------------------------------------------------------
4893 */
4894void
4896{
4897 int i;
4898
4899 /*
4900 * We needn't consider local buffers, since by assumption the target
4901 * database isn't our own.
4902 */
4903
4904 for (i = 0; i < NBuffers; i++)
4905 {
4906 BufferDesc *bufHdr = GetBufferDescriptor(i);
4907
4908 /*
4909 * As in DropRelationBuffers, an unlocked precheck should be safe and
4910 * saves some cycles.
4911 */
4912 if (bufHdr->tag.dbOid != dbid)
4913 continue;
4914
4915 LockBufHdr(bufHdr);
4916 if (bufHdr->tag.dbOid == dbid)
4917 InvalidateBuffer(bufHdr); /* releases spinlock */
4918 else
4919 UnlockBufHdr(bufHdr);
4920 }
4921}
4922
4923/* ---------------------------------------------------------------------
4924 * FlushRelationBuffers
4925 *
4926 * This function writes all dirty pages of a relation out to disk
4927 * (or more accurately, out to kernel disk buffers), ensuring that the
4928 * kernel has an up-to-date view of the relation.
4929 *
4930 * Generally, the caller should be holding AccessExclusiveLock on the
4931 * target relation to ensure that no other backend is busy dirtying
4932 * more blocks of the relation; the effects can't be expected to last
4933 * after the lock is released.
4934 *
4935 * XXX currently it sequentially searches the buffer pool, should be
4936 * changed to more clever ways of searching. This routine is not
4937 * used in any performance-critical code paths, so it's not worth
4938 * adding additional overhead to normal paths to make it go faster.
4939 * --------------------------------------------------------------------
4940 */
4941void
4943{
4944 int i;
4945 BufferDesc *bufHdr;
4946 SMgrRelation srel = RelationGetSmgr(rel);
4947
4948 if (RelationUsesLocalBuffers(rel))
4949 {
4950 for (i = 0; i < NLocBuffer; i++)
4951 {
4952 uint32 buf_state;
4953
4954 bufHdr = GetLocalBufferDescriptor(i);
4955 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4956 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4957 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4958 {
4959 ErrorContextCallback errcallback;
4960
4961 /* Setup error traceback support for ereport() */
4963 errcallback.arg = bufHdr;
4964 errcallback.previous = error_context_stack;
4965 error_context_stack = &errcallback;
4966
4967 /* Make sure we can handle the pin */
4970
4971 /*
4972 * Pin/unpin mostly to make valgrind work, but it also seems
4973 * like the right thing to do.
4974 */
4975 PinLocalBuffer(bufHdr, false);
4976
4977
4978 FlushLocalBuffer(bufHdr, srel);
4979
4981
4982 /* Pop the error context stack */
4983 error_context_stack = errcallback.previous;
4984 }
4985 }
4986
4987 return;
4988 }
4989
4990 for (i = 0; i < NBuffers; i++)
4991 {
4992 uint32 buf_state;
4993
4994 bufHdr = GetBufferDescriptor(i);
4995
4996 /*
4997 * As in DropRelationBuffers, an unlocked precheck should be safe and
4998 * saves some cycles.
4999 */
5000 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5001 continue;
5002
5003 /* Make sure we can handle the pin */
5006
5007 buf_state = LockBufHdr(bufHdr);
5008 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5009 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5010 {
5011 PinBuffer_Locked(bufHdr);
5013 UnpinBuffer(bufHdr);
5014 }
5015 else
5016 UnlockBufHdr(bufHdr);
5017 }
5018}
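/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): a caller that
 * wants the kernel to see a current image of a relation's pages, e.g.
 * before operating on the underlying files directly, might do something
 * like the following ('relid' is assumed to be known):
 *
 *		Relation	rel = table_open(relid, AccessExclusiveLock);
 *
 *		FlushRelationBuffers(rel);
 *		... work with the on-disk files ...
 *		table_close(rel, AccessExclusiveLock);
 *
 * Without the exclusive lock, other backends can re-dirty pages while or
 * immediately after the flush runs, as the header comment warns.
 */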
5019
5020/* ---------------------------------------------------------------------
5021 * FlushRelationsAllBuffers
5022 *
5023 * This function flushes out of the buffer pool all the pages of all
5024 * forks of the specified smgr relations. It's equivalent to calling
5025 * FlushRelationBuffers once per relation. The relations are assumed not
5026 * to use local buffers.
5027 * --------------------------------------------------------------------
5028 */
5029void
5031{
5032 int i;
5033 SMgrSortArray *srels;
5034 bool use_bsearch;
5035
5036 if (nrels == 0)
5037 return;
5038
5039 /* fill-in array for qsort */
5040 srels = palloc(sizeof(SMgrSortArray) * nrels);
5041
5042 for (i = 0; i < nrels; i++)
5043 {
5044 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5045
5046 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5047 srels[i].srel = smgrs[i];
5048 }
5049
5050 /*
5051 * Save the bsearch overhead for a small number of relations to sync. See
5052 * DropRelationsAllBuffers for details.
5053 */
5054 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5055
5056 /* sort the list of SMgrRelations if necessary */
5057 if (use_bsearch)
5058 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5059
5060 for (i = 0; i < NBuffers; i++)
5061 {
5062 SMgrSortArray *srelent = NULL;
5063 BufferDesc *bufHdr = GetBufferDescriptor(i);
5064 uint32 buf_state;
5065
5066 /*
5067 * As in DropRelationBuffers, an unlocked precheck should be safe and
5068 * saves some cycles.
5069 */
5070
5071 if (!use_bsearch)
5072 {
5073 int j;
5074
5075 for (j = 0; j < nrels; j++)
5076 {
5077 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5078 {
5079 srelent = &srels[j];
5080 break;
5081 }
5082 }
5083 }
5084 else
5085 {
5086 RelFileLocator rlocator;
5087
5088 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5089 srelent = bsearch(&rlocator,
5090 srels, nrels, sizeof(SMgrSortArray),
5092 }
5093
5094 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5095 if (srelent == NULL)
5096 continue;
5097
5098 /* Make sure we can handle the pin */
5101
5102 buf_state = LockBufHdr(bufHdr);
5103 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5104 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5105 {
5106 PinBuffer_Locked(bufHdr);
5108 UnpinBuffer(bufHdr);
5109 }
5110 else
5111 UnlockBufHdr(bufHdr);
5112 }
5113
5114 pfree(srels);
5115}
5116
5117/* ---------------------------------------------------------------------
5118 * RelationCopyStorageUsingBuffer
5119 *
5120 * Copy a fork's data using the buffer manager. Same as RelationCopyStorage,
5121 * but instead of using smgrread and smgrextend this copies via bufmgr APIs.
5122 *
5123 * Refer to the comments atop CreateAndCopyRelationData() for details about
5124 * the 'permanent' parameter.
5125 * --------------------------------------------------------------------
5126 */
5127static void
5129 RelFileLocator dstlocator,
5130 ForkNumber forkNum, bool permanent)
5131{
5132 Buffer srcBuf;
5133 Buffer dstBuf;
5134 Page srcPage;
5135 Page dstPage;
5136 bool use_wal;
5137 BlockNumber nblocks;
5138 BlockNumber blkno;
5140 BufferAccessStrategy bstrategy_src;
5141 BufferAccessStrategy bstrategy_dst;
5143 ReadStream *src_stream;
5144 SMgrRelation src_smgr;
5145
5146 /*
5147 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5148 * can skip it when copying any fork of an unlogged relation other than
5149 * the init fork.
5150 */
5151 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5152
5153 /* Get number of blocks in the source relation. */
5154 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5155 forkNum);
5156
5157 /* Nothing to copy; just return. */
5158 if (nblocks == 0)
5159 return;
5160
5161 /*
5162 * Bulk-extend the destination relation to the same size as the source
5163 * relation before starting to copy block by block.
5164 */
5165 memset(buf.data, 0, BLCKSZ);
5166 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5167 buf.data, true);
5168
5169 /* This is a bulk operation, so use buffer access strategies. */
5170 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5171 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5172
5173 /* Initialize streaming read */
5174 p.current_blocknum = 0;
5175 p.last_exclusive = nblocks;
5176 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5177
5178 /*
5179 * It is safe to use batchmode as block_range_read_stream_cb takes no
5180 * locks.
5181 */
5184 bstrategy_src,
5185 src_smgr,
5186 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5187 forkNum,
5189 &p,
5190 0);
5191
5192 /* Iterate over each block of the source relation file. */
5193 for (blkno = 0; blkno < nblocks; blkno++)
5194 {
5196
5197 /* Read block from source relation. */
5198 srcBuf = read_stream_next_buffer(src_stream, NULL);
5200 srcPage = BufferGetPage(srcBuf);
5201
5202 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5203 BufferGetBlockNumber(srcBuf),
5204 RBM_ZERO_AND_LOCK, bstrategy_dst,
5205 permanent);
5206 dstPage = BufferGetPage(dstBuf);
5207
5209
5210 /* Copy page data from the source to the destination. */
5211 memcpy(dstPage, srcPage, BLCKSZ);
5212 MarkBufferDirty(dstBuf);
5213
5214 /* WAL-log the copied page. */
5215 if (use_wal)
5216 log_newpage_buffer(dstBuf, true);
5217
5219
5220 UnlockReleaseBuffer(dstBuf);
5221 UnlockReleaseBuffer(srcBuf);
5222 }
5223 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5224 read_stream_end(src_stream);
5225
5226 FreeAccessStrategy(bstrategy_src);
5227 FreeAccessStrategy(bstrategy_dst);
5228}
5229
5230/* ---------------------------------------------------------------------
5231 * CreateAndCopyRelationData
5232 *
5233 * Create destination relation storage and copy all forks from the
5234 * source relation to the destination.
5235 *
5236 * Pass permanent as true for permanent relations and false for
5237 * unlogged relations. Currently this API is not supported for
5238 * temporary relations.
5239 * --------------------------------------------------------------------
5240 */
5241void
5243 RelFileLocator dst_rlocator, bool permanent)
5244{
5245 char relpersistence;
5246 SMgrRelation src_rel;
5247 SMgrRelation dst_rel;
5248
5249 /* Set the relpersistence. */
5250 relpersistence = permanent ?
5251 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5252
5253 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5254 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5255
5256 /*
5257 * Create and copy all forks of the relation. During CREATE DATABASE we
5258 * have a separate cleanup mechanism which deletes the complete database
5259 * directory. Therefore, each individual relation doesn't need to be
5260 * registered for cleanup.
5261 */
5262 RelationCreateStorage(dst_rlocator, relpersistence, false);
5263
5264 /* copy main fork. */
5265 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5266 permanent);
5267
5268 /* copy those extra forks that exist */
5269 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5270 forkNum <= MAX_FORKNUM; forkNum++)
5271 {
5272 if (smgrexists(src_rel, forkNum))
5273 {
5274 smgrcreate(dst_rel, forkNum, false);
5275
5276 /*
5277 * WAL log creation if the relation is persistent, or this is the
5278 * init fork of an unlogged relation.
5279 */
5280 if (permanent || forkNum == INIT_FORKNUM)
5281 log_smgrcreate(&dst_rlocator, forkNum);
5282
5283 /* Copy a fork's data, block by block. */
5284 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5285 permanent);
5286 }
5287 }
5288}
5289
5290/* ---------------------------------------------------------------------
5291 * FlushDatabaseBuffers
5292 *
5293 * This function writes all dirty pages of a database out to disk
5294 * (or more accurately, out to kernel disk buffers), ensuring that the
5295 * kernel has an up-to-date view of the database.
5296 *
5297 * Generally, the caller should be holding an appropriate lock to ensure
5298 * no other backend is active in the target database; otherwise more
5299 * pages could get dirtied.
5300 *
5301 * Note we don't worry about flushing any pages of temporary relations.
5302 * It's assumed these wouldn't be interesting.
5303 * --------------------------------------------------------------------
5304 */
5305void
5307{
5308 int i;
5309 BufferDesc *bufHdr;
5310
5311 for (i = 0; i < NBuffers; i++)
5312 {
5313 uint32 buf_state;
5314
5315 bufHdr = GetBufferDescriptor(i);
5316
5317 /*
5318 * As in DropRelationBuffers, an unlocked precheck should be safe and
5319 * saves some cycles.
5320 */
5321 if (bufHdr->tag.dbOid != dbid)
5322 continue;
5323
5324 /* Make sure we can handle the pin */
5327
5328 buf_state = LockBufHdr(bufHdr);
5329 if (bufHdr->tag.dbOid == dbid &&
5330 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5331 {
5332 PinBuffer_Locked(bufHdr);
5334 UnpinBuffer(bufHdr);
5335 }
5336 else
5337 UnlockBufHdr(bufHdr);
5338 }
5339}
5340
5341/*
5342 * Flush a previously locked (share- or exclusive-locked) and pinned buffer
5343 * to the OS.
5344 */
5345void
5347{
5348 BufferDesc *bufHdr;
5349
5350 /* currently not needed, but no fundamental reason not to support */
5352
5354
5355 bufHdr = GetBufferDescriptor(buffer - 1);
5356
5358
5360}
5361
5362/*
5363 * ReleaseBuffer -- release the pin on a buffer
5364 */
5365void
5367{
5368 if (!BufferIsValid(buffer))
5369 elog(ERROR, "bad buffer ID: %d", buffer);
5370
5371 if (BufferIsLocal(buffer))
5373 else
5375}
5376
5377/*
5378 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5379 *
5380 * This is just a shorthand for a common combination.
5381 */
5382void
5384{
5387}
5388
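/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): the canonical
 * pin/lock/modify pattern built from the primitives above.  Error handling
 * and WAL-logging are omitted; in real code the modification and the
 * XLogInsert() call happen inside a critical section.
 *
 *		Buffer	buf = ReadBuffer(rel, blkno);
 *		Page	page;
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		page = BufferGetPage(buf);
 *		... modify the page ...
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);		(drops the content lock, then the pin)
 */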
5389/*
5390 * IncrBufferRefCount
5391 * Increment the pin count on a buffer that we have *already* pinned
5392 * at least once.
5393 *
5394 * This function cannot be used on a buffer we do not have pinned,
5395 * because it doesn't change the shared buffer state.
5396 */
5397void
5399{
5402 if (BufferIsLocal(buffer))
5403 LocalRefCount[-buffer - 1]++;
5404 else
5405 {
5407
5408 ref = GetPrivateRefCountEntry(buffer, true);
5409 Assert(ref != NULL);
5410 ref->refcount++;
5411 }
5413}
5414
5415/*
5416 * MarkBufferDirtyHint
5417 *
5418 * Mark a buffer dirty for non-critical changes.
5419 *
5420 * This is essentially the same as MarkBufferDirty, except:
5421 *
5422 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5423 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5424 * 2. The caller might have only share-lock instead of exclusive-lock on the
5425 * buffer's content lock.
5426 * 3. This function does not guarantee that the buffer is always marked dirty
5427 * (due to a race condition), so it cannot be used for important changes.
5428 */
5429void
5431{
5432 BufferDesc *bufHdr;
5433 Page page = BufferGetPage(buffer);
5434
5435 if (!BufferIsValid(buffer))
5436 elog(ERROR, "bad buffer ID: %d", buffer);
5437
5438 if (BufferIsLocal(buffer))
5439 {
5441 return;
5442 }
5443
5444 bufHdr = GetBufferDescriptor(buffer - 1);
5445
5447 /* here, either share or exclusive lock is OK */
5449
5450 /*
5451 * This routine might get called many times on the same page, if we are
5452 * making the first scan after commit of an xact that added/deleted many
5453 * tuples. So, be as quick as we can if the buffer is already dirty. We
5454 * do this by not acquiring spinlock if it looks like the status bits are
5455 * already set. Since we make this test unlocked, there's a chance we
5456 * might fail to notice that the flags have just been cleared, and fail
5457 * to reset them, due to memory-ordering issues. But since this function
5458 * is only intended to be used in cases where failing to write out the
5459 * data would be harmless anyway, it doesn't really matter.
5460 */
5461 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5463 {
5465 bool dirtied = false;
5466 bool delayChkptFlags = false;
5467 uint32 buf_state;
5468
5469 /*
5470 * If we need to protect hint bit updates from torn writes, WAL-log a
5471 * full page image of the page. This full page image is only necessary
5472 * if the hint bit update is the first change to the page since the
5473 * last checkpoint.
5474 *
5475 * We don't check full_page_writes here because that logic is included
5476 * when we call XLogInsert() since the value changes dynamically.
5477 */
5478 if (XLogHintBitIsNeeded() &&
5480 {
5481 /*
5482 * If we must not write WAL, due to a relfilelocator-specific
5483 * condition or being in recovery, don't dirty the page. We can
5484 * still set the hint, just not dirty the page as a result, so the
5485 * hint is lost when we evict the page or shut down.
5486 *
5487 * See src/backend/storage/page/README for longer discussion.
5488 */
5489 if (RecoveryInProgress() ||
5491 return;
5492
5493 /*
5494 * If the block is already dirty because we either made a change
5495 * or set a hint already, then we don't need to write a full page
5496 * image. Note that aggressive cleaning of blocks dirtied by hint
5497 * bit setting would increase the call rate. Bulk setting of hint
5498 * bits would reduce the call rate...
5499 *
5500 * We must issue the WAL record before we mark the buffer dirty.
5501 * Otherwise we might write the page before we write the WAL. That
5502 * causes a race condition, since a checkpoint might occur between
5503 * writing the WAL record and marking the buffer dirty. We solve
5504 * that with a kluge, but one that is already in use during
5505 * transaction commit to prevent race conditions. Basically, we
5506 * simply prevent the checkpoint WAL record from being written
5507 * until we have marked the buffer dirty. We don't start the
5508 * checkpoint flush until we have marked dirty, so our checkpoint
5509 * must flush the change to disk successfully or the checkpoint
5510 * never gets written, in which case crash recovery will fix things.
5511 *
5512 * It's possible we may enter here without an xid, so it is
5513 * essential that CreateCheckPoint waits for virtual transactions
5514 * rather than full transactionids.
5515 */
5518 delayChkptFlags = true;
5519 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5520 }
5521
5522 buf_state = LockBufHdr(bufHdr);
5523
5524 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5525
5526 if (!(buf_state & BM_DIRTY))
5527 {
5528 dirtied = true; /* Means "will be dirtied by this action" */
5529
5530 /*
5531 * Set the page LSN if we wrote a backup block. We aren't supposed
5532 * to set this when only holding a share lock but as long as we
5533 * serialise it somehow we're OK. We choose to set LSN while
5534 * holding the buffer header lock, which causes any reader of an
5535 * LSN who holds only a share lock to also obtain a buffer header
5536 * lock before using PageGetLSN(), which is enforced in
5537 * BufferGetLSNAtomic().
5538 *
5539 * If checksums are enabled, you might think we should reset the
5540 * checksum here. That will happen when the page is written
5541 * sometime later in this checkpoint cycle.
5542 */
5543 if (XLogRecPtrIsValid(lsn))
5544 PageSetLSN(page, lsn);
5545 }
5546
5547 UnlockBufHdrExt(bufHdr, buf_state,
5549 0, 0);
5550
5551 if (delayChkptFlags)
5552 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5553
5554 if (dirtied)
5555 {
5557 if (VacuumCostActive)
5559 }
5560 }
5561}
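/*
 * Editor's sketch (illustrative only, not part of bufmgr.c): the typical
 * hint-bit pattern, loosely modelled on what a heap caller does when it
 * caches a tuple's commit status.  Only a pin and a share lock on the
 * buffer's content are required; 'tuple', 'xid' and 'buffer' are assumed
 * to come from the surrounding scan.
 *
 *		if (TransactionIdDidCommit(xid))
 *		{
 *			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 *			MarkBufferDirtyHint(buffer, true);		(buffer_std = true)
 *		}
 *
 * If the logic above decides the page must not be dirtied (e.g. during
 * recovery), the hint survives only until the page is evicted, which is
 * exactly the "not guaranteed" caveat in point 3 of the header comment.
 */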
5562
5563/*
5564 * Release buffer content locks for shared buffers.
5565 *
5566 * Used to clean up after errors.
5567 *
5568 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5569 * of releasing buffer content locks per se; the only thing we need to deal
5570 * with here is clearing any PIN_COUNT request that was in progress.
5571 */
5572void
5574{
5576
5577 if (buf)
5578 {
5579 uint32 buf_state;
5580 uint32 unset_bits = 0;
5581
5582 buf_state = LockBufHdr(buf);
5583
5584 /*
5585 * Don't complain if flag bit not set; it could have been reset but we
5586 * got a cancel/die interrupt before getting the signal.
5587 */
5588 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5589 buf->wait_backend_pgprocno == MyProcNumber)
5590 unset_bits = BM_PIN_COUNT_WAITER;
5591
5592 UnlockBufHdrExt(buf, buf_state,
5593 0, unset_bits,
5594 0);
5595
5596 PinCountWaitBuf = NULL;
5597 }
5598}
5599
5600/*
5601 * Acquire or release the content_lock for the buffer.
5602 */
5603void
5605{
5606 BufferDesc *buf;
5607
5609 if (BufferIsLocal(buffer))
5610 return; /* local buffers need no lock */
5611
5613
5614 if (mode == BUFFER_LOCK_UNLOCK)
5616 else if (mode == BUFFER_LOCK_SHARE)
5618 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5620 else
5621 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5622}
5623
5624/*
5625 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5626 *
5627 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5628 */
5629bool
5631{
5632 BufferDesc *buf;
5633
5635 if (BufferIsLocal(buffer))
5636 return true; /* act as though we got it */
5637
5639
5641 LW_EXCLUSIVE);
5642}
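/*
 * Editor's sketch (illustrative only, not part of bufmgr.c):
 * ConditionalLockBuffer() is the building block for "do optional work only
 * if it costs no waiting" paths, e.g. opportunistic page maintenance.
 *
 *		if (ConditionalLockBuffer(buf))
 *		{
 *			... do the optional work on the exclusively locked page ...
 *			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		}
 *		else
 *			... skip it; someone else holds the content lock ...
 */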
5643
5644/*
5645 * Verify that this backend is pinning the buffer exactly once.
5646 *
5647 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5648 * holds a pin on the buffer. We do not care whether some other backend does.
5649 */
5650void
5652{
5653 if (BufferIsLocal(buffer))
5654 {
5655 if (LocalRefCount[-buffer - 1] != 1)
5656 elog(ERROR, "incorrect local pin count: %d",
5657 LocalRefCount[-buffer - 1]);
5658 }
5659 else
5660 {
5661 if (GetPrivateRefCount(buffer) != 1)
5662 elog(ERROR, "incorrect local pin count: %d",
5664 }
5665}
5666
5667/*
5668 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5669 *
5670 * Items may be deleted from a disk page only when the caller (a) holds an
5671 * exclusive lock on the buffer and (b) has observed that no other backend
5672 * holds a pin on the buffer. If there is a pin, then the other backend
5673 * might have a pointer into the buffer (for example, a heapscan reference
5674 * to an item --- see README for more details). It's OK if a pin is added
5675 * after the cleanup starts, however; the newly-arrived backend will be
5676 * unable to look at the page until we release the exclusive lock.
5677 *
5678 * To implement this protocol, a would-be deleter must pin the buffer and
5679 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5680 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5681 * it has successfully observed pin count = 1.
5682 */
5683void
5685{
5686 BufferDesc *bufHdr;
5687 TimestampTz waitStart = 0;
5688 bool waiting = false;
5689 bool logged_recovery_conflict = false;
5690
5692 Assert(PinCountWaitBuf == NULL);
5693
5695
5696 /*
5697 * We do not yet need to worry about in-progress AIOs holding a pin, as
5698 * we so far only support doing reads via AIO, and this function can
5699 * only be called once the buffer is valid (i.e. no read can be in
5700 * flight).
5701 */
5702
5703 /* Nobody else to wait for */
5704 if (BufferIsLocal(buffer))
5705 return;
5706
5707 bufHdr = GetBufferDescriptor(buffer - 1);
5708
5709 for (;;)
5710 {
5711 uint32 buf_state;
5712 uint32 unset_bits = 0;
5713
5714 /* Try to acquire lock */
5716 buf_state = LockBufHdr(bufHdr);
5717
5718 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5719 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5720 {
5721 /* Successfully acquired exclusive lock with pincount 1 */
5722 UnlockBufHdr(bufHdr);
5723
5724 /*
5725 * Emit the log message if recovery conflict on buffer pin was
5726 * resolved but the startup process waited longer than
5727 * deadlock_timeout for it.
5728 */
5729 if (logged_recovery_conflict)
5730 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5731 waitStart, GetCurrentTimestamp(),
5732 NULL, false);
5733
5734 if (waiting)
5735 {
5736 /* reset ps display to remove the suffix if we added one */
5737 set_ps_display_remove_suffix();
5738 waiting = false;
5739 }
5740 return;
5741 }
5742 /* Failed, so mark myself as waiting for pincount 1 */
5743 if (buf_state & BM_PIN_COUNT_WAITER)
5744 {
5745 UnlockBufHdr(bufHdr);
5746 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5747 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5748 }
5749 bufHdr->wait_backend_pgprocno = MyProcNumber;
5750 PinCountWaitBuf = bufHdr;
5751 UnlockBufHdrExt(bufHdr, buf_state,
5752 BM_PIN_COUNT_WAITER, 0,
5753 0);
5754 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5755 
5756 /* Wait to be signaled by UnpinBuffer() */
5757 if (InHotStandby)
5758 {
5759 if (!waiting)
5760 {
5761 /* adjust the process title to indicate that it's waiting */
5762 set_ps_display_suffix("waiting");
5763 waiting = true;
5764 }
5765
5766 /*
5767 * Emit the log message if the startup process is waiting longer
5768 * than deadlock_timeout for recovery conflict on buffer pin.
5769 *
5770 * Skip this if first time through because the startup process has
5771 * not started waiting yet in this case. So, the wait start
5772 * timestamp is set after this logic.
5773 */
5774 if (waitStart != 0 && !logged_recovery_conflict)
5775 {
5776 TimestampTz now = GetCurrentTimestamp();
5777 
5778 if (TimestampDifferenceExceeds(waitStart, now,
5779 DeadlockTimeout))
5780 {
5781 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5782 waitStart, now, NULL, true);
5783 logged_recovery_conflict = true;
5784 }
5785 }
5786
5787 /*
5788 * Set the wait start timestamp if logging is enabled and first
5789 * time through.
5790 */
5791 if (log_recovery_conflict_waits && waitStart == 0)
5792 waitStart = GetCurrentTimestamp();
5793
5794 /* Publish the bufid that Startup process waits on */
5795 SetStartupBufferPinWaitBufId(buffer - 1);
5796 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5797 ResolveRecoveryConflictWithBufferPin();
5798 /* Reset the published bufid */
5799 SetStartupBufferPinWaitBufId(-1);
5800 }
5801 else
5802 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5803
5804 /*
5805 * Remove flag marking us as waiter. Normally this will not be set
5806 * anymore, but ProcWaitForSignal() can return for other signals as
5807 * well. We take care to only reset the flag if we're the waiter, as
5808 * theoretically another backend could have started waiting. That's
5809 * impossible with the current usages due to table level locking, but
5810 * better be safe.
5811 */
5812 buf_state = LockBufHdr(bufHdr);
5813 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5814 bufHdr->wait_backend_pgprocno == MyProcNumber)
5815 unset_bits |= BM_PIN_COUNT_WAITER;
5816
5817 UnlockBufHdrExt(bufHdr, buf_state,
5818 0, unset_bits,
5819 0);
5820
5821 PinCountWaitBuf = NULL;
5822 /* Loop back and try again */
5823 }
5824}
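/*
 * Illustrative sketch (not part of bufmgr.c): how a would-be deleter follows
 * the protocol described above. The buffer is pinned first; the call then
 * blocks until this backend holds the exclusive content lock and is the sole
 * pin holder. "rel" and "blkno" are hypothetical caller variables.
 *
 *		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *											 RBM_NORMAL, NULL);
 *
 *		LockBufferForCleanup(buf);
 *		... it is now safe to delete items from the page ...
 *		UnlockReleaseBuffer(buf);
 */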
5825
5826/*
5827 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5828 * requests cancellation of all pin holders that are blocking it.
5829 */
5830 bool
5831 HoldingBufferPinThatDelaysRecovery(void)
5832 {
5833 int bufid = GetStartupBufferPinWaitBufId();
5834
5835 /*
5836 * If we get woken slowly then it's possible that the Startup process was
5837 * already woken by other backends before we got here. Also possible that
5838 * we get here by multiple interrupts or interrupts at inappropriate
5839 * times, so make sure we do nothing if the bufid is not set.
5840 */
5841 if (bufid < 0)
5842 return false;
5843
5844 if (GetPrivateRefCount(bufid + 1) > 0)
5845 return true;
5846
5847 return false;
5848}
5849
5850/*
5851 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5852 *
5853 * We won't loop, but just check once to see if the pin count is OK. If
5854 * not, return false with no lock held.
5855 */
5856 bool
5857 ConditionalLockBufferForCleanup(Buffer buffer)
5858 {
5859 BufferDesc *bufHdr;
5860 uint32 buf_state,
5861 refcount;
5862 
5863 Assert(BufferIsPinned(buffer));
5864 
5865 /* see AIO related comment in LockBufferForCleanup() */
5866
5867 if (BufferIsLocal(buffer))
5868 {
5869 refcount = LocalRefCount[-buffer - 1];
5870 /* There should be exactly one pin */
5871 Assert(refcount > 0);
5872 if (refcount != 1)
5873 return false;
5874 /* Nobody else to wait for */
5875 return true;
5876 }
5877
5878 /* There should be exactly one local pin */
5879 refcount = GetPrivateRefCount(buffer);
5880 Assert(refcount);
5881 if (refcount != 1)
5882 return false;
5883
5884 /* Try to acquire lock */
5885 if (!ConditionalLockBuffer(buffer))
5886 return false;
5887
5888 bufHdr = GetBufferDescriptor(buffer - 1);
5889 buf_state = LockBufHdr(bufHdr);
5890 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5891
5892 Assert(refcount > 0);
5893 if (refcount == 1)
5894 {
5895 /* Successfully acquired exclusive lock with pincount 1 */
5896 UnlockBufHdr(bufHdr);
5897 return true;
5898 }
5899
5900 /* Failed, so release the lock */
5901 UnlockBufHdr(bufHdr);
5902 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5903 return false;
5904}
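/*
 * Illustrative sketch (not part of bufmgr.c): opportunistic cleanup in the
 * style of vacuum. If the cleanup lock cannot be obtained immediately, the
 * caller can fall back to an ordinary exclusive lock and do only the work
 * that does not require a cleanup lock. "buf" is an already-pinned buffer.
 *
 *		if (ConditionalLockBufferForCleanup(buf))
 *		{
 *			... full cleanup, e.g. defragment the page ...
 *		}
 *		else
 *		{
 *			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *			... limited processing only ...
 *		}
 *		UnlockReleaseBuffer(buf);
 */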
5905
5906/*
5907 * IsBufferCleanupOK - as above, but we already have the lock
5908 *
5909 * Check whether it's OK to perform cleanup on a buffer we've already
5910 * locked. If we observe that the pin count is 1, our exclusive lock
5911 * happens to be a cleanup lock, and we can proceed with anything that
5912 * would have been allowable had we sought a cleanup lock originally.
5913 */
5914 bool
5915 IsBufferCleanupOK(Buffer buffer)
5916 {
5917 BufferDesc *bufHdr;
5918 uint32 buf_state;
5919
5921
5922 /* see AIO related comment in LockBufferForCleanup() */
5923
5924 if (BufferIsLocal(buffer))
5925 {
5926 /* There should be exactly one pin */
5927 if (LocalRefCount[-buffer - 1] != 1)
5928 return false;
5929 /* Nobody else to wait for */
5930 return true;
5931 }
5932
5933 /* There should be exactly one local pin */
5934 if (GetPrivateRefCount(buffer) != 1)
5935 return false;
5936
5937 bufHdr = GetBufferDescriptor(buffer - 1);
5938
5939 /* caller must hold exclusive lock on buffer */
5941
5942 buf_state = LockBufHdr(bufHdr);
5943
5944 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5945 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5946 {
5947 /* pincount is OK. */
5948 UnlockBufHdr(bufHdr);
5949 return true;
5950 }
5951
5952 UnlockBufHdr(bufHdr);
5953 return false;
5954}
5955
5956
5957/*
5958 * Functions for buffer I/O handling
5959 *
5960 * Also note that these are used only for shared buffers, not local ones.
5961 */
5962
5963/*
5964 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5965 */
5966 static void
5967 WaitIO(BufferDesc *buf)
5968 {
5969 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5970 
5971 ConditionVariablePrepareToSleep(cv);
5972 for (;;)
5973 {
5974 uint32 buf_state;
5975 PgAioWaitRef iow;
5976
5977 /*
5978 * It may not be necessary to acquire the spinlock to check the flag
5979 * here, but since this test is essential for correctness, we'd better
5980 * play it safe.
5981 */
5982 buf_state = LockBufHdr(buf);
5983
5984 /*
5985 * Copy the wait reference while holding the spinlock. This protects
5986 * against a concurrent TerminateBufferIO() in another backend from
5987 * clearing the wref while it's being read.
5988 */
5989 iow = buf->io_wref;
5990 UnlockBufHdr(buf);
5991 
5992 /* no IO in progress, we don't need to wait */
5993 if (!(buf_state & BM_IO_IN_PROGRESS))
5994 break;
5995
5996 /*
5997 * The buffer has asynchronous IO in progress, wait for it to
5998 * complete.
5999 */
6000 if (pgaio_wref_valid(&iow))
6001 {
6002 pgaio_wref_wait(&iow);
6003
6004 /*
6005 * The AIO subsystem internally uses condition variables and thus
6006 * might remove this backend from the BufferDesc's CV. While that
6007 * wouldn't cause a correctness issue (the first CV sleep just
6008 * immediately returns if not already registered), it seems worth
6009 * avoiding unnecessary loop iterations, given that we take care
6010 * to do so at the start of the function.
6011 */
6012 ConditionVariablePrepareToSleep(cv);
6013 continue;
6014 }
6015
6016 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6017 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6018 }
6019 ConditionVariableCancelSleep();
6020 }
6021
6022/*
6023 * StartBufferIO: begin I/O on this buffer
6024 * (Assumptions)
6025 * My process is executing no IO on this buffer
6026 * The buffer is Pinned
6027 *
6028 * In some scenarios multiple backends could attempt the same I/O operation
6029 * concurrently. If someone else has already started I/O on this buffer then
6030 * we will wait for completion of the IO using WaitIO().
6031 *
6032 * Input operations are only attempted on buffers that are not BM_VALID,
6033 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6034 * so we can always tell if the work is already done.
6035 *
6036 * Returns true if we successfully marked the buffer as I/O busy,
6037 * false if someone else already did the work.
6038 *
6039 * If nowait is true, then we don't wait for an I/O to be finished by another
6040 * backend. In that case, false indicates either that the I/O was already
6041 * finished, or is still in progress. This is useful for callers that want to
6042 * find out if they can perform the I/O as part of a larger operation, without
6043 * waiting for the answer or distinguishing the reasons why not.
6044 */
6045bool
6046StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6047{
6048 uint32 buf_state;
6049 
6050 ResourceOwnerEnlarge(CurrentResourceOwner);
6051 
6052 for (;;)
6053 {
6054 buf_state = LockBufHdr(buf);
6055
6056 if (!(buf_state & BM_IO_IN_PROGRESS))
6057 break;
6058 UnlockBufHdr(buf);
6059 if (nowait)
6060 return false;
6061 WaitIO(buf);
6062 }
6063
6064 /* Once we get here, there is definitely no I/O active on this buffer */
6065
6066 /* Check if someone else already did the I/O */
6067 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6068 {
6069 UnlockBufHdr(buf);
6070 return false;
6071 }
6072
6073 UnlockBufHdrExt(buf, buf_state,
6074 BM_IO_IN_PROGRESS, 0,
6075 0);
6076 
6077 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6078 BufferDescriptorGetBuffer(buf));
6079 
6080 return true;
6081}
6082
6083/*
6084 * TerminateBufferIO: release a buffer we were doing I/O on
6085 * (Assumptions)
6086 * My process is executing IO for the buffer
6087 * BM_IO_IN_PROGRESS bit is set for the buffer
6088 * The buffer is Pinned
6089 *
6090 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6091 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6092 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6093 * marking the buffer clean if it was re-dirtied while we were writing.
6094 *
6095 * set_flag_bits gets ORed into the buffer's flags. It must include
6096 * BM_IO_ERROR in a failure case. For successful completion it could
6097 * be 0, or BM_VALID if we just finished reading in the page.
6098 *
6099 * If forget_owner is true, we release the buffer I/O from the current
6100 * resource owner. (forget_owner=false is used when the resource owner itself
6101 * is being released)
6102 */
6103void
6104TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6105 bool forget_owner, bool release_aio)
6106{
6107 uint32 buf_state;
6108 uint32 unset_flag_bits = 0;
6109 int refcount_change = 0;
6110
6111 buf_state = LockBufHdr(buf);
6112
6113 Assert(buf_state & BM_IO_IN_PROGRESS);
6114 unset_flag_bits |= BM_IO_IN_PROGRESS;
6115
6116 /* Clear earlier errors, if this IO failed, it'll be marked again */
6117 unset_flag_bits |= BM_IO_ERROR;
6118
6119 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6120 unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
6121
6122 if (release_aio)
6123 {
6124 /* release ownership by the AIO subsystem */
6125 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6126 refcount_change = -1;
6127 pgaio_wref_clear(&buf->io_wref);
6128 }
6129
6130 buf_state = UnlockBufHdrExt(buf, buf_state,
6131 set_flag_bits, unset_flag_bits,
6132 refcount_change);
6133
6134 if (forget_owner)
6135 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6136 BufferDescriptorGetBuffer(buf));
6137 
6138 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6139 
6140 /*
6141 * Support LockBufferForCleanup()
6142 *
6143 * We may have just released the last pin other than the waiter's. In most
6144 * cases, this backend holds another pin on the buffer. But, if, for
6145 * example, this backend is completing an IO issued by another backend, it
6146 * may be time to wake the waiter.
6147 */
6148 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6149 WakePinCountWaiter(buf);
6150 }
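/*
 * Illustrative sketch (not part of bufmgr.c): how StartBufferIO() and
 * TerminateBufferIO() bracket a synchronous buffer write, roughly as the
 * flush code path does. Passing forInput=false asks to start output;
 * clear_dirty=true on success lets the buffer be marked clean unless it was
 * re-dirtied concurrently. "buf_hdr" is a hypothetical pinned buffer header.
 *
 *		if (StartBufferIO(buf_hdr, false, false))
 *		{
 *			... write the block out through the smgr layer ...
 *			TerminateBufferIO(buf_hdr, true, 0, true, false);
 *		}
 */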
6151
6152/*
6153 * AbortBufferIO: Clean up active buffer I/O after an error.
6154 *
6155 * All LWLocks we might have held have been released,
6156 * but we haven't yet released buffer pins, so the buffer is still pinned.
6157 *
6158 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6159 * possible the error condition wasn't related to the I/O.
6160 *
6161 * Note: this does not remove the buffer I/O from the resource owner.
6162 * That's correct when we're releasing the whole resource owner, but
6163 * beware if you use this in other contexts.
6164 */
6165 static void
6166 AbortBufferIO(Buffer buffer)
6167 {
6168 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6169 uint32 buf_state;
6170
6171 buf_state = LockBufHdr(buf_hdr);
6172 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6173
6174 if (!(buf_state & BM_VALID))
6175 {
6176 Assert(!(buf_state & BM_DIRTY));
6177 UnlockBufHdr(buf_hdr);
6178 }
6179 else
6180 {
6181 Assert(buf_state & BM_DIRTY);
6182 UnlockBufHdr(buf_hdr);
6183
6184 /* Issue notice if this is not the first failure... */
6185 if (buf_state & BM_IO_ERROR)
6186 {
6187 /* Buffer is pinned, so we can read tag without spinlock */
6188 ereport(WARNING,
6189 (errcode(ERRCODE_IO_ERROR),
6190 errmsg("could not write block %u of %s",
6191 buf_hdr->tag.blockNum,
6192 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
6193 BufTagGetForkNum(&buf_hdr->tag)).str),
6194 errdetail("Multiple failures --- write error might be permanent.")));
6195 }
6196 }
6197
6198 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6199}
6200
6201/*
6202 * Error context callback for errors occurring during shared buffer writes.
6203 */
6204 static void
6205 shared_buffer_write_error_callback(void *arg)
6206 {
6207 BufferDesc *bufHdr = (BufferDesc *) arg;
6208
6209 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6210 if (bufHdr != NULL)
6211 errcontext("writing block %u of relation \"%s\"",
6212 bufHdr->tag.blockNum,
6213 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6214 BufTagGetForkNum(&bufHdr->tag)).str);
6215}
6216
6217/*
6218 * Error context callback for errors occurring during local buffer writes.
6219 */
6220 static void
6221 local_buffer_write_error_callback(void *arg)
6222 {
6223 BufferDesc *bufHdr = (BufferDesc *) arg;
6224
6225 if (bufHdr != NULL)
6226 errcontext("writing block %u of relation \"%s\"",
6227 bufHdr->tag.blockNum,
6228 relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
6229 MyProcNumber,
6230 BufTagGetForkNum(&bufHdr->tag)).str);
6231}
6232
6233/*
6234 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6235 */
6236static int
6237rlocator_comparator(const void *p1, const void *p2)
6238{
6239 RelFileLocator n1 = *(const RelFileLocator *) p1;
6240 RelFileLocator n2 = *(const RelFileLocator *) p2;
6241
6242 if (n1.relNumber < n2.relNumber)
6243 return -1;
6244 else if (n1.relNumber > n2.relNumber)
6245 return 1;
6246
6247 if (n1.dbOid < n2.dbOid)
6248 return -1;
6249 else if (n1.dbOid > n2.dbOid)
6250 return 1;
6251
6252 if (n1.spcOid < n2.spcOid)
6253 return -1;
6254 else if (n1.spcOid > n2.spcOid)
6255 return 1;
6256 else
6257 return 0;
6258}
6259
6260/*
6261 * Lock buffer header - set BM_LOCKED in buffer state.
6262 */
6263 uint32
6264 LockBufHdr(BufferDesc *desc)
6265 {
6266 SpinDelayStatus delayStatus;
6267 uint32 old_buf_state;
6268 
6269 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6270 
6271 init_local_spin_delay(&delayStatus);
6272
6273 while (true)
6274 {
6275 /* set BM_LOCKED flag */
6276 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6277 /* if it wasn't set before we're OK */
6278 if (!(old_buf_state & BM_LOCKED))
6279 break;
6280 perform_spin_delay(&delayStatus);
6281 }
6282 finish_spin_delay(&delayStatus);
6283 return old_buf_state | BM_LOCKED;
6284}
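/*
 * Illustrative sketch (not part of bufmgr.c): the header spinlock is always
 * used in short lock/inspect/unlock sequences. The returned value already
 * has BM_LOCKED set and can be examined before unlocking. "desc" is a
 * hypothetical BufferDesc pointer.
 *
 *		uint32		buf_state = LockBufHdr(desc);
 *		bool		is_dirty = (buf_state & BM_DIRTY) != 0;
 *
 *		UnlockBufHdr(desc);
 */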
6285
6286/*
6287 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6288 * state at that point.
6289 *
6290 * Obviously the buffer could be locked by the time the value is returned, so
6291 * this is primarily useful in CAS style loops.
6292 */
6293 static uint32
6294 WaitBufHdrUnlocked(BufferDesc *buf)
6295 {
6296 SpinDelayStatus delayStatus;
6297 uint32 buf_state;
6298
6299 init_local_spin_delay(&delayStatus);
6300
6301 buf_state = pg_atomic_read_u32(&buf->state);
6302
6303 while (buf_state & BM_LOCKED)
6304 {
6305 perform_spin_delay(&delayStatus);
6306 buf_state = pg_atomic_read_u32(&buf->state);
6307 }
6308
6309 finish_spin_delay(&delayStatus);
6310
6311 return buf_state;
6312}
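/*
 * Illustrative sketch (not part of bufmgr.c): the typical CAS-style caller of
 * WaitBufHdrUnlocked(). Instead of taking the header spinlock, the loop
 * retries a compare-and-swap and only waits when it observes BM_LOCKED,
 * similar to the pin/unpin hot paths. "buf" is a hypothetical BufferDesc.
 *
 *		uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
 *
 *		for (;;)
 *		{
 *			uint32		buf_state;
 *
 *			if (old_buf_state & BM_LOCKED)
 *				old_buf_state = WaitBufHdrUnlocked(buf);
 *
 *			buf_state = old_buf_state + BUF_REFCOUNT_ONE;
 *			if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
 *											   buf_state))
 *				break;
 *		}
 */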
6313
6314/*
6315 * BufferTag comparator.
6316 */
6317static inline int
6318 buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
6319 {
6320 int ret;
6321 RelFileLocator rlocatora;
6322 RelFileLocator rlocatorb;
6323
6324 rlocatora = BufTagGetRelFileLocator(ba);
6325 rlocatorb = BufTagGetRelFileLocator(bb);
6326
6327 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6328
6329 if (ret != 0)
6330 return ret;
6331
6332 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6333 return -1;
6334 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6335 return 1;
6336
6337 if (ba->blockNum < bb->blockNum)
6338 return -1;
6339 if (ba->blockNum > bb->blockNum)
6340 return 1;
6341
6342 return 0;
6343}
6344
6345/*
6346 * Comparator determining the writeout order in a checkpoint.
6347 *
6348 * It is important that tablespaces are compared first, the logic balancing
6349 * writes between tablespaces relies on it.
6350 */
6351static inline int
6352 ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
6353 {
6354 /* compare tablespace */
6355 if (a->tsId < b->tsId)
6356 return -1;
6357 else if (a->tsId > b->tsId)
6358 return 1;
6359 /* compare relation */
6360 if (a->relNumber < b->relNumber)
6361 return -1;
6362 else if (a->relNumber > b->relNumber)
6363 return 1;
6364 /* compare fork */
6365 else if (a->forkNum < b->forkNum)
6366 return -1;
6367 else if (a->forkNum > b->forkNum)
6368 return 1;
6369 /* compare block number */
6370 else if (a->blockNum < b->blockNum)
6371 return -1;
6372 else if (a->blockNum > b->blockNum)
6373 return 1;
6374 /* equal page IDs are unlikely, but not impossible */
6375 return 0;
6376}
6377
6378/*
6379 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6380 * progress.
6381 */
6382static int
6383 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
6384 {
6387
6388 /* we want a min-heap, so return 1 for the a < b */
6389 if (sa->progress < sb->progress)
6390 return 1;
6391 else if (sa->progress == sb->progress)
6392 return 0;
6393 else
6394 return -1;
6395}
6396
6397/*
6398 * Initialize a writeback context, discarding potential previous state.
6399 *
6400 * *max_pending is a pointer instead of an immediate value, so the coalesce
6401 * limits can easily be changed by the GUC mechanism, and so calling code does
6402 * not have to check the current configuration. A value of 0 means that no
6403 * writeback control will be performed.
6404 */
6405void
6406WritebackContextInit(WritebackContext *context, int *max_pending)
6407{
6408 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6409
6410 context->max_pending = max_pending;
6411 context->nr_pending = 0;
6412}
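/*
 * Illustrative sketch (not part of bufmgr.c): a writeback context is
 * initialized once with a pointer to the controlling GUC, tags are queued as
 * buffers are written out, and the accumulated requests are eventually pushed
 * to the kernel. IOCONTEXT_NORMAL and "tag" are used here purely as examples.
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		...
 *		ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);
 *		...
 *		IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
 */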
6413
6414/*
6415 * Add buffer to list of pending writeback requests.
6416 */
6417 void
6418 ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
6419 BufferTag *tag)
6420{
6421 PendingWriteback *pending;
6422
6423 /*
6424 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6425 * point in tracking in that case.
6426 */
6427 if (io_direct_flags & IO_DIRECT_DATA ||
6428 !enableFsync)
6429 return;
6430
6431 /*
6432 * Add buffer to the pending writeback array, unless writeback control is
6433 * disabled.
6434 */
6435 if (*wb_context->max_pending > 0)
6436 {
6437 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6438 
6439 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6440
6441 pending->tag = *tag;
6442 }
6443
6444 /*
6445 * Perform pending flushes if the writeback limit is exceeded. This
6446 * includes the case where previously an item has been added, but control
6447 * is now disabled.
6448 */
6449 if (wb_context->nr_pending >= *wb_context->max_pending)
6450 IssuePendingWritebacks(wb_context, io_context);
6451}
6452
6453#define ST_SORT sort_pending_writebacks
6454#define ST_ELEMENT_TYPE PendingWriteback
6455#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6456#define ST_SCOPE static
6457#define ST_DEFINE
6458#include "lib/sort_template.h"
6459
6460/*
6461 * Issue all pending writeback requests, previously scheduled with
6462 * ScheduleBufferTagForWriteback, to the OS.
6463 *
6464 * Because this is only used to improve the OS's IO scheduling, we try to
6465 * never error out - it's just a hint.
6466 */
6467 void
6468 IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
6469 {
6470 instr_time io_start;
6471 int i;
6472
6473 if (wb_context->nr_pending == 0)
6474 return;
6475
6476 /*
6477 * Executing the writes in-order can make them a lot faster, and allows us to
6478 * merge writeback requests to consecutive blocks into larger writebacks.
6479 */
6480 sort_pending_writebacks(wb_context->pending_writebacks,
6481 wb_context->nr_pending);
6482 
6483 io_start = pgstat_prepare_io_time(track_io_timing);
6484 
6485 /*
6486 * Coalesce neighbouring writes, but nothing else. For that we iterate
6487 * through the, now sorted, array of pending flushes, and look forward to
6488 * find all neighbouring (or identical) writes.
6489 */
6490 for (i = 0; i < wb_context->nr_pending; i++)
6491 {
6492 PendingWriteback *cur;
6493 PendingWriteback *next;
6494 SMgrRelation reln;
6495 int ahead;
6496 BufferTag tag;
6497 RelFileLocator currlocator;
6498 Size nblocks = 1;
6499
6500 cur = &wb_context->pending_writebacks[i];
6501 tag = cur->tag;
6502 currlocator = BufTagGetRelFileLocator(&tag);
6503
6504 /*
6505 * Peek ahead, into following writeback requests, to see if they can
6506 * be combined with the current one.
6507 */
6508 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6509 {
6510
6511 next = &wb_context->pending_writebacks[i + ahead + 1];
6512
6513 /* different file, stop */
6514 if (!RelFileLocatorEquals(currlocator,
6515 BufTagGetRelFileLocator(&next->tag)) ||
6516 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6517 break;
6518
6519 /* ok, block queued twice, skip */
6520 if (cur->tag.blockNum == next->tag.blockNum)
6521 continue;
6522
6523 /* only merge consecutive writes */
6524 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6525 break;
6526
6527 nblocks++;
6528 cur = next;
6529 }
6530
6531 i += ahead;
6532
6533 /* and finally tell the kernel to write the data to storage */
6534 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6535 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6536 }
6537
6538 /*
6539 * Assume that writeback requests are only issued for buffers containing
6540 * blocks of permanent relations.
6541 */
6542 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6543 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6544
6545 wb_context->nr_pending = 0;
6546}
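/*
 * Worked example (not part of bufmgr.c): with pending tags for blocks
 * 10, 11, 12 and 20 of the same relation fork, the sorted array yields two
 * smgrwriteback() calls - one covering blocks 10..12 (nblocks = 3) and one
 * for block 20 (nblocks = 1). A duplicate entry for an already-covered block
 * is simply skipped.
 */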
6547
6548/* ResourceOwner callbacks */
6549
6550 static void
6551 ResOwnerReleaseBufferIO(Datum res)
6552 {
6553 Buffer buffer = DatumGetInt32(res);
6554 
6555 AbortBufferIO(buffer);
6556 }
6557 
6558 static char *
6559 ResOwnerPrintBufferIO(Datum res)
6560 {
6561 Buffer buffer = DatumGetInt32(res);
6562 
6563 return psprintf("lost track of buffer IO on buffer %d", buffer);
6564 }
6565 
6566 static void
6567 ResOwnerReleaseBufferPin(Datum res)
6568 {
6569 Buffer buffer = DatumGetInt32(res);
6570 
6571 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6572 if (!BufferIsValid(buffer))
6573 elog(ERROR, "bad buffer ID: %d", buffer);
6574 
6575 if (BufferIsLocal(buffer))
6576 UnpinLocalBufferNoOwner(buffer);
6577 else
6578 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6579 }
6580 
6581 static char *
6582 ResOwnerPrintBufferPin(Datum res)
6583 {
6584 return DebugPrintBufferRefcount(DatumGetInt32(res));
6585 }
6586
6587/*
6588 * Helper function to evict unpinned buffer whose buffer header lock is
6589 * already acquired.
6590 */
6591static bool
6592EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6593{
6594 uint32 buf_state;
6595 bool result;
6596
6597 *buffer_flushed = false;
6598
6599 buf_state = pg_atomic_read_u32(&(desc->state));
6600 Assert(buf_state & BM_LOCKED);
6601
6602 if ((buf_state & BM_VALID) == 0)
6603 {
6604 UnlockBufHdr(desc);
6605 return false;
6606 }
6607
6608 /* Check that it's not pinned already. */
6609 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6610 {
6611 UnlockBufHdr(desc);
6612 return false;
6613 }
6614
6615 PinBuffer_Locked(desc); /* releases spinlock */
6616
6617 /* If it was dirty, try to clean it once. */
6618 if (buf_state & BM_DIRTY)
6619 {
6621 *buffer_flushed = true;
6622 }
6623
6624 /* This will return false if it becomes dirty or someone else pins it. */
6625 result = InvalidateVictimBuffer(desc);
6626
6627 UnpinBuffer(desc);
6628
6629 return result;
6630}
6631
6632/*
6633 * Try to evict the current block in a shared buffer.
6634 *
6635 * This function is intended for testing/development use only!
6636 *
6637 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6638 * particular block in mind, it might already have been replaced by some other
6639 * block by the time this function runs. It's also unpinned on return, so the
6640 * buffer might be occupied again by the time control is returned, potentially
6641 * even by the same block. This inherent raciness without other interlocking
6642 * makes the function unsuitable for non-testing usage.
6643 *
6644 * *buffer_flushed is set to true if the buffer was dirty and has been
6645 * flushed, false otherwise. However, *buffer_flushed=true does not
6646 * necessarily mean that we flushed the buffer, it could have been flushed by
6647 * someone else.
6648 *
6649 * Returns true if the buffer was valid and it has now been made invalid.
6650 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6651 * or if the buffer becomes dirty again while we're trying to write it out.
6652 */
6653bool
6654EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6655{
6656 BufferDesc *desc;
6657
6658 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6659 
6660 /* Make sure we can pin the buffer. */
6661 ResourceOwnerEnlarge(CurrentResourceOwner);
6662 ReservePrivateRefCountEntry();
6663 
6664 desc = GetBufferDescriptor(buf - 1);
6665 LockBufHdr(desc);
6666
6667 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6668}
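/*
 * Illustrative sketch (not part of bufmgr.c): a testing-only caller, e.g. an
 * extension function, might use EvictUnpinnedBuffer() like this. "buf" is a
 * hypothetical shared-buffer number obtained elsewhere.
 *
 *		bool		flushed;
 *
 *		if (EvictUnpinnedBuffer(buf, &flushed))
 *			elog(NOTICE, "buffer %d evicted%s", buf,
 *				 flushed ? " (dirty page was flushed)" : "");
 */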
6669
6670/*
6671 * Try to evict all the shared buffers.
6672 *
6673 * This function is intended for testing/development use only! See
6674 * EvictUnpinnedBuffer().
6675 *
6676 * The buffers_* parameters are mandatory and indicate the total count of
6677 * buffers that:
6678 * - buffers_evicted - were evicted
6679 * - buffers_flushed - were flushed
6680 * - buffers_skipped - could not be evicted
6681 */
6682void
6683EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6684 int32 *buffers_skipped)
6685{
6686 *buffers_evicted = 0;
6687 *buffers_skipped = 0;
6688 *buffers_flushed = 0;
6689
6690 for (int buf = 1; buf <= NBuffers; buf++)
6691 {
6692 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6693 uint32 buf_state;
6694 bool buffer_flushed;
6695
6697
6698 buf_state = pg_atomic_read_u32(&desc->state);
6699 if (!(buf_state & BM_VALID))
6700 continue;
6701
6704
6705 LockBufHdr(desc);
6706
6707 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6708 (*buffers_evicted)++;
6709 else
6710 (*buffers_skipped)++;
6711
6712 if (buffer_flushed)
6713 (*buffers_flushed)++;
6714 }
6715}
6716
6717/*
6718 * Try to evict all the shared buffers containing provided relation's pages.
6719 *
6720 * This function is intended for testing/development use only! See
6721 * EvictUnpinnedBuffer().
6722 *
6723 * The caller must hold at least AccessShareLock on the relation to prevent
6724 * the relation from being dropped.
6725 *
6726 * The buffers_* parameters are mandatory and indicate the total count of
6727 * buffers that:
6728 * - buffers_evicted - were evicted
6729 * - buffers_flushed - were flushed
6730 * - buffers_skipped - could not be evicted
6731 */
6732 void
6733 EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
6734 int32 *buffers_flushed, int32 *buffers_skipped)
6735 {
6736 Assert(!RelationUsesLocalBuffers(rel));
6737 
6738 *buffers_skipped = 0;
6739 *buffers_evicted = 0;
6740 *buffers_flushed = 0;
6741
6742 for (int buf = 1; buf <= NBuffers; buf++)
6743 {
6744 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6745 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6746 bool buffer_flushed;
6747
6749
6750 /* An unlocked precheck should be safe and saves some cycles. */
6751 if ((buf_state & BM_VALID) == 0 ||
6752 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6753 continue;
6754
6755 /* Make sure we can pin the buffer. */
6758
6759 buf_state = LockBufHdr(desc);
6760
6761 /* recheck, could have changed without the lock */
6762 if ((buf_state & BM_VALID) == 0 ||
6763 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6764 {
6765 UnlockBufHdr(desc);
6766 continue;
6767 }
6768
6769 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6770 (*buffers_evicted)++;
6771 else
6772 (*buffers_skipped)++;
6773
6774 if (buffer_flushed)
6775 (*buffers_flushed)++;
6776 }
6777}
6778
6779/*
6780 * Generic implementation of the AIO handle staging callback for readv/writev
6781 * on local/shared buffers.
6782 *
6783 * Each readv/writev can target multiple buffers. The buffers have already
6784 * been registered with the IO handle.
6785 *
6786 * To make the IO ready for execution ("staging"), we need to ensure that the
6787 * targeted buffers are in an appropriate state while the IO is ongoing. For
6788 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6789 * in this backend could lead to this backend's buffer pin being released as
6790 * part of error handling, which in turn could lead to the buffer being
6791 * replaced while IO is ongoing.
6792 */
6793 static pg_attribute_always_inline void
6794 buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6795{
6796 uint64 *io_data;
6797 uint8 handle_data_len;
6798 PgAioWaitRef io_ref;
6799 BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
6800 
6801 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6802
6803 pgaio_io_get_wref(ioh, &io_ref);
6804
6805 /* iterate over all buffers affected by the vectored readv/writev */
6806 for (int i = 0; i < handle_data_len; i++)
6807 {
6808 Buffer buffer = (Buffer) io_data[i];
6809 BufferDesc *buf_hdr = is_temp ?
6810 GetLocalBufferDescriptor(-buffer - 1)
6811 : GetBufferDescriptor(buffer - 1);
6812 uint32 buf_state;
6813
6814 /*
6815 * Check that all the buffers are actually ones that could conceivably
6816 * be done in one IO, i.e. are sequential. This is the last
6817 * buffer-aware code before IO is actually executed and confusion
6818 * about which buffers are targeted by IO can be hard to debug, making
6819 * it worth doing extra-paranoid checks.
6820 */
6821 if (i == 0)
6822 first = buf_hdr->tag;
6823 else
6824 {
6825 Assert(buf_hdr->tag.relNumber == first.relNumber);
6826 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6827 }
6828
6829 if (is_temp)
6830 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6831 else
6832 buf_state = LockBufHdr(buf_hdr);
6833
6834 /* verify the buffer is in the expected state */
6835 Assert(buf_state & BM_TAG_VALID);
6836 if (is_write)
6837 {
6838 Assert(buf_state & BM_VALID);
6839 Assert(buf_state & BM_DIRTY);
6840 }
6841 else
6842 {
6843 Assert(!(buf_state & BM_VALID));
6844 Assert(!(buf_state & BM_DIRTY));
6845 }
6846
6847 /* temp buffers don't use BM_IO_IN_PROGRESS */
6848 if (!is_temp)
6849 Assert(buf_state & BM_IO_IN_PROGRESS);
6850
6851 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6852
6853 /*
6854 * Reflect that the buffer is now owned by the AIO subsystem.
6855 *
6856 * For local buffers: This can't be done just via LocalRefCount, as
6857 * one might initially think, as this backend could error out while
6858 * AIO is still in progress, releasing all the pins by the backend
6859 * itself.
6860 *
6861 * This pin is released again in TerminateBufferIO().
6862 */
6863 buf_hdr->io_wref = io_ref;
6864
6865 if (is_temp)
6866 {
6867 buf_state += BUF_REFCOUNT_ONE;
6868 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6869 }
6870 else
6871 UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
6872
6873 /*
6874 * Ensure the content lock that prevents buffer modifications while
6875 * the buffer is being written out is not released early due to an
6876 * error.
6877 */
6878 if (is_write && !is_temp)
6879 {
6880 LWLock *content_lock;
6881
6882 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6883
6884 Assert(LWLockHeldByMe(content_lock));
6885
6886 /*
6887 * Lock is now owned by AIO subsystem.
6888 */
6889 LWLockDisown(content_lock);
6890 }
6891
6892 /*
6893 * Stop tracking this buffer via the resowner - the AIO system now
6894 * keeps track.
6895 */
6896 if (!is_temp)
6897 ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
6898 }
6899}
6900
6901/*
6902 * Decode readv errors as encoded by buffer_readv_encode_error().
6903 */
6904static inline void
6905 buffer_readv_decode_error(PgAioResult result,
6906 bool *zeroed_any,
6907 bool *ignored_any,
6908 uint8 *zeroed_or_error_count,
6909 uint8 *checkfail_count,
6910 uint8 *first_off)
6911{
6912 uint32 rem_error = result.error_data;
6913
6914 /* see static asserts in buffer_readv_encode_error */
6915#define READV_COUNT_BITS 7
6916#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6917
6918 *zeroed_any = rem_error & 1;
6919 rem_error >>= 1;
6920
6921 *ignored_any = rem_error & 1;
6922 rem_error >>= 1;
6923
6924 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6925 rem_error >>= READV_COUNT_BITS;
6926
6927 *checkfail_count = rem_error & READV_COUNT_MASK;
6928 rem_error >>= READV_COUNT_BITS;
6929
6930 *first_off = rem_error & READV_COUNT_MASK;
6931 rem_error >>= READV_COUNT_BITS;
6932}
6933
6934/*
6935 * Helper to encode errors for buffer_readv_complete()
6936 *
6937 * Errors are encoded as follows:
6938 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6939 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6940 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6941 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6942 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6943 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6944 * checksum
6945 */
6946static inline void
6947 buffer_readv_encode_error(PgAioResult *result,
6948 bool is_temp,
6949 bool zeroed_any,
6950 bool ignored_any,
6951 uint8 error_count,
6952 uint8 zeroed_count,
6953 uint8 checkfail_count,
6954 uint8 first_error_off,
6955 uint8 first_zeroed_off,
6956 uint8 first_ignored_off)
6957{
6958
6959 uint8 shift = 0;
6960 uint8 zeroed_or_error_count =
6961 error_count > 0 ? error_count : zeroed_count;
6962 uint8 first_off;
6963
6965 "PG_IOV_MAX is bigger than reserved space for error data");
6967 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6968
6969 /*
6970 * We only have space to encode one offset - but luckily that's good
6971 * enough. If there is an error, the error is the interesting offset, same
6972 * with a zeroed buffer vs an ignored buffer.
6973 */
6974 if (error_count > 0)
6975 first_off = first_error_off;
6976 else if (zeroed_count > 0)
6977 first_off = first_zeroed_off;
6978 else
6979 first_off = first_ignored_off;
6980
6981 Assert(!zeroed_any || error_count == 0);
6982
6983 result->error_data = 0;
6984
6985 result->error_data |= zeroed_any << shift;
6986 shift += 1;
6987
6988 result->error_data |= ignored_any << shift;
6989 shift += 1;
6990
6991 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6992 shift += READV_COUNT_BITS;
6993
6994 result->error_data |= ((uint32) checkfail_count) << shift;
6995 shift += READV_COUNT_BITS;
6996
6997 result->error_data |= ((uint32) first_off) << shift;
6998 shift += READV_COUNT_BITS;
6999
7000 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
7001 PGAIO_HCB_SHARED_BUFFER_READV;
7002 
7003 if (error_count > 0)
7004 result->status = PGAIO_RS_ERROR;
7005 else
7006 result->status = PGAIO_RS_WARNING;
7007
7008 /*
7009 * The encoding is complicated enough to warrant cross-checking it against
7010 * the decode function.
7011 */
7012#ifdef USE_ASSERT_CHECKING
7013 {
7014 bool zeroed_any_2,
7015 ignored_any_2;
7016 uint8 zeroed_or_error_count_2,
7017 checkfail_count_2,
7018 first_off_2;
7019
7021 &zeroed_any_2, &ignored_any_2,
7022 &zeroed_or_error_count_2,
7023 &checkfail_count_2,
7024 &first_off_2);
7025 Assert(zeroed_any == zeroed_any_2);
7026 Assert(ignored_any == ignored_any_2);
7027 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7028 Assert(checkfail_count == checkfail_count_2);
7029 Assert(first_off == first_off_2);
7030 }
7031#endif
7032
7033#undef READV_COUNT_BITS
7034#undef READV_COUNT_MASK
7035}
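/*
 * Worked example (not part of bufmgr.c) of the encoding above: zeroed_any =
 * 1, ignored_any = 0, zeroed_or_error_count = 2, checkfail_count = 0 and
 * first_off = 3 pack into error_data as
 *
 *		1 | (0 << 1) | (2 << 2) | (0 << 9) | (3 << 16) = 0x30009
 *
 * since READV_COUNT_BITS is 7, so the three counters occupy bits 2..8, 9..15
 * and 16..22 respectively.
 */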
7036
7037/*
7038 * Helper for AIO readv completion callbacks, supporting both shared and temp
7039 * buffers. Gets called once for each buffer in a multi-page read.
7040 */
7041 static pg_attribute_always_inline void
7042 buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
7043 uint8 flags, bool failed, bool is_temp,
7044 bool *buffer_invalid,
7045 bool *failed_checksum,
7046 bool *ignored_checksum,
7047 bool *zeroed_buffer)
7048{
7049 BufferDesc *buf_hdr = is_temp ?
7050 GetLocalBufferDescriptor(-buffer - 1) :
7051 GetBufferDescriptor(buffer - 1);
7052 BufferTag tag = buf_hdr->tag;
7053 char *bufdata = BufferGetBlock(buffer);
7054 uint32 set_flag_bits;
7055 int piv_flags;
7056
7057 /* check that the buffer is in the expected state for a read */
7058#ifdef USE_ASSERT_CHECKING
7059 {
7060 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7061
7062 Assert(buf_state & BM_TAG_VALID);
7063 Assert(!(buf_state & BM_VALID));
7064 /* temp buffers don't use BM_IO_IN_PROGRESS */
7065 if (!is_temp)
7066 Assert(buf_state & BM_IO_IN_PROGRESS);
7067 Assert(!(buf_state & BM_DIRTY));
7068 }
7069#endif
7070
7071 *buffer_invalid = false;
7072 *failed_checksum = false;
7073 *ignored_checksum = false;
7074 *zeroed_buffer = false;
7075
7076 /*
7077 * We ask PageIsVerified() to only log the message about checksum errors,
7078 * as the completion might be run in any backend (or IO workers). We will
7079 * report checksum errors in buffer_readv_report().
7080 */
7081 piv_flags = PIV_LOG_LOG;
7082
7083 /* the local zero_damaged_pages may differ from the definer's */
7084 if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
7085 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7086
7087 /* Check for garbage data. */
7088 if (!failed)
7089 {
7090 /*
7091 * If the buffer is not currently pinned by this backend, e.g. because
7092 * we're completing this IO after an error, the buffer data will have
7093 * been marked as inaccessible when the buffer was unpinned. The AIO
7094 * subsystem holds a pin, but that doesn't prevent the buffer from
7095 * having been marked as inaccessible. The completion might also be
7096 * executed in a different process.
7097 */
7098#ifdef USE_VALGRIND
7099 if (!BufferIsPinned(buffer))
7100 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7101#endif
7102
7103 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7104 failed_checksum))
7105 {
7106 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7107 {
7108 memset(bufdata, 0, BLCKSZ);
7109 *zeroed_buffer = true;
7110 }
7111 else
7112 {
7113 *buffer_invalid = true;
7114 /* mark buffer as having failed */
7115 failed = true;
7116 }
7117 }
7118 else if (*failed_checksum)
7119 *ignored_checksum = true;
7120
7121 /* undo what we did above */
7122#ifdef USE_VALGRIND
7123 if (!BufferIsPinned(buffer))
7124 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7125#endif
7126
7127 /*
7128 * Immediately log a message about the invalid page, but only to the
7129 * server log. The reason to do so immediately is that this may be
7130 * executed in a different backend than the one that originated the
7131 * request. A second reason is that the originator
7132 * might not process the query result immediately (because it is busy
7133 * doing another part of query processing) or at all (e.g. if it was
7134 * cancelled or errored out due to another IO also failing). The
7135 * definer of the IO will emit an ERROR or WARNING when processing the
7136 * IO's results.
7137 *
7138 * To avoid duplicating the code to emit these log messages, we reuse
7139 * buffer_readv_report().
7140 */
7141 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7142 {
7143 PgAioResult result_one = {0};
7144
7145 buffer_readv_encode_error(&result_one, is_temp,
7146 *zeroed_buffer,
7147 *ignored_checksum,
7148 *buffer_invalid,
7149 *zeroed_buffer ? 1 : 0,
7150 *failed_checksum ? 1 : 0,
7151 buf_off, buf_off, buf_off);
7152 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7153 }
7154 }
7155
7156 /* Terminate I/O and set BM_VALID. */
7157 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7158 if (is_temp)
7159 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7160 else
7161 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7162
7163 /*
7164 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7165 * callback may not be executed in the same backend that called
7166 * BUFFER_READ_START. The alternative would be to defer calling the
7167 * tracepoint to a later point (e.g. the local completion callback for
7168 * shared buffer reads), which seems even less helpful.
7169 */
7170 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7171 tag.blockNum,
7172 tag.spcOid,
7173 tag.dbOid,
7174 tag.relNumber,
7176 false);
7177}
7178
7179/*
7180 * Perform completion handling of a single AIO read. This read may cover
7181 * multiple blocks / buffers.
7182 *
7183 * Shared between shared and local buffers, to reduce code duplication.
7184 */
7185 static pg_attribute_always_inline PgAioResult
7186 buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7187 uint8 cb_data, bool is_temp)
7188{
7189 PgAioResult result = prior_result;
7190 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7191 uint8 first_error_off = 0;
7192 uint8 first_zeroed_off = 0;
7193 uint8 first_ignored_off = 0;
7194 uint8 error_count = 0;
7195 uint8 zeroed_count = 0;
7196 uint8 ignored_count = 0;
7197 uint8 checkfail_count = 0;
7198 uint64 *io_data;
7199 uint8 handle_data_len;
7200
7201 if (is_temp)
7202 {
7203 Assert(td->smgr.is_temp);
7205 }
7206 else
7207 Assert(!td->smgr.is_temp);
7208
7209 /*
7210 * Iterate over all the buffers affected by this IO and call the
7211 * per-buffer completion function for each buffer.
7212 */
7213 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7214 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7215 {
7216 Buffer buf = io_data[buf_off];
7217 bool failed;
7218 bool failed_verification = false;
7219 bool failed_checksum = false;
7220 bool zeroed_buffer = false;
7221 bool ignored_checksum = false;
7222
7224
7225 /*
7226 * If the entire I/O failed on a lower-level, each buffer needs to be
7227 * marked as failed. In case of a partial read, the first few buffers
7228 * may be ok.
7229 */
7230 failed =
7231 prior_result.status == PGAIO_RS_ERROR
7232 || prior_result.result <= buf_off;
7233
7234 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7235 &failed_verification,
7236 &failed_checksum,
7237 &ignored_checksum,
7238 &zeroed_buffer);
7239
7240 /*
7241 * Track information about the number of different kinds of error
7242 * conditions across all pages, as there can be multiple pages failing
7243 * verification as part of one IO.
7244 */
7245 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7246 first_error_off = buf_off;
7247 if (zeroed_buffer && zeroed_count++ == 0)
7248 first_zeroed_off = buf_off;
7249 if (ignored_checksum && ignored_count++ == 0)
7250 first_ignored_off = buf_off;
7251 if (failed_checksum)
7252 checkfail_count++;
7253 }
7254
7255 /*
7256 * If the smgr read succeeded [partially] and page verification failed for
7257 * some of the pages, adjust the IO's result state appropriately.
7258 */
7259 if (prior_result.status != PGAIO_RS_ERROR &&
7260 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7261 {
7262 buffer_readv_encode_error(&result, is_temp,
7263 zeroed_count > 0, ignored_count > 0,
7264 error_count, zeroed_count, checkfail_count,
7265 first_error_off, first_zeroed_off,
7266 first_ignored_off);
7267 pgaio_result_report(result, td, DEBUG1);
7268 }
7269
7270 /*
7271 * For shared relations this reporting is done in
7272 * shared_buffer_readv_complete_local().
7273 */
7274 if (is_temp && checkfail_count > 0)
7276 checkfail_count);
7277
7278 return result;
7279}
7280
7281/*
7282 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7283 * aio_local_buffer_readv_cb.
7284 *
7285 * The error is encoded / decoded in buffer_readv_encode_error() /
7286 * buffer_readv_decode_error().
7287 */
7288static void
7289 buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
7290 int elevel)
7291{
7292 int nblocks = td->smgr.nblocks;
7293 BlockNumber first = td->smgr.blockNum;
7294 BlockNumber last = first + nblocks - 1;
7295 ProcNumber errProc =
7296 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7297 RelPathStr rpath =
7298 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7299 bool zeroed_any,
7300 ignored_any;
7301 uint8 zeroed_or_error_count,
7302 checkfail_count,
7303 first_off;
7304 uint8 affected_count;
7305 const char *msg_one,
7306 *msg_mult,
7307 *det_mult,
7308 *hint_mult;
7309
7310 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7311 &zeroed_or_error_count,
7312 &checkfail_count,
7313 &first_off);
7314
7315 /*
7316 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7317 * special case, it's too irregular to be emitted the same way as the
7318 * other cases.
7319 */
7320 if (zeroed_any && ignored_any)
7321 {
7322 Assert(zeroed_any && ignored_any);
7323 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7324 Assert(result.status != PGAIO_RS_ERROR);
7325 affected_count = zeroed_or_error_count;
7326
7327 ereport(elevel,
7329 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7330 affected_count, checkfail_count, first, last, rpath.str),
7331 affected_count > 1 ?
7332 errdetail("Block %u held the first zeroed page.",
7333 first + first_off) : 0,
7334 errhint_plural("See server log for details about the other %d invalid block.",
7335 "See server log for details about the other %d invalid blocks.",
7336 affected_count + checkfail_count - 1,
7337 affected_count + checkfail_count - 1));
7338 return;
7339 }
7340
7341 /*
7342 * The other messages are highly repetitive. To avoid duplicating a long
7343 * and complicated ereport(), gather the translated format strings
7344 * separately and then do one common ereport.
7345 */
7346 if (result.status == PGAIO_RS_ERROR)
7347 {
7348 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7349 affected_count = zeroed_or_error_count;
7350 msg_one = _("invalid page in block %u of relation \"%s\"");
7351 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7352 det_mult = _("Block %u held the first invalid page.");
7353 hint_mult = _("See server log for the other %u invalid block(s).");
7354 }
7355 else if (zeroed_any && !ignored_any)
7356 {
7357 affected_count = zeroed_or_error_count;
7358 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7359 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7360 det_mult = _("Block %u held the first zeroed page.");
7361 hint_mult = _("See server log for the other %u zeroed block(s).");
7362 }
7363 else if (!zeroed_any && ignored_any)
7364 {
7365 affected_count = checkfail_count;
7366 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7367 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7368 det_mult = _("Block %u held the first ignored page.");
7369 hint_mult = _("See server log for the other %u ignored block(s).");
7370 }
7371 else
7373
7374 ereport(elevel,
7376 affected_count == 1 ?
7377 errmsg_internal(msg_one, first + first_off, rpath.str) :
7378 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7379 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7380 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7381}
7382
7383 static void
7384 shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7385 {
7386 buffer_stage_common(ioh, false, false);
7387}
7388
7389static PgAioResult
7390 shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7391 uint8 cb_data)
7392{
7393 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7394}
7395
7396/*
7397 * We need a backend-local completion callback for shared buffers, to be able
7398 * to report checksum errors correctly. Unfortunately that can only safely
7399 * happen if the reporting backend has previously called
7400 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7401 * the backend that started the IO. Hence this callback.
7402 */
7403static PgAioResult
7404 shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
7405 uint8 cb_data)
7406{
7407 bool zeroed_any,
7408 ignored_any;
7409 uint8 zeroed_or_error_count,
7410 checkfail_count,
7411 first_off;
7412
7413 if (prior_result.status == PGAIO_RS_OK)
7414 return prior_result;
7415
7416 buffer_readv_decode_error(prior_result,
7417 &zeroed_any,
7418 &ignored_any,
7419 &zeroed_or_error_count,
7420 &checkfail_count,
7421 &first_off);
7422
7423 if (checkfail_count)
7424 {
7426
7428 checkfail_count);
7429 }
7430
7431 return prior_result;
7432}
7433
7434 static void
7435 local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7436 {
7437 buffer_stage_common(ioh, false, true);
7438}
7439
7440static PgAioResult
7441 local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7442 uint8 cb_data)
7443{
7444 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7445}
7446
7447/* readv callback is passed READ_BUFFERS_* flags as callback data */
7448 const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
7449 .stage = shared_buffer_readv_stage,
7450 .complete_shared = shared_buffer_readv_complete,
7451 /* need a local callback to report checksum failures */
7452 .complete_local = shared_buffer_readv_complete_local,
7453 .report = buffer_readv_report,
7454};
7455
7456/* readv callback is passed READ_BUFFERS_* flags as callback data */
7457 const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
7458 .stage = local_buffer_readv_stage,
7459 
7460 /*
7461 * Note that this, in contrast to the shared_buffers case, uses
7462 * complete_local, as only the issuing backend has access to the required
7463 * datastructures. This is important in case the IO completion may be
7464 * consumed incidentally by another backend.
7465 */
7466 .complete_local = local_buffer_readv_complete,
7467 .report = buffer_readv_report,
7468};
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:330
bool pgaio_have_staged(void)
Definition: aio.c:1107
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:355
void pgaio_submit_staged(void)
Definition: aio.c:1123
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static uint32 pg_atomic_fetch_and_u32(volatile pg_atomic_uint32 *ptr, uint32 and_)
Definition: atomics.h:394
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:347
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:408
static uint32 pg_atomic_fetch_sub_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:379
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:293
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:237
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static uint32 UnlockBufHdrExt(BufferDesc *desc, uint32 old_buf_state, uint32 set_bits, uint32 unset_bits, int refcount_change)
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:147
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5651
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:5030
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5398
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4895
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6352
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7186
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:244
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4223
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1545
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4545
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3008
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7404
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1243
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1508
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:653
bool BufferIsLockedByMeInMode(Buffer buffer, int mode)
Definition: bufmgr.c:2869
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4283
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6551
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7441
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1470
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6683
int io_max_combine_limit
Definition: bufmgr.c:172
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4420
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:235
bool zero_damaged_pages
Definition: bufmgr.c:144
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3179
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6733
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:7042
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6318
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5915
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6559
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:845
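ExtendBufferedRel() hands back the newly added block already pinned, and with EB_LOCK_FIRST also exclusively locked, so the caller can initialize it right away. The sketch below shows that flow under simplifying assumptions: the helper name add_empty_page is hypothetical, and WAL-logging (e.g. log_newpage_buffer() for permanent relations) is deliberately omitted.

/*
 * Hypothetical helper: add one zero page to the main fork, initialize it,
 * and hand it back still pinned.  Real code on a permanent relation would
 * also WAL-log the new page before unlocking it.
 */
#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static Buffer
add_empty_page(Relation rel)
{
	Buffer		buf;
	Page		page;

	buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
							EB_LOCK_FIRST);
	page = BufferGetPage(buf);

	START_CRIT_SECTION();
	PageInit(page, BufferGetPageSize(buf), 0);	/* no special space */
	MarkBufferDirty(buf);
	END_CRIT_SECTION();

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return buf;					/* still pinned; caller releases */
}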
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3990
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6166
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7448
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:877
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1174
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1574
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1012
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1981
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4059
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1532
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5242
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4665
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6237
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:906
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7457
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2249
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4041
int io_combine_limit_guc
Definition: bufmgr.c:171
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6383
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4244
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6794
#define BUF_REUSABLE
Definition: bufmgr.c:81
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6221
static void BufferSync(int flags)
Definition: bufmgr.c:3343
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1745
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7435
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4166
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6582
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4209
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2911
static uint32 MaxProportionalPins
Definition: bufmgr.c:221
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2575
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3622
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3211
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4469
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7384
void UnlockBuffers(void)
Definition: bufmgr.c:5573
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:563
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7390
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2320
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5630
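ConditionalLockBuffer() supports an opportunistic pattern: take the exclusive content lock only if it is immediately available, otherwise drop the pin and let the caller retry later. A minimal sketch of that pattern (the helper name try_process_page is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: process a page only if its lock is uncontended. */
static bool
try_process_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	if (!ConditionalLockBuffer(buf))
	{
		ReleaseBuffer(buf);		/* contended: just drop the pin */
		return false;			/* caller can come back later */
	}

	/* ... modify or inspect BufferGetPage(buf) here ... */

	UnlockReleaseBuffer(buf);
	return true;
}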
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4437
int bgwriter_flush_after
Definition: bufmgr.c:179
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5366
pg_noinline uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6294
bool BufferIsLockedByMe(Buffer buffer)
Definition: bufmgr.c:2843
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition: bufmgr.c:3068
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4835
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4499
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5831
int checkpoint_flush_after
Definition: bufmgr.c:178
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5383
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1091
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6104
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3256
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6205
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6418
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1613
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6406
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2943
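MarkBufferDirty() expects the caller to hold a pin plus the exclusive content lock and to make the page change inside a critical section; the disk write itself is deferred to checkpoint or buffer replacement, as the file header notes. The sketch below applies that contract to the common "initialize a still-zeroed page" case; the helper name init_page_if_new is hypothetical and WAL-logging is omitted.

/*
 * Hypothetical helper: repair a page that was extended but never
 * initialized (e.g. after a crash between extension and first write).
 */
#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static void
init_page_if_new(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	if (PageIsNew(page))
	{
		START_CRIT_SECTION();
		PageInit(page, BufferGetPageSize(buf), 0);
		MarkBufferDirty(buf);	/* write happens later, not here */
		END_CRIT_SECTION();
	}

	UnlockReleaseBuffer(buf);
}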
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6592
int backend_flush_after
Definition: bufmgr.c:180
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2513
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7289
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2531
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5684
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5604
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5430
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4942
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6468
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6654
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:829
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:684
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
int maintenance_io_concurrency
Definition: bufmgr.c:162
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3247
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5306
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2154
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5128
int effective_io_concurrency
Definition: bufmgr.c:155
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6046
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1489
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:792
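ReadBufferExtended() is the strategy-aware entry point: combined with GetAccessStrategy(BAS_BULKREAD) it lets a whole-relation pass recycle a small ring of buffers instead of evicting large parts of shared_buffers. A sketch of such a scan (the helper name scan_main_fork is hypothetical):

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: share-lock and inspect every main-fork page. */
static void
scan_main_fork(Relation rel)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		CHECK_FOR_INTERRUPTS();

		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, strategy);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... inspect BufferGetPage(buf) here ... */
		UnlockReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}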
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6264
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6567
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6905
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:170
void InitBufferManagerAccess(void)
Definition: bufmgr.c:4007
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6947
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3920
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2487
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
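ReadBuffer() only pins the page; callers still need LockBuffer() before inspecting the contents and UnlockReleaseBuffer() to undo both lock and pin. A minimal sketch of that basic read pattern (the helper name block_is_initialized is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Hypothetical helper: report whether a block has ever been initialized. */
static bool
block_is_initialized(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pins the buffer */
	bool		result;

	LockBuffer(buf, BUFFER_LOCK_SHARE);			/* shared content lock */
	result = !PageIsNew(BufferGetPage(buf));
	UnlockReleaseBuffer(buf);					/* unlock + unpin */

	return result;
}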
void TrackNewBufferPin(Buffer buf)
Definition: bufmgr.c:3303
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5857
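ConditionalLockBufferForCleanup() acquires the same super-exclusive lock as LockBufferForCleanup(), but only if the content lock is free and no other backend holds a pin; the caller must already hold its own pin. A sketch of the usual skip-if-busy usage (the helper name try_cleanup_page is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical helper: attempt cleanup-style work on an already-pinned
 * buffer, giving up silently if the page is busy.
 */
static bool
try_cleanup_page(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* lock held or page pinned elsewhere */

	/* We now hold the exclusive lock and are the sole pinner. */
	/* ... prune, defragment, or otherwise rearrange the page ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);	/* keep the pin, drop the lock */
	return true;
}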
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
uint32 GetPinLimit(void)
Definition: bufmgr.c:2475
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5967
#define BUF_WRITTEN
Definition: bufmgr.c:80
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5346
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:203
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:204
#define P_NEW
Definition: bufmgr.h:198
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:425
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:174
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:392
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:124
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:173
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:168
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:169
void * Block
Definition: bufmgr.h:26
#define BMR_GET_SMGR(bmr)
Definition: bufmgr.h:118
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:128
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:205
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:376
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:468
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:233
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
PageData * Page
Definition: bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:469
#define pg_noinline
Definition: c.h:290
#define likely(x)
Definition: c.h:406
uint8_t uint8
Definition: c.h:541
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:228
#define Max(x, y)
Definition: c.h:1002
double float8
Definition: c.h:640
#define pg_attribute_always_inline
Definition: c.h:274
int16_t int16
Definition: c.h:538
int32_t int32
Definition: c.h:539
uint64_t uint64
Definition: c.h:544
#define pg_unreachable()
Definition: c.h:336
#define unlikely(x)
Definition: c.h:407
uint32_t uint32
Definition: c.h:543
#define lengthof(array)
Definition: c.h:792
#define MemSet(start, val, len)
Definition: c.h:1024
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:942
size_t Size
Definition: c.h:615
bool IsCatalogRelationOid(Oid relid)
Definition: catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition: catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:785
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1243
int errdetail(const char *fmt,...)
Definition: elog.c:1216
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1352
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1373
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:198
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:321
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:643
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:747
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:174
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:787
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:142
bool enableFsync
Definition: globals.c:129
ProcNumber MyProcNumber
Definition: globals.c:90
int VacuumCostPageMiss
Definition: globals.c:152
bool VacuumCostActive
Definition: globals.c:158
int VacuumCostBalance
Definition: globals.c:157
int MaxBackends
Definition: globals.c:146
int VacuumCostPageDirty
Definition: globals.c:153
int VacuumCostPageHit
Definition: globals.c:151
Assert(PointerIsAligned(start, uint64))
const char * str
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:841
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:523
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:1003
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1014
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:805
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:491
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:702
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:562
int NLocBuffer
Definition: localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:346
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:848
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: localbuf.c:665
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:119
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1883
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1345
void ForEachLWLockHeldByMe(void(*callback)(LWLock *, LWLockMode, void *), void *context)
Definition: lwlock.c:1962
LWLockMode
Definition: lwlock.h:111
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
static PgChecksumMode mode
Definition: pg_checksums.c:56
static int64 current_size
Definition: pg_checksums.c:64
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:47
static char * buf
Definition: pg_test_fsync.c:72
IOObject
Definition: pgstat.h:276
@ IOOBJECT_RELATION
Definition: pgstat.h:277
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:278
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:713
IOContext
Definition: pgstat.h:285
@ IOCONTEXT_NORMAL
Definition: pgstat.h:289
@ IOOP_EXTEND
Definition: pgstat.h:314
@ IOOP_READ
Definition: pgstat.h:315
@ IOOP_WRITEBACK
Definition: pgstat.h:311
@ IOOP_HIT
Definition: pgstat.h:309
@ IOOP_EVICT
Definition: pgstat.h:307
@ IOOP_REUSE
Definition: pgstat.h:310
@ IOOP_WRITE
Definition: pgstat.h:316
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:718
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define NUM_AUXILIARY_PROCS
Definition: proc.h:463
#define DELAY_CHKPT_START
Definition: proc.h:135
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:387
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:761
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
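The read-stream API above wraps the StartReadBuffers()/WaitReadBuffers() machinery: a callback supplies block numbers, the stream combines and issues the reads, and the consumer pulls back pinned buffers until InvalidBuffer marks the end. Below is a hedged sketch of streaming every block of a relation through read_stream_begin_smgr_relation(); the callback, its state struct, and the helper stream_whole_relation are illustrative, not part of PostgreSQL.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

/* Hypothetical callback state: hand out block numbers 0 .. nblocks-1. */
typedef struct EveryBlockState
{
	BlockNumber next;
	BlockNumber nblocks;
} EveryBlockState;

static BlockNumber
every_block_cb(ReadStream *stream, void *callback_private_data,
			   void *per_buffer_data)
{
	EveryBlockState *state = (EveryBlockState *) callback_private_data;

	if (state->next >= state->nblocks)
		return InvalidBlockNumber;	/* ends the stream */
	return state->next++;
}

/* Hypothetical helper: pull every main-fork page through a read stream. */
static void
stream_whole_relation(Relation rel)
{
	EveryBlockState state;
	ReadStream *stream;
	Buffer		buf;

	state.next = 0;
	state.nblocks = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);

	stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
											 NULL,	/* no strategy ring */
											 RelationGetSmgr(rel),
											 rel->rd_rel->relpersistence,
											 MAIN_FORKNUM,
											 every_block_cb,
											 &state,
											 0);	/* no per-buffer data */

	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* Buffer arrives pinned; examine it, then drop the pin. */
		ReleaseBuffer(buf);
	}

	read_stream_end(stream);
}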
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:577
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:647
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:668
#define RelationIsValid(relation)
Definition: rel.h:490
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:733
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1996
PGPROC * MyProc
Definition: proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:771
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1984
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
SMgrRelation smgr
Definition: bufmgr.h:110
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:222
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:257
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:242
PgStat_Counter maxwritten_clean
Definition: pgstat.h:243
PgStat_Counter buf_alloc
Definition: pgstat.h:244
PgStat_Counter buffers_written
Definition: pgstat.h:266
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:137
PgAioWaitRef io_wref
Definition: bufmgr.h:150
Buffer * buffers
Definition: bufmgr.h:145
SMgrRelation smgr
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:138
BlockNumber blocknum
Definition: bufmgr.h:146
PgAioReturn io_return
Definition: bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1837
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
struct PgAioTargetData::@125 smgr
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:171
bool RecoveryInProgress(void)
Definition: xlog.c:6406
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3127
CheckpointStatsData CheckpointStats
Definition: xlog.c:211
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2783
#define CHECKPOINT_FLUSH_UNLOGGED
Definition: xlog.h:143
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsValid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1087
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1259
#define InHotStandby
Definition: xlogutils.h:60