1/*-------------------------------------------------------------------------
2 *
3 * md.c
4 * This code manages relations that reside on magnetic disk.
5 *
6 * Or at least, that was what the Berkeley folk had in mind when they named
7 * this file. In reality, what this code provides is an interface from
8 * the smgr API to Unix-like filesystem APIs, so it will work with any type
9 * of device for which the operating system provides filesystem support.
10 * It doesn't matter whether the bits are on spinning rust or some other
11 * storage technology.
12 *
13 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 *
17 * IDENTIFICATION
18 * src/backend/storage/smgr/md.c
19 *
20 *-------------------------------------------------------------------------
21 */
22#include "postgres.h"
23
24#include <limits.h>
25#include <unistd.h>
26#include <fcntl.h>
27#include <sys/file.h>
28
29#include "access/xlogutils.h"
30#include "commands/tablespace.h"
31#include "common/file_utils.h"
32#include "miscadmin.h"
33#include "pg_trace.h"
34#include "pgstat.h"
35#include "storage/aio.h"
36#include "storage/bufmgr.h"
37#include "storage/fd.h"
38#include "storage/md.h"
39#include "storage/relfilelocator.h"
40#include "storage/smgr.h"
41#include "storage/sync.h"
42#include "utils/memutils.h"
43
44/*
45 * The magnetic disk storage manager keeps track of open file
46 * descriptors in its own descriptor pool. This is done to make it
47 * easier to support relations that are larger than the operating
48 * system's file size limit (often 2GBytes). In order to do that,
49 * we break relations up into "segment" files that are each shorter than
50 * the OS file size limit. The segment size is set by the RELSEG_SIZE
51 * configuration constant in pg_config.h.
52 *
53 * On disk, a relation must consist of consecutively numbered segment
54 * files in the pattern
55 * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
56 * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
57 * -- Optionally, any number of inactive segments of size 0 blocks.
58 * The full and partial segments are collectively the "active" segments.
59 * Inactive segments are those that once contained data but are currently
60 * not needed because of an mdtruncate() operation. The reason for leaving
61 * them present at size zero, rather than unlinking them, is that other
62 * backends and/or the checkpointer might be holding open file references to
63 * such segments. If the relation expands again after mdtruncate(), such
64 * that a deactivated segment becomes active again, it is important that
65 * such file references still be valid --- else data might get written
66 * out to an unlinked old copy of a segment file that will eventually
67 * disappear.
68 *
69 * RELSEG_SIZE must fit into BlockNumber; but since we expose its value
70 * as an integer GUC, it actually needs to fit in signed int. It's worth
71 * having a cross-check for this since configure's --with-segsize options
72 * could let people select insane values.
73 */
74StaticAssertDecl(RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
75 "RELSEG_SIZE must fit in an integer");
76
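As a minimal sketch of the arithmetic implied by the comment above (illustrative only, not part of md.c; the helper name is invented, but RELSEG_SIZE, BLCKSZ, BlockNumber and pgoff_t are the symbols this file already uses), a block number maps onto a segment file and a byte offset within it as follows:

/*
 * Illustrative sketch, not part of md.c: how a BlockNumber splits into a
 * segment number and a byte offset, mirroring _mdfd_getseg() and the
 * seekpos computations later in this file.
 */
static inline void
md_locate_block(BlockNumber blkno, BlockNumber *segno, pgoff_t *seekpos)
{
	/* which segment file: "path" for segment 0, "path.N" otherwise */
	*segno = blkno / ((BlockNumber) RELSEG_SIZE);
	/* byte offset of the block within that segment file */
	*seekpos = (pgoff_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
}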
77/*
78 * File descriptors are stored in the per-fork md_seg_fds arrays inside
79 * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
80 * Note that a fork's md_num_open_segs having a specific value does not
81 * necessarily mean the relation doesn't have additional segments; we may
82 * just not have opened the next segment yet. (We could not have "all
83 * segments are in the array" as an invariant anyway, since another backend
84 * could extend the relation while we aren't looking.) We do not have
85 * entries for inactive segments, however; as soon as we find a partial
86 * segment, we assume that any subsequent segments are inactive.
87 *
88 * The entire MdfdVec array is palloc'd in the MdCxt memory context.
89 */
90
91typedef struct _MdfdVec
92{
93 File mdfd_vfd; /* fd number in fd.c's pool */
94 BlockNumber mdfd_segno; /* segment number, from 0 */
95} MdfdVec;
96
97static MemoryContext MdCxt; /* context for all MdfdVec objects */
98
99
100/* Populate a file tag describing an md.c segment file. */
101#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
102( \
103 memset(&(a), 0, sizeof(FileTag)), \
104 (a).handler = SYNC_HANDLER_MD, \
105 (a).rlocator = (xx_rlocator), \
106 (a).forknum = (xx_forknum), \
107 (a).segno = (xx_segno) \
108)
109
110
111/*** behavior for mdopen & _mdfd_getseg ***/
112/* ereport if segment not present */
113#define EXTENSION_FAIL (1 << 0)
114/* return NULL if segment not present */
115#define EXTENSION_RETURN_NULL (1 << 1)
116/* create new segments as needed */
117#define EXTENSION_CREATE (1 << 2)
118/* create new segments if needed during recovery */
119#define EXTENSION_CREATE_RECOVERY (1 << 3)
120/* don't try to open a segment, if not already open */
121#define EXTENSION_DONT_OPEN (1 << 5)
122
123
124/*
125 * Fixed-length string to represent paths to files that need to be built by
126 * md.c.
127 *
128 * The maximum number of segments is MaxBlockNumber / RELSEG_SIZE, where
129 * RELSEG_SIZE can be set to 1 (for testing only).
130 */
131#define SEGMENT_CHARS OIDCHARS
132#define MD_PATH_STR_MAXLEN \
133 (\
134 REL_PATH_STR_MAXLEN \
135 + sizeof((char)'.') \
136 + SEGMENT_CHARS \
137 )
138typedef struct MdPathStr
139{
140 char str[MD_PATH_STR_MAXLEN + 1];
141} MdPathStr;
142
143
144/* local routines */
145static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
146 bool isRedo);
147static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
148static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
149 MdfdVec *seg);
150static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
151 BlockNumber segno);
152static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
153 BlockNumber segno);
154static void _fdvec_resize(SMgrRelation reln,
155 ForkNumber forknum,
156 int nseg);
157static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
158 BlockNumber segno);
159static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
160 BlockNumber segno, int oflags);
161static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
162 BlockNumber blkno, bool skipFsync, int behavior);
163static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
164 MdfdVec *seg);
165
166static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
167static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
168
169const PgAioHandleCallbacks aio_md_readv_cb = {
170 .complete_shared = md_readv_complete,
171 .report = md_readv_report,
172};
173
174
175static inline int
176_mdfd_open_flags(void)
177{
178 int flags = O_RDWR | PG_BINARY;
179
180 if (io_direct_flags & IO_DIRECT_DATA)
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}
185
186/*
187 * mdinit() -- Initialize private state for magnetic disk storage manager.
188 */
189void
190mdinit(void)
191{
192 MdCxt = AllocSetContextCreate(TopMemoryContext,
193 "MdSmgr",
194 ALLOCSET_DEFAULT_SIZES);
195}
196
197/*
198 * mdexists() -- Does the physical file exist?
199 *
200 * Note: this will return true for lingering files, with pending deletions
201 */
202bool
203mdexists(SMgrRelation reln, ForkNumber forknum)
204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}
215
216/*
217 * mdcreate() -- Create a new relation on magnetic disk.
218 *
219 * If isRedo is true, it's okay for the relation to exist already.
220 */
221void
222mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
242 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
243 reln->smgr_rlocator.locator.dbOid,
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
248 fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
255 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
259 errno = save_errno;
260 ereport(ERROR,
261 (errcode_for_file_access(),
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
272 register_dirty_segment(reln, forknum, mdfd);
273}
274
275/*
276 * mdunlink() -- Unlink a relation.
277 *
278 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
279 * there won't be an SMgrRelation hashtable entry anymore.
280 *
281 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
282 * to delete all forks.
283 *
284 * For regular relations, we don't unlink the first segment file of the rel,
285 * but just truncate it to zero length, and record a request to unlink it after
286 * the next checkpoint. Additional segments can be unlinked immediately,
287 * however. Leaving the empty file in place prevents that relfilenumber
288 * from being reused. The scenario this protects us from is:
289 * 1. We delete a relation (and commit, and actually remove its file).
290 * 2. We create a new relation, which by chance gets the same relfilenumber as
291 * the just-deleted one (OIDs must've wrapped around for that to happen).
292 * 3. We crash before another checkpoint occurs.
293 * During replay, we would delete the file and then recreate it, which is fine
294 * if the contents of the file were repopulated by subsequent WAL entries.
295 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
296 * file after populating it (as we do at wal_level=minimal), the contents of
297 * the file would be lost forever. By leaving the empty file until after the
298 * next checkpoint, we prevent reassignment of the relfilenumber until it's
299 * safe, because relfilenumber assignment skips over any existing file.
300 *
301 * Additional segments, if any, are truncated and then unlinked. The reason
302 * for truncating is that other backends may still hold open FDs for these at
303 * the smgr level, so that the kernel can't remove the file yet. We want to
304 * reclaim the disk space right away despite that.
305 *
306 * We do not need to go through this dance for temp relations, though, because
307 * we never make WAL entries for temp rels, and so a temp rel poses no threat
308 * to the health of a regular rel that has taken over its relfilenumber.
309 * The fact that temp rels and regular rels have different file naming
310 * patterns provides additional safety. Other backends shouldn't have open
311 * FDs for them, either.
312 *
313 * We also don't do it while performing a binary upgrade. There is no reuse
314 * hazard in that case, since after a crash or even a simple ERROR, the
315 * upgrade fails and the whole cluster must be recreated from scratch.
316 * Furthermore, it is important to remove the files from disk immediately,
317 * because we may be about to reuse the same relfilenumber.
318 *
319 * All the above applies only to the relation's main fork; other forks can
320 * just be removed immediately, since they are not needed to prevent the
321 * relfilenumber from being recycled. Also, we do not carefully
322 * track whether other forks have been created or not, but just attempt to
323 * unlink them unconditionally; so we should never complain about ENOENT.
324 *
325 * If isRedo is true, it's unsurprising for the relation to be already gone.
326 * Also, we should remove the file immediately instead of queuing a request
327 * for later, since during redo there's no possibility of creating a
328 * conflicting relation.
329 *
330 * Note: we currently just never warn about ENOENT at all. We could warn in
331 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
332 *
333 * Note: any failure should be reported as WARNING not ERROR, because
334 * we are usually not in a transaction anymore when this is called.
335 */
336void
337mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
338{
339 /* Now do the per-fork work */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}
348
349/*
350 * Truncate a file to release disk space.
351 */
352static int
353do_truncate(const char *path)
354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
362 {
363 save_errno = errno;
364 ereport(WARNING,
365 (errcode_for_file_access(),
366 errmsg("could not truncate file \"%s\": %m", path)));
367 errno = save_errno;
368 }
369
370 return ret;
371}
372
373static void
374mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
387 RelFileLocatorBackendIsTemp(rlocator))
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
395 save_errno = errno;
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
397 errno = save_errno;
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
408 save_errno = errno;
409 ereport(WARNING,
410 (errcode_for_file_access(),
411 errmsg("could not remove file \"%s\": %m", path.str)));
412 errno = save_errno;
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
422 save_errno = errno;
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
424 errno = save_errno;
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
441 MdPathStr segpath;
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
468 ereport(WARNING,
469 (errcode_for_file_access(),
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}
476
477/*
478 * mdextend() -- Add a block to the specified relation.
479 *
480 * The semantics are nearly the same as mdwrite(): write at the
481 * specified position. However, this is to be used for the case of
482 * extending a relation (i.e., blocknum is at or beyond the current
483 * EOF). Note that we assume writing a block beyond current EOF
484 * causes intervening file space to become filled with zeroes.
485 */
486void
487mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
488 const void *buffer, bool skipFsync)
489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
510 ereport(ERROR,
511 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
514 InvalidBlockNumber)));
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
524 if (nbytes < 0)
525 ereport(ERROR,
526 (errcode_for_file_access(),
527 errmsg("could not extend file \"%s\": %m",
528 FilePathName(v->mdfd_vfd)),
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
531 ereport(ERROR,
532 (errcode(ERRCODE_DISK_FULL),
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
534 FilePathName(v->mdfd_vfd),
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}
544
545/*
546 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
547 *
548 * Similar to mdextend(), except the relation can be extended by multiple
549 * blocks at once and the added blocks will be filled with zeroes.
550 */
551void
552mdzeroextend(SMgrRelation reln, ForkNumber forknum,
553 BlockNumber blocknum, int nblocks, bool skipFsync)
554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
572 ereport(ERROR,
573 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
576 InvalidBlockNumber)));
577
578 while (remblocks > 0)
579 {
580 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
584 if (segstartblock + remblocks > RELSEG_SIZE)
585 numblocks = RELSEG_SIZE - segstartblock;
586 else
587 numblocks = remblocks;
588
589 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
590
591 Assert(segstartblock < RELSEG_SIZE);
592 Assert(segstartblock + numblocks <= RELSEG_SIZE);
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8)
606 {
607 int ret;
608
609 ret = FileFallocate(v->mdfd_vfd,
610 seekpos, (pgoff_t) BLCKSZ * numblocks,
611 WAIT_EVENT_DATA_FILE_EXTEND);
612 if (ret != 0)
613 {
614 ereport(ERROR,
615 errcode_for_file_access(),
616 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
617 FilePathName(v->mdfd_vfd)),
618 errhint("Check free disk space."));
619 }
620 }
621 else
622 {
623 int ret;
624
625 /*
626 * Even if we don't want to use fallocate, we can still extend a
627 * bit more efficiently than writing each 8kB block individually.
628 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
629 * to avoid multiple writes or needing a zeroed buffer for the
630 * whole length of the extension.
631 */
632 ret = FileZero(v->mdfd_vfd,
633 seekpos, (pgoff_t) BLCKSZ * numblocks,
634 WAIT_EVENT_DATA_FILE_EXTEND);
635 if (ret < 0)
636 ereport(ERROR,
637 errcode_for_file_access(),
638 errmsg("could not extend file \"%s\": %m",
639 FilePathName(v->mdfd_vfd)),
640 errhint("Check free disk space."));
641 }
642
643 if (!skipFsync && !SmgrIsTemp(reln))
644 register_dirty_segment(reln, forknum, v);
645
646 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
647
648 remblocks -= numblocks;
649 curblocknum += numblocks;
650 }
651}
652
653/*
654 * mdopenfork() -- Open one fork of the specified relation.
655 *
656 * Note we only open the first segment, when there are multiple segments.
657 *
658 * If first segment is not present, either ereport or return NULL according
659 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
660 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
661 * invent one out of whole cloth.
662 */
663static MdfdVec *
664mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
665{
666 MdfdVec *mdfd;
667 RelPathStr path;
668 File fd;
669
670 /* No work if already open */
671 if (reln->md_num_open_segs[forknum] > 0)
672 return &reln->md_seg_fds[forknum][0];
673
674 path = relpath(reln->smgr_rlocator, forknum);
675
676 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
677
678 if (fd < 0)
679 {
680 if ((behavior & EXTENSION_RETURN_NULL) &&
681 FILE_POSSIBLY_DELETED(errno))
682 return NULL;
683 ereport(ERROR,
684 (errcode_for_file_access(),
685 errmsg("could not open file \"%s\": %m", path.str)));
686 }
687
688 _fdvec_resize(reln, forknum, 1);
689 mdfd = &reln->md_seg_fds[forknum][0];
690 mdfd->mdfd_vfd = fd;
691 mdfd->mdfd_segno = 0;
692
693 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
694
695 return mdfd;
696}
697
698/*
699 * mdopen() -- Initialize newly-opened relation.
700 */
701void
702mdopen(SMgrRelation reln)
703{
704 /* mark it not open */
705 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
706 reln->md_num_open_segs[forknum] = 0;
707}
708
709/*
710 * mdclose() -- Close the specified relation, if it isn't closed already.
711 */
712void
713mdclose(SMgrRelation reln, ForkNumber forknum)
714{
715 int nopensegs = reln->md_num_open_segs[forknum];
716
717 /* No work if already closed */
718 if (nopensegs == 0)
719 return;
720
721 /* close segments starting from the end */
722 while (nopensegs > 0)
723 {
724 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
725
726 FileClose(v->mdfd_vfd);
727 _fdvec_resize(reln, forknum, nopensegs - 1);
728 nopensegs--;
729 }
730}
731
732/*
733 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
734 */
735bool
736mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
737 int nblocks)
738{
739#ifdef USE_PREFETCH
740
741 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
742
743 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
744 return false;
745
746 while (nblocks > 0)
747 {
748 pgoff_t seekpos;
749 MdfdVec *v;
750 int nblocks_this_segment;
751
752 v = _mdfd_getseg(reln, forknum, blocknum, false,
753 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
754 if (v == NULL)
755 return false;
756
757 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
758
759 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
760
761 nblocks_this_segment =
762 Min(nblocks,
763 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
764
765 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
766 WAIT_EVENT_DATA_FILE_PREFETCH);
767
768 blocknum += nblocks_this_segment;
769 nblocks -= nblocks_this_segment;
770 }
771#endif /* USE_PREFETCH */
772
773 return true;
774}
775
776/*
777 * Convert an array of buffer address into an array of iovec objects, and
778 * return the number that were required. 'iov' must have enough space for up
779 * to 'nblocks' elements, but the number used may be less depending on
780 * merging. In the case of a run of fully contiguous buffers, a single iovec
781 * will be populated that can be handled as a plain non-vectored I/O.
782 */
783static int
784buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
785{
786 struct iovec *iovp;
787 int iovcnt;
788
789 Assert(nblocks >= 1);
790
791 /* If this build supports direct I/O, buffers must be I/O aligned. */
792 for (int i = 0; i < nblocks; ++i)
793 {
794 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
795 Assert((uintptr_t) buffers[i] ==
796 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
797 }
798
799 /* Start the first iovec off with the first buffer. */
800 iovp = &iov[0];
801 iovp->iov_base = buffers[0];
802 iovp->iov_len = BLCKSZ;
803 iovcnt = 1;
804
805 /* Try to merge the rest. */
806 for (int i = 1; i < nblocks; ++i)
807 {
808 void *buffer = buffers[i];
809
810 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
811 {
812 /* Contiguous with the last iovec. */
813 iovp->iov_len += BLCKSZ;
814 }
815 else
816 {
817 /* Need a new iovec. */
818 iovp++;
819 iovp->iov_base = buffer;
820 iovp->iov_len = BLCKSZ;
821 iovcnt++;
822 }
823 }
824
825 return iovcnt;
826}
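A brief usage sketch (illustrative only, not part of md.c) of the merging behavior described above, assuming an I/O-aligned scratch allocation like md.c's real callers provide; two adjacent blocks collapse into one iovec while a non-adjacent block starts another:

/*
 * Illustrative sketch, not part of md.c: how callers such as mdreadv()
 * could observe buffers_to_iovec() merging contiguous buffers.
 */
static void
buffers_to_iovec_example(void)
{
	char	   *pages = palloc_aligned(4 * BLCKSZ, PG_IO_ALIGN_SIZE, 0);
	void	   *buffers[3];
	struct iovec iov[3];
	int			iovcnt;

	buffers[0] = pages;					/* block at offset 0 */
	buffers[1] = pages + BLCKSZ;		/* contiguous with buffers[0] */
	buffers[2] = pages + 3 * BLCKSZ;	/* skips a block, so not contiguous */

	iovcnt = buffers_to_iovec(iov, buffers, 3);

	/* the first two buffers merged; the third needed its own iovec */
	Assert(iovcnt == 2);
	Assert(iov[0].iov_len == 2 * BLCKSZ);
	Assert(iov[1].iov_len == BLCKSZ);

	pfree(pages);
}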
827
828/*
829 * mdmaxcombine() -- Return the maximum number of total blocks that can be
830 * combined with an IO starting at blocknum.
831 */
832uint32
833mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
834 BlockNumber blocknum)
835{
836 BlockNumber segoff;
837
838 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
839
840 return RELSEG_SIZE - segoff;
841}
842
843/*
844 * mdreadv() -- Read the specified blocks from a relation.
845 */
846void
847mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
848 void **buffers, BlockNumber nblocks)
849{
850 while (nblocks > 0)
851 {
852 struct iovec iov[PG_IOV_MAX];
853 int iovcnt;
854 pgoff_t seekpos;
855 int nbytes;
856 MdfdVec *v;
857 BlockNumber nblocks_this_segment;
858 size_t transferred_this_segment;
859 size_t size_this_segment;
860
861 v = _mdfd_getseg(reln, forknum, blocknum, false,
862 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
863
864 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
865
866 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
867
868 nblocks_this_segment =
869 Min(nblocks,
870 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
871 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
872
873 if (nblocks_this_segment != nblocks)
874 elog(ERROR, "read crosses segment boundary");
875
876 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
877 size_this_segment = nblocks_this_segment * BLCKSZ;
878 transferred_this_segment = 0;
879
880 /*
881 * Inner loop to continue after a short read. We'll keep going until
882 * we hit EOF rather than assuming that a short read means we hit the
883 * end.
884 */
885 for (;;)
886 {
887 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
888 reln->smgr_rlocator.locator.spcOid,
889 reln->smgr_rlocator.locator.dbOid,
890 reln->smgr_rlocator.locator.relNumber,
891 reln->smgr_rlocator.backend);
892 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
893 WAIT_EVENT_DATA_FILE_READ);
894 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
895 reln->smgr_rlocator.locator.spcOid,
896 reln->smgr_rlocator.locator.dbOid,
897 reln->smgr_rlocator.locator.relNumber,
898 reln->smgr_rlocator.backend,
899 nbytes,
900 size_this_segment - transferred_this_segment);
901
902#ifdef SIMULATE_SHORT_READ
903 nbytes = Min(nbytes, 4096);
904#endif
905
906 if (nbytes < 0)
907 ereport(ERROR,
908 (errcode_for_file_access(),
909 errmsg("could not read blocks %u..%u in file \"%s\": %m",
910 blocknum,
911 blocknum + nblocks_this_segment - 1,
912 FilePathName(v->mdfd_vfd))));
913
914 if (nbytes == 0)
915 {
916 /*
917 * We are at or past EOF, or we read a partial block at EOF.
918 * Normally this is an error; upper levels should never try to
919 * read a nonexistent block. However, if zero_damaged_pages
920 * is ON or we are InRecovery, we should instead return zeroes
921 * without complaining. This allows, for example, the case of
922 * trying to update a block that was later truncated away.
923 *
924 * NB: We think that this codepath is unreachable in recovery
925 * and incomplete with zero_damaged_pages, as missing segments
926 * are not created. Putting blocks into the buffer-pool that
927 * do not exist on disk is rather problematic, as it will not
928 * be found by scans that rely on smgrnblocks(), as they are
929 * beyond EOF. It also can cause weird problems with relation
930 * extension, as relation extension does not expect blocks
931 * beyond EOF to exist.
932 *
933 * Therefore we do not want to copy the logic into
934 * mdstartreadv(), where it would have to be more complicated
935 * due to potential differences in the zero_damaged_pages
936 * setting between the definer and completor of IO.
937 *
938 * For PG 18, we are putting an Assert(false) in mdreadv()
939 * (triggering failures in assertion-enabled builds, but
940 * continuing to work in production builds). Afterwards we
941 * plan to remove this code entirely.
942 */
943 if (zero_damaged_pages || InRecovery)
944 {
945 Assert(false); /* see comment above */
946
947 for (BlockNumber i = transferred_this_segment / BLCKSZ;
948 i < nblocks_this_segment;
949 ++i)
950 memset(buffers[i], 0, BLCKSZ);
951 break;
952 }
953 else
954 ereport(ERROR,
955 (errcode(ERRCODE_DATA_CORRUPTED),
956 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
957 blocknum,
958 blocknum + nblocks_this_segment - 1,
959 FilePathName(v->mdfd_vfd),
960 transferred_this_segment,
961 size_this_segment)));
962 }
963
964 /* One loop should usually be enough. */
965 transferred_this_segment += nbytes;
966 Assert(transferred_this_segment <= size_this_segment);
967 if (transferred_this_segment == size_this_segment)
968 break;
969
970 /* Adjust position and vectors after a short read. */
971 seekpos += nbytes;
972 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
973 }
974
975 nblocks -= nblocks_this_segment;
976 buffers += nblocks_this_segment;
977 blocknum += nblocks_this_segment;
978 }
979}
980
981/*
982 * mdstartreadv() -- Asynchronous version of mdreadv().
983 */
984void
985mdstartreadv(PgAioHandle *ioh,
986 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
987 void **buffers, BlockNumber nblocks)
988{
989 pgoff_t seekpos;
990 MdfdVec *v;
991 BlockNumber nblocks_this_segment;
992 struct iovec *iov;
993 int iovcnt;
994 int ret;
995
996 v = _mdfd_getseg(reln, forknum, blocknum, false,
997 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
998
999 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1000
1001 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1002
1003 nblocks_this_segment =
1004 Min(nblocks,
1005 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1006
1007 if (nblocks_this_segment != nblocks)
1008 elog(ERROR, "read crossing segment boundary");
1009
1010 iovcnt = pgaio_io_get_iovec(ioh, &iov);
1011
1012 Assert(nblocks <= iovcnt);
1013
1014 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
1015
1016 Assert(iovcnt <= nblocks_this_segment);
1017
1018 if (!(io_direct_flags & IO_DIRECT_DATA))
1019 pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
1020
1021 pgaio_io_set_target_smgr(ioh,
1022 reln,
1023 forknum,
1024 blocknum,
1025 nblocks,
1026 false);
1027 pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
1028
1029 ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
1030 if (ret != 0)
1031 ereport(ERROR,
1032 (errcode_for_file_access(),
1033 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1034 blocknum,
1035 blocknum + nblocks_this_segment - 1,
1036 FilePathName(v->mdfd_vfd))));
1037
1038 /*
1039 * The error checks corresponding to the post-read checks in mdreadv() are
1040 * in md_readv_complete().
1041 *
1042 * However we chose, at least for now, to not implement the
1043 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1044 * that logic is rather problematic, and we want to get rid of it. Here
1045 * equivalent logic would have to be more complicated due to potential
1046 * differences in the zero_damaged_pages setting between the definer and
1047 * completor of IO.
1048 */
1049}
1050
1051/*
1052 * mdwritev() -- Write the supplied blocks at the appropriate location.
1053 *
1054 * This is to be used only for updating already-existing blocks of a
1055 * relation (ie, those before the current EOF). To extend a relation,
1056 * use mdextend().
1057 */
1058void
1059mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
1060 const void **buffers, BlockNumber nblocks, bool skipFsync)
1061{
1062 /* This assert is too expensive to have on normally ... */
1063#ifdef CHECK_WRITE_VS_EXTEND
1064 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1065#endif
1066
1067 while (nblocks > 0)
1068 {
1069 struct iovec iov[PG_IOV_MAX];
1070 int iovcnt;
1071 pgoff_t seekpos;
1072 int nbytes;
1073 MdfdVec *v;
1074 BlockNumber nblocks_this_segment;
1075 size_t transferred_this_segment;
1076 size_t size_this_segment;
1077
1078 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1079 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
1080
1081 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1082
1083 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1084
1085 nblocks_this_segment =
1086 Min(nblocks,
1087 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1088 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
1089
1090 if (nblocks_this_segment != nblocks)
1091 elog(ERROR, "write crosses segment boundary");
1092
1093 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1094 size_this_segment = nblocks_this_segment * BLCKSZ;
1095 transferred_this_segment = 0;
1096
1097 /*
1098 * Inner loop to continue after a short write. If the reason is that
1099 * we're out of disk space, a future attempt should get an ENOSPC
1100 * error from the kernel.
1101 */
1102 for (;;)
1103 {
1104 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1105 reln->smgr_rlocator.locator.spcOid,
1106 reln->smgr_rlocator.locator.dbOid,
1107 reln->smgr_rlocator.locator.relNumber,
1108 reln->smgr_rlocator.backend);
1109 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1110 WAIT_EVENT_DATA_FILE_WRITE);
1111 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1112 reln->smgr_rlocator.locator.spcOid,
1113 reln->smgr_rlocator.locator.dbOid,
1114 reln->smgr_rlocator.locator.relNumber,
1115 reln->smgr_rlocator.backend,
1116 nbytes,
1117 size_this_segment - transferred_this_segment);
1118
1119#ifdef SIMULATE_SHORT_WRITE
1120 nbytes = Min(nbytes, 4096);
1121#endif
1122
1123 if (nbytes < 0)
1124 {
1125 bool enospc = errno == ENOSPC;
1126
1127 ereport(ERROR,
1128 (errcode_for_file_access(),
1129 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1130 blocknum,
1131 blocknum + nblocks_this_segment - 1,
1132 FilePathName(v->mdfd_vfd)),
1133 enospc ? errhint("Check free disk space.") : 0));
1134 }
1135
1136 /* One loop should usually be enough. */
1137 transferred_this_segment += nbytes;
1138 Assert(transferred_this_segment <= size_this_segment);
1139 if (transferred_this_segment == size_this_segment)
1140 break;
1141
1142 /* Adjust position and iovecs after a short write. */
1143 seekpos += nbytes;
1144 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1145 }
1146
1147 if (!skipFsync && !SmgrIsTemp(reln))
1148 register_dirty_segment(reln, forknum, v);
1149
1150 nblocks -= nblocks_this_segment;
1151 buffers += nblocks_this_segment;
1152 blocknum += nblocks_this_segment;
1153 }
1154}
1155
1156
1157/*
1158 * mdwriteback() -- Tell the kernel to write pages back to storage.
1159 *
1160 * This accepts a range of blocks because flushing several pages at once is
1161 * considerably more efficient than doing so individually.
1162 */
1163void
1164mdwriteback(SMgrRelation reln, ForkNumber forknum,
1165 BlockNumber blocknum, BlockNumber nblocks)
1166{
1167 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
1168
1169 /*
1170 * Issue flush requests in as few requests as possible; have to split at
1171 * segment boundaries though, since those are actually separate files.
1172 */
1173 while (nblocks > 0)
1174 {
1175 BlockNumber nflush = nblocks;
1176 pgoff_t seekpos;
1177 MdfdVec *v;
1178 int segnum_start,
1179 segnum_end;
1180
1181 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1182 EXTENSION_DONT_OPEN);
1183
1184 /*
1185 * We might be flushing buffers of already removed relations, that's
1186 * ok, just ignore that case. If the segment file wasn't open already
1187 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1188 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1189 * us with a descriptor to a file that is about to be unlinked.
1190 */
1191 if (!v)
1192 return;
1193
1194 /* compute offset inside the current segment */
1195 segnum_start = blocknum / RELSEG_SIZE;
1196
1197 /* compute number of desired writes within the current segment */
1198 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1199 if (segnum_start != segnum_end)
1200 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1201
1202 Assert(nflush >= 1);
1203 Assert(nflush <= nblocks);
1204
1205 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1206
1207 FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1208
1209 nblocks -= nflush;
1210 blocknum += nflush;
1211 }
1212}
1213
1214/*
1215 * mdnblocks() -- Get the number of blocks stored in a relation.
1216 *
1217 * Important side effect: all active segments of the relation are opened
1218 * and added to the md_seg_fds array. If this routine has not been
1219 * called, then only segments up to the last one actually touched
1220 * are present in the array.
1221 */
1222BlockNumber
1223mdnblocks(SMgrRelation reln, ForkNumber forknum)
1224{
1225 MdfdVec *v;
1226 BlockNumber nblocks;
1227 BlockNumber segno;
1228
1229 mdopenfork(reln, forknum, EXTENSION_FAIL);
1230
1231 /* mdopen has opened the first segment */
1232 Assert(reln->md_num_open_segs[forknum] > 0);
1233
1234 /*
1235 * Start from the last open segments, to avoid redundant seeks. We have
1236 * previously verified that these segments are exactly RELSEG_SIZE long,
1237 * and it's useless to recheck that each time.
1238 *
1239 * NOTE: this assumption could only be wrong if another backend has
1240 * truncated the relation. We rely on higher code levels to handle that
1241 * scenario by closing and re-opening the md fd, which is handled via
1242 * relcache flush. (Since the checkpointer doesn't participate in
1243 * relcache flush, it could have segment entries for inactive segments;
1244 * that's OK because the checkpointer never needs to compute relation
1245 * size.)
1246 */
1247 segno = reln->md_num_open_segs[forknum] - 1;
1248 v = &reln->md_seg_fds[forknum][segno];
1249
1250 for (;;)
1251 {
1252 nblocks = _mdnblocks(reln, forknum, v);
1253 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1254 elog(FATAL, "segment too big");
1255 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1256 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1257
1258 /*
1259 * If segment is exactly RELSEG_SIZE, advance to next one.
1260 */
1261 segno++;
1262
1263 /*
1264 * We used to pass O_CREAT here, but that has the disadvantage that it
1265 * might create a segment which has vanished through some operating
1266 * system misadventure. In such a case, creating the segment here
1267 * undermines _mdfd_getseg's attempts to notice and report an error
1268 * upon access to a missing segment.
1269 */
1270 v = _mdfd_openseg(reln, forknum, segno, 0);
1271 if (v == NULL)
1272 return segno * ((BlockNumber) RELSEG_SIZE);
1273 }
1274}
1275
1276/*
1277 * mdtruncate() -- Truncate relation to specified number of blocks.
1278 *
1279 * Guaranteed not to allocate memory, so it can be used in a critical section.
1280 * Caller must have called smgrnblocks() to obtain curnblk while holding a
1281 * sufficient lock to prevent a change in relation size, and not used any smgr
1282 * functions for this relation or handled interrupts in between. This makes
1283 * sure we have opened all active segments, so that truncate loop will get
1284 * them all!
1285 */
1286void
1287mdtruncate(SMgrRelation reln, ForkNumber forknum,
1288 BlockNumber curnblk, BlockNumber nblocks)
1289{
1290 BlockNumber priorblocks;
1291 int curopensegs;
1292
1293 if (nblocks > curnblk)
1294 {
1295 /* Bogus request ... but no complaint if InRecovery */
1296 if (InRecovery)
1297 return;
1298 ereport(ERROR,
1299 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1300 relpath(reln->smgr_rlocator, forknum).str,
1301 nblocks, curnblk)));
1302 }
1303 if (nblocks == curnblk)
1304 return; /* no work */
1305
1306 /*
1307 * Truncate segments, starting at the last one. Starting at the end makes
1308 * managing the memory for the fd array easier, should there be errors.
1309 */
1310 curopensegs = reln->md_num_open_segs[forknum];
1311 while (curopensegs > 0)
1312 {
1313 MdfdVec *v;
1314
1315 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1316
1317 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1318
1319 if (priorblocks > nblocks)
1320 {
1321 /*
1322 * This segment is no longer active. We truncate the file, but do
1323 * not delete it, for reasons explained in the header comments.
1324 */
1325 if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1326 ereport(ERROR,
1327 (errcode_for_file_access(),
1328 errmsg("could not truncate file \"%s\": %m",
1329 FilePathName(v->mdfd_vfd))));
1330
1331 if (!SmgrIsTemp(reln))
1332 register_dirty_segment(reln, forknum, v);
1333
1334 /* we never drop the 1st segment */
1335 Assert(v != &reln->md_seg_fds[forknum][0]);
1336
1337 FileClose(v->mdfd_vfd);
1338 _fdvec_resize(reln, forknum, curopensegs - 1);
1339 }
1340 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1341 {
1342 /*
1343 * This is the last segment we want to keep. Truncate the file to
1344 * the right length. NOTE: if nblocks is exactly a multiple K of
1345 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1346 * keep it. This adheres to the invariant given in the header
1347 * comments.
1348 */
1349 BlockNumber lastsegblocks = nblocks - priorblocks;
1350
1351 if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1352 ereport(ERROR,
1353 (errcode_for_file_access(),
1354 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1355 FilePathName(v->mdfd_vfd),
1356 nblocks)));
1357 if (!SmgrIsTemp(reln))
1358 register_dirty_segment(reln, forknum, v);
1359 }
1360 else
1361 {
1362 /*
1363 * We still need this segment, so nothing to do for this and any
1364 * earlier segment.
1365 */
1366 break;
1367 }
1368 curopensegs--;
1369 }
1370}
1371
1372/*
1373 * mdregistersync() -- Mark whole relation as needing fsync
1374 */
1375void
1376mdregistersync(SMgrRelation reln, ForkNumber forknum)
1377{
1378 int segno;
1379 int min_inactive_seg;
1380
1381 /*
1382 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1383 * the loop below will get them all!
1384 */
1385 mdnblocks(reln, forknum);
1386
1387 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1388
1389 /*
1390 * Temporarily open inactive segments, then close them after sync. There
1391 * may be some inactive segments left opened after error, but that is
1392 * harmless. We don't bother to clean them up and take a risk of further
1393 * trouble. The next mdclose() will soon close them.
1394 */
1395 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1396 segno++;
1397
1398 while (segno > 0)
1399 {
1400 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1401
1402 register_dirty_segment(reln, forknum, v);
1403
1404 /* Close inactive segments immediately */
1405 if (segno > min_inactive_seg)
1406 {
1407 FileClose(v->mdfd_vfd);
1408 _fdvec_resize(reln, forknum, segno - 1);
1409 }
1410
1411 segno--;
1412 }
1413}
1414
1415/*
1416 * mdimmedsync() -- Immediately sync a relation to stable storage.
1417 *
1418 * Note that only writes already issued are synced; this routine knows
1419 * nothing of dirty buffers that may exist inside the buffer manager. We
1420 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1421 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1422 * some segment, then mdtruncate() renders that segment inactive. If we
1423 * crash before the next checkpoint syncs the newly-inactive segment, that
1424 * segment may survive recovery, reintroducing unwanted data into the table.
1425 */
1426void
1427mdimmedsync(SMgrRelation reln, ForkNumber forknum)
1428{
1429 int segno;
1430 int min_inactive_seg;
1431
1432 /*
1433 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1434 * the loop below will get them all!
1435 */
1436 mdnblocks(reln, forknum);
1437
1438 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1439
1440 /*
1441 * Temporarily open inactive segments, then close them after sync. There
1442 * may be some inactive segments left opened after fsync() error, but that
1443 * is harmless. We don't bother to clean them up and take a risk of
1444 * further trouble. The next mdclose() will soon close them.
1445 */
1446 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1447 segno++;
1448
1449 while (segno > 0)
1450 {
1451 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1452
1453 /*
1454 * fsyncs done through mdimmedsync() should be tracked in a separate
1455 * IOContext than those done through mdsyncfiletag() to differentiate
1456 * between unavoidable client backend fsyncs (e.g. those done during
1457 * index build) and those which ideally would have been done by the
1458 * checkpointer. Since other IO operations bypassing the buffer
1459 * manager could also be tracked in such an IOContext, wait until
1460 * these are also tracked to track immediate fsyncs.
1461 */
1462 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1463 ereport(data_sync_elevel(ERROR),
1464 (errcode_for_file_access(),
1465 errmsg("could not fsync file \"%s\": %m",
1466 FilePathName(v->mdfd_vfd))));
1467
1468 /* Close inactive segments immediately */
1469 if (segno > min_inactive_seg)
1470 {
1471 FileClose(v->mdfd_vfd);
1472 _fdvec_resize(reln, forknum, segno - 1);
1473 }
1474
1475 segno--;
1476 }
1477}
1478
1479int
1480mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
1481{
1482 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1483
1484 v = _mdfd_getseg(reln, forknum, blocknum, false,
1485 EXTENSION_FAIL);
1486
1487 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1488
1489 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1490
1491 return FileGetRawDesc(v->mdfd_vfd);
1492}
1493
1494/*
1495 * register_dirty_segment() -- Mark a relation segment as needing fsync
1496 *
1497 * If there is a local pending-ops table, just make an entry in it for
1498 * ProcessSyncRequests to process later. Otherwise, try to pass off the
1499 * fsync request to the checkpointer process. If that fails, just do the
1500 * fsync locally before returning (we hope this will not happen often
1501 * enough to be a performance problem).
1502 */
1503static void
1504register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1505{
1506 FileTag tag;
1507
1508 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1509
1510 /* Temp relations should never be fsync'd */
1511 Assert(!SmgrIsTemp(reln));
1512
1513 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1514 {
1515 instr_time io_start;
1516
1517 ereport(DEBUG1,
1518 (errmsg_internal("could not forward fsync request because request queue is full")));
1519
1520 io_start = pgstat_prepare_io_time(track_io_timing);
1521
1522 if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1523 ereport(data_sync_elevel(ERROR),
1524 (errcode_for_file_access(),
1525 errmsg("could not fsync file \"%s\": %m",
1526 FilePathName(seg->mdfd_vfd))));
1527
1528 /*
1529 * We have no way of knowing if the current IOContext is
1530 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1531 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1532 * IOContext. This is probably okay, because the number of backend
1533 * fsyncs doesn't say anything about the efficacy of the
1534 * BufferAccessStrategy. And counting both fsyncs done in
1535 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1536 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1537 * backend fsyncs.
1538 */
1539 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1540 IOOP_FSYNC, io_start, 1, 0);
1541 }
1542}
1543
1544/*
1545 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1546 */
1547static void
1548register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
1549 BlockNumber segno)
1550{
1551 FileTag tag;
1552
1553 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1554
1555 /* Should never be used with temp relations */
1556 Assert(!RelFileLocatorBackendIsTemp(rlocator));
1557
1558 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1559}
1560
1561/*
1562 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1563 */
1564static void
1565register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
1566 BlockNumber segno)
1567{
1568 FileTag tag;
1569
1570 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1571
1572 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1573}
1574
1575/*
1576 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
1577 */
1578void
1579ForgetDatabaseSyncRequests(Oid dbid)
1580{
1581 FileTag tag;
1582 RelFileLocator rlocator;
1583
1584 rlocator.dbOid = dbid;
1585 rlocator.spcOid = 0;
1586 rlocator.relNumber = 0;
1587
1588 INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1589
1590 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1591}
1592
1593/*
1594 * DropRelationFiles -- drop files of all given relations
1595 */
1596void
1597DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
1598{
1599 SMgrRelation *srels;
1600 int i;
1601
1602 srels = palloc(sizeof(SMgrRelation) * ndelrels);
1603 for (i = 0; i < ndelrels; i++)
1604 {
1605 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1606
1607 if (isRedo)
1608 {
1609 ForkNumber fork;
1610
1611 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1612 XLogDropRelation(delrels[i], fork);
1613 }
1614 srels[i] = srel;
1615 }
1616
1617 smgrdounlinkall(srels, ndelrels, isRedo);
1618
1619 for (i = 0; i < ndelrels; i++)
1620 smgrclose(srels[i]);
1621 pfree(srels);
1622}
1623
1624
1625/*
1626 * _fdvec_resize() -- Resize the fork's open segments array
1627 */
1628static void
1629_fdvec_resize(SMgrRelation reln,
1630 ForkNumber forknum,
1631 int nseg)
1632{
1633 if (nseg == 0)
1634 {
1635 if (reln->md_num_open_segs[forknum] > 0)
1636 {
1637 pfree(reln->md_seg_fds[forknum]);
1638 reln->md_seg_fds[forknum] = NULL;
1639 }
1640 }
1641 else if (reln->md_num_open_segs[forknum] == 0)
1642 {
1643 reln->md_seg_fds[forknum] =
1644 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1645 }
1646 else if (nseg > reln->md_num_open_segs[forknum])
1647 {
1648 /*
1649 * It doesn't seem worthwhile complicating the code to amortize
1650 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1651 * FileClose(), and the memory context internally will sometimes avoid
1652 * doing an actual reallocation.
1653 */
1654 reln->md_seg_fds[forknum] =
1655 repalloc(reln->md_seg_fds[forknum],
1656 sizeof(MdfdVec) * nseg);
1657 }
1658 else
1659 {
1660 /*
1661 * We don't reallocate a smaller array, because we want mdtruncate()
1662 * to be able to promise that it won't allocate memory, so that it is
1663 * allowed in a critical section. This means that a bit of space in
1664 * the array is now wasted, until the next time we add a segment and
1665 * reallocate.
1666 */
1667 }
1668
1669 reln->md_num_open_segs[forknum] = nseg;
1670}
1671
1672/*
1673 * Return the filename for the specified segment of the relation, as a
1674 * fixed-length MdPathStr (returned by value, not palloc'd).
1675 */
1676static MdPathStr
1677_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1678{
1679 RelPathStr path;
1680 MdPathStr fullpath;
1681
1682 path = relpath(reln->smgr_rlocator, forknum);
1683
1684 if (segno > 0)
1685 sprintf(fullpath.str, "%s.%u", path.str, segno);
1686 else
1687 strcpy(fullpath.str, path.str);
1688
1689 return fullpath;
1690}
1691
1692/*
1693 * Open the specified segment of the relation,
1694 * and make a MdfdVec object for it. Returns NULL on failure.
1695 */
1696static MdfdVec *
1697_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1698 int oflags)
1699{
1700 MdfdVec *v;
1701 File fd;
1702 MdPathStr fullpath;
1703
1704 fullpath = _mdfd_segpath(reln, forknum, segno);
1705
1706 /* open the file */
1707 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1708
1709 if (fd < 0)
1710 return NULL;
1711
1712 /*
1713 * Segments are always opened in order from lowest to highest, so we must
1714 * be adding a new one at the end.
1715 */
1716 Assert(segno == reln->md_num_open_segs[forknum]);
1717
1718 _fdvec_resize(reln, forknum, segno + 1);
1719
1720 /* fill the entry */
1721 v = &reln->md_seg_fds[forknum][segno];
1722 v->mdfd_vfd = fd;
1723 v->mdfd_segno = segno;
1724
1725 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1726
1727 /* all done */
1728 return v;
1729}
1730
1731/*
1732 * _mdfd_getseg() -- Find the segment of the relation holding the
1733 * specified block.
1734 *
1735 * If the segment doesn't exist, we ereport, return NULL, or create the
1736 * segment, according to "behavior". Note: skipFsync is only used in the
1737 * EXTENSION_CREATE case.
1738 */
1739static MdfdVec *
1740_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1741 bool skipFsync, int behavior)
1742{
1743 MdfdVec *v;
1744 BlockNumber targetseg;
1745 BlockNumber nextsegno;
1746
1747 /* some way to handle non-existent segments needs to be specified */
1748 Assert(behavior &
1749 (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
1750 EXTENSION_DONT_OPEN));
1751
1752 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1753
1754 /* if an existing and opened segment, we're done */
1755 if (targetseg < reln->md_num_open_segs[forknum])
1756 {
1757 v = &reln->md_seg_fds[forknum][targetseg];
1758 return v;
1759 }
1760
1761 /* The caller only wants the segment if we already had it open. */
1762 if (behavior & EXTENSION_DONT_OPEN)
1763 return NULL;
1764
1765 /*
1766 * The target segment is not yet open. Iterate over all the segments
1767 * between the last opened and the target segment. This way missing
1768 * segments either raise an error, or get created (according to
1769 * 'behavior'). Start with either the last opened, or the first segment if
1770 * none was opened before.
1771 */
1772 if (reln->md_num_open_segs[forknum] > 0)
1773 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1774 else
1775 {
1776 v = mdopenfork(reln, forknum, behavior);
1777 if (!v)
1778 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1779 }
1780
1781 for (nextsegno = reln->md_num_open_segs[forknum];
1782 nextsegno <= targetseg; nextsegno++)
1783 {
1784 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1785 int flags = 0;
1786
1787 Assert(nextsegno == v->mdfd_segno + 1);
1788
1789 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1790 elog(FATAL, "segment too big");
1791
1792 if ((behavior & EXTENSION_CREATE) ||
1793 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1794 {
1795 /*
1796 * Normally we will create new segments only if authorized by the
1797 * caller (i.e., we are doing mdextend()). But when doing WAL
1798 * recovery, create segments anyway; this allows cases such as
1799 * replaying WAL data that has a write into a high-numbered
1800 * segment of a relation that was later deleted. We want to go
1801 * ahead and create the segments so we can finish out the replay.
1802 *
1803 * We have to maintain the invariant that segments before the last
1804 * active segment are of size RELSEG_SIZE; therefore, if
1805 * extending, pad them out with zeroes if needed. (This only
1806 * matters if in recovery, or if the caller is extending the
1807 * relation discontiguously, but that can happen in hash indexes.)
1808 */
1809 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1810 {
1811 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1812 MCXT_ALLOC_ZERO);
1813
1814 mdextend(reln, forknum,
1815 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1816 zerobuf, skipFsync);
1817 pfree(zerobuf);
1818 }
1819 flags = O_CREAT;
1820 }
1821 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1822 {
1823 /*
1824 * When not extending, only open the next segment if the current
1825 * one is exactly RELSEG_SIZE. If not (this branch), either
1826 * return NULL or fail.
1827 */
1828 if (behavior & EXTENSION_RETURN_NULL)
1829 {
1830 /*
1831 * Some callers discern between reasons for _mdfd_getseg()
1832 * returning NULL based on errno. As there's no failing
1833 * syscall involved in this case, explicitly set errno to
1834 * ENOENT, as that seems the closest interpretation.
1835 */
1836 errno = ENOENT;
1837 return NULL;
1838 }
1839
1840 ereport(ERROR,
1841 (errcode_for_file_access(),
1842 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1843 _mdfd_segpath(reln, forknum, nextsegno).str,
1844 blkno, nblocks)));
1845 }
1846
1847 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1848
1849 if (v == NULL)
1850 {
1851 if ((behavior & EXTENSION_RETURN_NULL) &&
1852 FILE_POSSIBLY_DELETED(errno))
1853 return NULL;
1854 ereport(ERROR,
1855 (errcode_for_file_access(),
1856 errmsg("could not open file \"%s\" (target block %u): %m",
1857 _mdfd_segpath(reln, forknum, nextsegno).str,
1858 blkno)));
1859 }
1860 }
1861
1862 return v;
1863}
1864
1865/*
1866 * Get number of blocks present in a single disk file
1867 */
1868static BlockNumber
1869_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1870{
1871 pgoff_t len;
1872
1873 len = FileSize(seg->mdfd_vfd);
1874 if (len < 0)
1875 ereport(ERROR,
1876 (errcode_for_file_access(),
1877 errmsg("could not seek to end of file \"%s\": %m",
1878 FilePathName(seg->mdfd_vfd))));
1879 /* note that this calculation will ignore any partial block at EOF */
1880 return (BlockNumber) (len / BLCKSZ);
1881}
1882
1883/*
1884 * Sync a file to disk, given a file tag. Write the path into an output
1885 * buffer so the caller can use it in error messages.
1886 *
1887 * Return 0 on success, -1 on failure, with errno set.
1888 */
1889int
1890mdsyncfiletag(const FileTag *ftag, char *path)
1891{
1892 SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
1893 File file;
1894 instr_time io_start;
1895 bool need_to_close;
1896 int result,
1897 save_errno;
1898
1899 /* See if we already have the file open, or need to open it. */
1900 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1901 {
1902 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1903 strlcpy(path, FilePathName(file), MAXPGPATH);
1904 need_to_close = false;
1905 }
1906 else
1907 {
1908 MdPathStr p;
1909
1910 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1911 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1912
1913 file = PathNameOpenFile(path, _mdfd_open_flags());
1914 if (file < 0)
1915 return -1;
1916 need_to_close = true;
1917 }
1918
1919 io_start = pgstat_prepare_io_time(track_io_timing);
1920
1921 /* Sync the file. */
1922 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1923 save_errno = errno;
1924
1925 if (need_to_close)
1926 FileClose(file);
1927
1928 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1929 IOOP_FSYNC, io_start, 1, 0);
1930
1931 errno = save_errno;
1932 return result;
1933}
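
The errno handling in mdsyncfiletag() follows a common POSIX pattern: the sync result's errno is captured before the descriptor is closed, because close() may overwrite it. A minimal standalone sketch of that pattern with raw file descriptors (the file name is made up) looks like this:

    /* Standalone sketch: fsync a file while preserving the fsync() errno
     * across the subsequent close(). */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int
    sync_one_file(const char *path)
    {
        int fd = open(path, O_RDWR);
        int result;
        int save_errno;

        if (fd < 0)
            return -1;              /* errno already set by open() */

        result = fsync(fd);
        save_errno = errno;         /* capture before close() can clobber it */

        close(fd);

        errno = save_errno;
        return result;              /* 0 on success, -1 with errno set */
    }

    int
    main(void)
    {
        if (sync_one_file("/tmp/example.dat") != 0)     /* hypothetical file */
            fprintf(stderr, "could not fsync: %s\n", strerror(errno));
        return 0;
    }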
1934
1935/*
1936 * Unlink a file, given a file tag. Write the path into an output
1937 * buffer so the caller can use it in error messages.
1938 *
1939 * Return 0 on success, -1 on failure, with errno set.
1940 */
1941int
1942mdunlinkfiletag(const FileTag *ftag, char *path)
1943{
1944 RelPathStr p;
1945
1946 /* Compute the path. */
1947 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1948 strlcpy(path, p.str, MAXPGPATH);
1949
1950 /* Try to unlink the file. */
1951 return unlink(path);
1952}
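
For context on the paths involved: a permanent relation in the default tablespace has a main-fork path of the form base/&lt;database OID&gt;/&lt;relfilenumber&gt;, with higher segments adding a .1, .2, ... suffix. The sketch below builds such strings by hand purely for illustration; the OIDs are made up, and the real code goes through the relpath*() helpers rather than sprintf-style formatting.

    /* Standalone sketch: build relation file paths of the shape used under
     * $PGDATA for permanent relations in the default tablespace. */
    #include <stdio.h>

    #define DEMO_MAXPGPATH 1024

    static void
    demo_segment_path(char *buf, unsigned dboid, unsigned relnumber, unsigned segno)
    {
        if (segno == 0)
            snprintf(buf, DEMO_MAXPGPATH, "base/%u/%u", dboid, relnumber);
        else
            snprintf(buf, DEMO_MAXPGPATH, "base/%u/%u.%u", dboid, relnumber, segno);
    }

    int
    main(void)
    {
        char path[DEMO_MAXPGPATH];

        demo_segment_path(path, 16384, 16385, 0);
        printf("first segment: %s\n", path);
        demo_segment_path(path, 16384, 16385, 2);
        printf("third segment: %s\n", path);
        return 0;
    }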
1953
1954/*
1955 * Check if a given candidate request matches a given tag, when processing
1956 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1957 * requests to find out whether to forget them.
1958 */
1959bool
1960mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1961{
1962 /*
1963 * For now we only use filter requests as a way to drop all scheduled
1964 * callbacks relating to a given database, when dropping the database.
1965 * We'll return true for all candidates that have the same database OID as
1966 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1967 */
1968 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1969}
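
A sketch of how such a match callback is consumed: the sync machinery walks its pending requests and forgets every one the callback accepts. The types and loop below are simplified stand-ins, not the actual sync.c data structures.

    /* Standalone sketch: drop pending requests whose database OID matches a
     * filter tag, in the spirit of mdfiletagmatches(). Simplified types. */
    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int DemoOid;

    typedef struct DemoTag
    {
        DemoOid     dbOid;
        unsigned    segno;
    } DemoTag;

    static bool
    demo_tag_matches(const DemoTag *filter, const DemoTag *candidate)
    {
        /* same rule as md.c: compare database OIDs only */
        return filter->dbOid == candidate->dbOid;
    }

    int
    main(void)
    {
        DemoTag     pending[] = {{16384, 0}, {16500, 3}, {16384, 7}};
        DemoTag     filter = {16384, 0};
        int         i;

        for (i = 0; i < (int) (sizeof(pending) / sizeof(pending[0])); i++)
        {
            if (demo_tag_matches(&filter, &pending[i]))
                printf("forgetting request for db %u, segment %u\n",
                       pending[i].dbOid, pending[i].segno);
        }
        return 0;
    }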
1970
1971/*
1972 * AIO completion callback for mdstartreadv().
1973 */
1974static PgAioResult
1975md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
1976{
1977 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
1978 PgAioResult result = prior_result;
1979
1980 if (prior_result.result < 0)
1981 {
1982 result.status = PGAIO_RS_ERROR;
1983 result.id = PGAIO_HCB_MD_READV;
1984 /* For "hard" errors, track the error number in error_data */
1985 result.error_data = -prior_result.result;
1986 result.result = 0;
1987
1988 /*
1989 * Immediately log a message about the IO error, but only to the
1990 * server log. The reason to do so immediately is that the originator
1991 * might not process the query result immediately (because it is busy
1992 * doing another part of query processing) or at all (e.g. if it was
1993 * cancelled or errored out due to another IO also failing). The
1994 * definer of the IO will emit an ERROR when processing the IO's
1995 * results
1996 */
1997 pgaio_result_report(result, td, LOG_SERVER_ONLY);
1998
1999 return result;
2000 }
2001
2002 /*
2003 * As explained above smgrstartreadv(), the smgr API operates on the level
2004 * of blocks, rather than bytes. Convert.
2005 */
2006 result.result /= BLCKSZ;
2007
2008 Assert(result.result <= td->smgr.nblocks);
2009
2010 if (result.result == 0)
2011 {
2012 /* consider 0 blocks read a failure */
2013 result.status = PGAIO_RS_ERROR;
2014 result.id = PGAIO_HCB_MD_READV;
2015 result.error_data = 0;
2016
2017 /* see comment above the "hard error" case */
2018 pgaio_result_report(result, td, LOG_SERVER_ONLY);
2019
2020 return result;
2021 }
2022
2023 if (result.status != PGAIO_RS_ERROR &&
2024 result.result < td->smgr.nblocks)
2025 {
2026 /* partial reads should be retried at upper level */
2027 result.status = PGAIO_RS_PARTIAL;
2028 result.id = PGAIO_HCB_MD_READV;
2029 }
2030
2031 return result;
2032}
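
The byte-to-block conversion and the three outcomes handled above (hard error with the negated errno stashed in error_data, zero blocks read, short read to be retried) can be sketched standalone; the status codes and function below are simplified stand-ins, assuming a BLCKSZ of 8192.

    /* Standalone sketch: classify a raw vectored-read result the way
     * md_readv_complete() does, using simplified status codes. */
    #include <stdio.h>

    #define DEMO_BLCKSZ 8192

    enum demo_status { DEMO_OK, DEMO_PARTIAL, DEMO_ERROR };

    static enum demo_status
    demo_classify_read(long raw_result, long nblocks_requested, int *error_data)
    {
        *error_data = 0;

        if (raw_result < 0)
        {
            /* hard error: the negated errno travels in error_data */
            *error_data = (int) -raw_result;
            return DEMO_ERROR;
        }

        raw_result /= DEMO_BLCKSZ;          /* bytes -> whole blocks */

        if (raw_result == 0)
            return DEMO_ERROR;              /* zero blocks read counts as failure */
        if (raw_result < nblocks_requested)
            return DEMO_PARTIAL;            /* upper level retries the remainder */
        return DEMO_OK;
    }

    int
    main(void)
    {
        int         errdata;
        enum demo_status s;

        s = demo_classify_read(3 * DEMO_BLCKSZ, 8, &errdata);
        printf("3 of 8 blocks read: status %d\n", (int) s);

        s = demo_classify_read(-5, 8, &errdata);    /* negated errno from a failed read */
        printf("failed read: status %d, errno %d\n", (int) s, errdata);
        return 0;
    }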
2033
2034/*
2035 * AIO error reporting callback for mdstartreadv().
2036 *
2037 * Errors are encoded as follows:
2038 * - PgAioResult.error_data != 0 encodes IO that failed with that errno
2039 * - PgAioResult.error_data == 0 encodes IO that didn't read all data
2040 */
2041static void
2042md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2043{
2044 RelPathStr path;
2045
2046 path = relpathbackend(td->smgr.rlocator,
2047 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
2048 td->smgr.forkNum);
2049
2050 if (result.error_data != 0)
2051 {
2052 /* for errcode_for_file_access() and %m */
2053 errno = result.error_data;
2054
2055 ereport(elevel,
2056 (errcode_for_file_access(),
2057 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2058 td->smgr.blockNum,
2059 td->smgr.blockNum + td->smgr.nblocks - 1,
2060 path.str));
2061 }
2062 else
2063 {
2064 /*
2065 * NB: This will typically only be output in debug messages, while
2066 * retrying a partial IO.
2067 */
2068 ereport(elevel,
2069 (errcode(ERRCODE_DATA_CORRUPTED),
2070 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2071 td->smgr.blockNum,
2072 td->smgr.blockNum + td->smgr.nblocks - 1,
2073 path.str,
2074 result.result * (size_t) BLCKSZ,
2075 td->smgr.nblocks * (size_t) BLCKSZ));
2076 }
2077}
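
The reporting side of that encoding (a nonzero error_data carries an errno, a zero error_data means a short read) can likewise be sketched with plain stdio; the surrounding function, path, and block numbers are illustrative only.

    /* Standalone sketch: format the two failure cases md_readv_report()
     * distinguishes, using stdio instead of ereport(). */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define DEMO_BLCKSZ 8192

    static void
    demo_report_read_failure(unsigned first_block, unsigned nblocks,
                             const char *path, int error_data, long blocks_read)
    {
        if (error_data != 0)
            fprintf(stderr,
                    "could not read blocks %u..%u in file \"%s\": %s\n",
                    first_block, first_block + nblocks - 1, path,
                    strerror(error_data));
        else
            fprintf(stderr,
                    "could not read blocks %u..%u in file \"%s\": "
                    "read only %zu of %zu bytes\n",
                    first_block, first_block + nblocks - 1, path,
                    (size_t) blocks_read * DEMO_BLCKSZ,
                    (size_t) nblocks * DEMO_BLCKSZ);
    }

    int
    main(void)
    {
        /* short read: only 1 of 4 requested blocks arrived */
        demo_report_read_failure(10, 4, "base/16384/16385", 0, 1);
        /* hard error: errno preserved in error_data */
        demo_report_read_failure(10, 4, "base/16384/16385", EIO, 0);
        return 0;
    }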