@@ -306,7 +306,10 @@ static void walkdir(const char *path,
306306#ifdef PG_FLUSH_DATA_WORKS
307307static void pre_sync_fname (const char * fname , bool isdir , int elevel );
308308#endif
309- static void fsync_fname_ext (const char * fname , bool isdir , int elevel );
309+ static void datadir_fsync_fname (const char * fname , bool isdir , int elevel );
310+
311+ static int fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel );
312+ static int fsync_parent_path (const char * fname , int elevel );
310313
311314
312315/*
@@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
413416 * indicate the OS just doesn't allow/require fsyncing directories.
414417 */
415418void
416- fsync_fname (char * fname , bool isdir )
419+ fsync_fname (const char * fname , bool isdir )
420+ {
421+ fsync_fname_ext (fname , isdir , false, ERROR );
422+ }
423+
424+ /*
425+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
426+ *
427+ * This routine ensures that, after returning, the effect of renaming the file
428+ * persists in case of a crash. A crash while this routine is running will
429+ * leave you with either the pre-existing or the moved file in place of the
430+ * new file; no mixed state or truncated files are possible.
431+ *
432+ * It does so by using fsync on the old filename and the possibly existing
433+ * target filename before the rename, and the target file and directory after.
434+ *
435+ * Note that rename() cannot be used across arbitrary directories, as they
436+ * might not be on the same filesystem. Therefore this routine does not
437+ * support renaming across directories.
438+ *
439+ * Log errors with the caller specified severity.
440+ *
441+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
442+ * valid upon return.
443+ */
444+ int
445+ durable_rename (const char * oldfile , const char * newfile , int elevel )
417446{
418447 int fd ;
419- int returncode ;
420448
421449 /*
422- * Some OSs require directories to be opened read-only whereas other
423- * systems don't allow us to fsync files opened read-only; so we need both
424- * cases here
450+ * First fsync the old and target path (if it exists), to ensure that they
451+ * are properly persistent on disk. Syncing the target file is not
452+ * strictly necessary, but it makes it easier to reason about crashes;
453+ * because it's then guaranteed that either source or target file exists
454+ * after a crash.
425455 */
426- if (!isdir )
427- fd = OpenTransientFile (fname ,
428- O_RDWR | PG_BINARY ,
429- S_IRUSR | S_IWUSR );
456+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
457+ return -1 ;
458+
459+ fd = OpenTransientFile ((char * ) newfile , PG_BINARY | O_RDWR , 0 );
460+ if (fd < 0 )
461+ {
462+ if (errno != ENOENT )
463+ {
464+ ereport (elevel ,
465+ (errcode_for_file_access (),
466+ errmsg ("could not open file \"%s\": %m" , newfile )));
467+ return -1 ;
468+ }
469+ }
430470 else
431- fd = OpenTransientFile (fname ,
432- O_RDONLY | PG_BINARY ,
433- S_IRUSR | S_IWUSR );
471+ {
472+ if (pg_fsync (fd ) != 0 )
473+ {
474+ int save_errno ;
475+
476+ /* close file upon error, might not be in transaction context */
477+ save_errno = errno ;
478+ CloseTransientFile (fd );
479+ errno = save_errno ;
480+
481+ ereport (elevel ,
482+ (errcode_for_file_access (),
483+ errmsg ("could not fsync file \"%s\": %m" , newfile )));
484+ return -1 ;
485+ }
486+ CloseTransientFile (fd );
487+ }
488+
489+ /* Time to do the real deal... */
490+ if (rename (oldfile , newfile ) < 0 )
491+ {
492+ ereport (elevel ,
493+ (errcode_for_file_access (),
494+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
495+ oldfile , newfile )));
496+ return -1 ;
497+ }
434498
435499 /*
436- * Some OSs don't allow us to open directories at all (Windows returns
437- * EACCES)
500+ * To guarantee renaming the file is persistent, fsync the file with its
501+ * new name, and its containing directory.
438502 */
439- if (fd < 0 && isdir && ( errno == EISDIR || errno == EACCES ) )
440- return ;
503+ if (fsync_fname_ext ( newfile , false, false, elevel ) != 0 )
504+ return -1 ;
441505
442- else if (fd < 0 )
443- ereport (ERROR ,
444- (errcode_for_file_access (),
445- errmsg ("could not open file \"%s\": %m" , fname )));
506+ if (fsync_parent_path (newfile , elevel ) != 0 )
507+ return -1 ;
446508
447- returncode = pg_fsync (fd );
509+ return 0 ;
510+ }
511+
512+ /*
513+ * durable_link_or_rename -- rename a file in a durable manner.
514+ *
515+ * Similar to durable_rename(), except that this routine tries (but does not
516+ * guarantee) not to overwrite the target file.
517+ *
518+ * Note that a crash in an unfortunate moment can leave you with two links to
519+ * the target file.
520+ *
521+ * Log errors with the caller specified severity.
522+ *
523+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
524+ * valid upon return.
525+ */
526+ int
527+ durable_link_or_rename (const char * oldfile , const char * newfile , int elevel )
528+ {
529+ /*
530+ * Ensure that, if we crash directly after the rename/link, a file with
531+ * valid contents is moved into place.
532+ */
533+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
534+ return -1 ;
448535
449- /* Some OSs don't allow us to fsync directories at all */
450- if (returncode != 0 && isdir && errno == EBADF )
536+ #if HAVE_WORKING_LINK
537+ if (link ( oldfile , newfile ) < 0 )
451538 {
452- CloseTransientFile (fd );
453- return ;
539+ ereport (elevel ,
540+ (errcode_for_file_access (),
541+ errmsg ("could not link file \"%s\" to \"%s\": %m" ,
542+ oldfile , newfile )));
543+ return -1 ;
454544 }
455-
456- if (returncode != 0 )
457- ereport (ERROR ,
545+ unlink (oldfile );
546+ #else
547+ /* XXX: Add racy file existence check? */
548+ if (rename (oldfile , newfile ) < 0 )
549+ {
550+ ereport (elevel ,
458551 (errcode_for_file_access (),
459- errmsg ("could not fsync file \"%s\": %m" , fname )));
552+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
553+ oldfile , newfile )));
554+ return -1 ;
555+ }
556+ #endif
460557
461- CloseTransientFile (fd );
462- }
558+ /*
559+ * Make the change persistent in case of an OS crash: both the new entry
560+ * and its parent directory need to be flushed.
561+ */
562+ if (fsync_fname_ext (newfile , false, false, elevel ) != 0 )
563+ return -1 ;
564+
565+ /* Same for parent directory */
566+ if (fsync_parent_path (newfile , elevel ) != 0 )
567+ return -1 ;
463568
569+ return 0 ;
570+ }
464571
465572/*
466573 * InitFileAccess --- initialize this module during backend startup
@@ -2581,10 +2688,10 @@ SyncDataDirectory(void)
25812688 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
25822689 * so we don't worry about optimizing it.
25832690 */
2584- walkdir ("." , fsync_fname_ext , false, LOG );
2691+ walkdir ("." , datadir_fsync_fname , false, LOG );
25852692 if (xlog_is_symlink )
2586- walkdir ("pg_xlog" , fsync_fname_ext , false, LOG );
2587- walkdir ("pg_tblspc" , fsync_fname_ext , true, LOG );
2693+ walkdir ("pg_xlog" , datadir_fsync_fname , false, LOG );
2694+ walkdir ("pg_tblspc" , datadir_fsync_fname , true, LOG );
25882695}
25892696
25902697/*
@@ -2698,15 +2805,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
26982805
26992806#endif /* PG_FLUSH_DATA_WORKS */
27002807
2808+ static void
2809+ datadir_fsync_fname (const char * fname , bool isdir , int elevel )
2810+ {
2811+ /*
2812+ * We want to silently ignore errors about unreadable files, so pass
2813+ * ignore_perm = true on to fsync_fname_ext(); its result is discarded.
2814+ */
2815+ fsync_fname_ext (fname , isdir , true, elevel );
2816+ }
2817+
27012818/*
27022819 * fsync_fname_ext -- Try to fsync a file or directory
27032820 *
2704- * Ignores errors trying to open unreadable files, or trying to fsync
2705- * directories on systems where that isn't allowed/required, and logs other
2706- * errors at a caller-specified level.
2821+ * If ignore_perm is true, ignore errors upon trying to open unreadable
2822+ * files. Logs other errors at a caller-specified level.
2823+ *
2824+ * Returns 0 if the operation succeeded, -1 otherwise.
27072825 */
2708- static void
2709- fsync_fname_ext (const char * fname , bool isdir , int elevel )
2826+ static int
2827+ fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel )
27102828{
27112829 int fd ;
27122830 int flags ;
@@ -2724,20 +2842,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
27242842 else
27252843 flags |= O_RDONLY ;
27262844
2845+ fd = OpenTransientFile ((char * ) fname , flags , 0 );
2846+
27272847 /*
2728- * Open the file, silently ignoring errors about unreadable files (or
2729- * unsupported operations, e.g. opening a directory under Windows), and
2730- * logging others.
2848+ * Some OSs don't allow us to open directories at all (Windows returns
2849+ * EACCES); just ignore the error in that case. If desired, also silently
2850+ * ignore errors about unreadable files. Log others.
27312851 */
2732- fd = OpenTransientFile ((char * ) fname , flags , 0 );
2733- if (fd < 0 )
2852+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES ))
2853+ return 0 ;
2854+ else if (fd < 0 && ignore_perm && errno == EACCES )
2855+ return 0 ;
2856+ else if (fd < 0 )
27342857 {
2735- if (errno == EACCES || (isdir && errno == EISDIR ))
2736- return ;
27372858 ereport (elevel ,
27382859 (errcode_for_file_access (),
27392860 errmsg ("could not open file \"%s\": %m" , fname )));
2740- return ;
2861+ return -1 ;
27412862 }
27422863
27432864 returncode = pg_fsync (fd );
@@ -2747,9 +2868,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
27472868 * those errors. Anything else needs to be logged.
27482869 */
27492870 if (returncode != 0 && !(isdir && errno == EBADF ))
2871+ {
2872+ int save_errno ;
2873+
2874+ /* close file upon error, might not be in transaction context */
2875+ save_errno = errno ;
2876+ (void ) CloseTransientFile (fd );
2877+ errno = save_errno ;
2878+
27502879 ereport (elevel ,
27512880 (errcode_for_file_access (),
27522881 errmsg ("could not fsync file \"%s\": %m" , fname )));
2882+ return -1 ;
2883+ }
27532884
27542885 (void ) CloseTransientFile (fd );
2886+
2887+ return 0 ;
2888+ }
2889+
2890+ /*
2891+ * fsync_parent_path -- fsync the parent directory of a file or directory
2892+ *
2893+ * Makes file operations persistent on disk across an OS crash or power
2894+ * failure. Returns 0 on success, -1 if fsync_fname_ext() failed.
2895+ */
2896+ static int
2897+ fsync_parent_path (const char * fname , int elevel )
2898+ {
2899+ char parentpath [MAXPGPATH ];
2900+
2901+ strlcpy (parentpath , fname , MAXPGPATH );
2902+ get_parent_directory (parentpath );
2903+
2904+ /*
2905+ * get_parent_directory() leaves an empty string if the input argument is
2906+ * just a file name (see comments in path.c), so handle that as being the
2907+ * current directory.
2908+ */
2909+ if (strlen (parentpath ) == 0 )
2910+ strlcpy (parentpath , "." , MAXPGPATH );
2911+
2912+ if (fsync_fname_ext (parentpath , true, false, elevel ) != 0 )
2913+ return -1 ;
2914+
2915+ return 0 ;
27552916}
0 commit comments