6464#ifndef WIN32
6565#include <sys/mman.h>
6666#endif
67+ #include <limits.h>
6768#include <unistd.h>
6869#include <fcntl.h>
6970#ifdef HAVE_SYS_RESOURCE_H
@@ -391,34 +392,36 @@ pg_fdatasync(int fd)
391392/*
392393 * pg_flush_data --- advise OS that the described dirty data should be flushed
393394 *
394- * An offset of 0 with an nbytes 0 means that the entire file should be
395- * flushed.
395+ * offset of 0 with nbytes 0 means that the entire file should be flushed;
396+ * in this case, this function may have side-effects on the file's
397+ * seek position!
396398 */
397399void
398400pg_flush_data (int fd , off_t offset , off_t nbytes )
399401{
400402 /*
401403 * Right now file flushing is primarily used to avoid making later
402- * fsync()/fdatasync() calls have a less impact. Thus don't trigger
403- * flushes if fsyncs are disabled - that's a decision we might want to
404- * make configurable at some point.
404+ * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
405+ * if fsyncs are disabled - that's a decision we might want to make
406+ * configurable at some point.
405407 */
406408 if (!enableFsync )
407409 return ;
408410
409411 /*
410- * XXX: compile all alternatives, to find portability problems more easily
412+ * We compile all alternatives that are supported on the current platform,
413+ * to find portability problems more easily.
411414 */
412415#if defined(HAVE_SYNC_FILE_RANGE )
413416 {
414- int rc = 0 ;
417+ int rc ;
415418
416419 /*
417420 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
418- * tells the OS that writeback for the passed in blocks should be
421+ * tells the OS that writeback for the specified blocks should be
419422 * started, but that we don't want to wait for completion. Note that
420423 * this call might block if too much dirty data exists in the range.
421- * This is the preferrable method on OSs supporting it, as it works
424+ * This is the preferable method on OSs supporting it, as it works
422425 * reliably when available (contrast to msync()) and doesn't flush out
423426 * clean data (like FADV_DONTNEED).
424427 */
@@ -438,72 +441,107 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
438441#endif
439442#if !defined(WIN32 ) && defined(MS_ASYNC )
440443 {
441- int rc = 0 ;
442444 void * p ;
445+ static int pagesize = 0 ;
443446
444447 /*
445448 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
446- * writeback. On linux it only does so with MS_SYNC is specified, but
449+ * writeback. On linux it only does so if MS_SYNC is specified, but
447450 * then it does the writeback synchronously. Luckily all common linux
448- * systems have sync_file_range(). This is preferrable over
451+ * systems have sync_file_range(). This is preferable over
449452 * FADV_DONTNEED because it doesn't flush out clean data.
450453 *
451454 * We map the file (mmap()), tell the kernel to sync back the contents
452455 * (msync()), and then remove the mapping again (munmap()).
453456 */
454- p = mmap (NULL , nbytes ,
455- PROT_READ | PROT_WRITE , MAP_SHARED ,
456- fd , offset );
457- if (p == MAP_FAILED )
458- {
459- ereport (WARNING ,
460- (errcode_for_file_access (),
461- errmsg ("could not mmap while flushing dirty data: %m" )));
462- return ;
463- }
464457
465- rc = msync ( p , nbytes , MS_ASYNC );
466- if (rc != 0 )
458+ /* mmap() needs actual length if we want to map whole file */
459+ if (offset == 0 && nbytes == 0 )
467460 {
468- ereport (WARNING ,
469- (errcode_for_file_access (),
470- errmsg ("could not flush dirty data: %m" )));
471- /* NB: need to fall through to munmap()! */
461+ nbytes = lseek (fd , 0 , SEEK_END );
462+ if (nbytes < 0 )
463+ {
464+ ereport (WARNING ,
465+ (errcode_for_file_access (),
466+ errmsg ("could not determine dirty data size: %m" )));
467+ return ;
468+ }
472469 }
473470
474- rc = munmap (p , nbytes );
475- if (rc != 0 )
471+ /*
472+ * Some platforms reject partial-page mmap() attempts. To deal with
473+ * that, just truncate the request to a page boundary. If any extra
474+ * bytes don't get flushed, well, it's only a hint anyway.
475+ */
476+
477+ /* fetch pagesize only once */
478+ if (pagesize == 0 )
479+ pagesize = sysconf (_SC_PAGESIZE );
480+
481+ /* align length to pagesize, dropping any fractional page */
482+ if (pagesize > 0 )
483+ nbytes = (nbytes / pagesize ) * pagesize ;
484+
485+ /* fractional-page request is a no-op */
486+ if (nbytes <= 0 )
487+ return ;
488+
489+ /*
490+ * mmap could well fail, particularly on 32-bit platforms where there
491+ * may simply not be enough address space. If so, silently fall
492+ * through to the next implementation.
493+ */
494+ if (nbytes <= (off_t ) SSIZE_MAX )
495+ p = mmap (NULL , nbytes , PROT_READ , MAP_SHARED , fd , offset );
496+ else
497+ p = MAP_FAILED ;
498+
499+ if (p != MAP_FAILED )
476500 {
477- /* FATAL error because mapping would remain */
478- ereport (FATAL ,
479- (errcode_for_file_access (),
480- errmsg ("could not munmap while flushing blocks: %m" )));
481- }
501+ int rc ;
482502
483- return ;
503+ rc = msync (p , (size_t ) nbytes , MS_ASYNC );
504+ if (rc != 0 )
505+ {
506+ ereport (WARNING ,
507+ (errcode_for_file_access (),
508+ errmsg ("could not flush dirty data: %m" )));
509+ /* NB: need to fall through to munmap()! */
510+ }
511+
512+ rc = munmap (p , (size_t ) nbytes );
513+ if (rc != 0 )
514+ {
515+ /* FATAL error because mapping would remain */
516+ ereport (FATAL ,
517+ (errcode_for_file_access (),
518+ errmsg ("could not munmap() while flushing data: %m" )));
519+ }
520+
521+ return ;
522+ }
484523 }
485524#endif
486525#if defined(USE_POSIX_FADVISE ) && defined(POSIX_FADV_DONTNEED )
487526 {
488- int rc = 0 ;
527+ int rc ;
489528
490529 /*
491530 * Signal the kernel that the passed in range should not be cached
492531 * anymore. This has the, desired, side effect of writing out dirty
493532 * data, and the, undesired, side effect of likely discarding useful
494533 * clean cached blocks. For the latter reason this is the least
495- * preferrable method.
534+ * preferable method.
496535 */
497536
498537 rc = posix_fadvise (fd , offset , nbytes , POSIX_FADV_DONTNEED );
499538
500- /* don't error out, this is just a performance optimization */
501539 if (rc != 0 )
502540 {
541+ /* don't error out, this is just a performance optimization */
503542 ereport (WARNING ,
504543 (errcode_for_file_access (),
505544 errmsg ("could not flush dirty data: %m" )));
506- return ;
507545 }
508546
509547 return ;
@@ -1510,6 +1548,13 @@ FileWriteback(File file, off_t offset, int amount)
15101548 file , VfdCache [file ].fileName ,
15111549 (int64 ) offset , amount ));
15121550
1551+ /*
1552+ * Caution: do not call pg_flush_data with amount = 0, it could trash the
1553+ * file's seek position.
1554+ */
1555+ if (amount <= 0 )
1556+ return ;
1557+
15131558 returnCode = FileAccess (file );
15141559 if (returnCode < 0 )
15151560 return ;
@@ -2904,11 +2949,15 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
29042949{
29052950 int fd ;
29062951
2952+ /* Don't try to flush directories, it'll likely just fail */
2953+ if (isdir )
2954+ return ;
2955+
29072956 fd = OpenTransientFile ((char * ) fname , O_RDONLY | PG_BINARY , 0 );
29082957
29092958 if (fd < 0 )
29102959 {
2911- if (errno == EACCES || ( isdir && errno == EISDIR ) )
2960+ if (errno == EACCES )
29122961 return ;
29132962 ereport (elevel ,
29142963 (errcode_for_file_access (),
0 commit comments