6060/*
6161 * The map file is critical data: we have no automatic method for recovering
6262 * from loss or corruption of it. We use a CRC so that we can detect
63- * corruption. To minimize the risk of failed updates, the map file should
64- * be kept to no more than one standard-size disk sector (ie 512 bytes),
65- * and we use overwrite-in-place rather than playing renaming games.
66- * The struct layout below is designed to occupy exactly 512 bytes, which
67- * might make filesystem updates a bit more efficient.
63+ * corruption. Since the file might be more than one standard-size disk
64+ * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
65+ * a new file and rename it into place, atomically replacing the original file.
6866 *
6967 * Entries in the mappings[] array are in no particular order. We could
7068 * speed searching by insisting on OID order, but it really shouldn't be
7169 * worth the trouble given the intended size of the mapping sets.
7270 */
7371#define RELMAPPER_FILENAME "pg_filenode.map"
72+ #define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
7473
7574#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
7675
77- #define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
76+ /*
77+ * There's no need for this constant to have any particular value, and we
78+ * can raise it as necessary if we end up with more mapped relations. For
79+ * now, we just pick a round number that is modestly larger than the expected
80+ * number of mappings.
81+ */
82+ #define MAX_MAPPINGS 64
7883
7984typedef struct RelMapping
8085{
@@ -88,7 +93,6 @@ typedef struct RelMapFile
8893 int32 num_mappings ; /* number of valid RelMapping entries */
8994 RelMapping mappings [MAX_MAPPINGS ];
9095 pg_crc32c crc ; /* CRC of all above */
91- int32 pad ; /* to make the struct size be 512 exactly */
9296} RelMapFile ;
9397
9498/*
@@ -877,6 +881,7 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
877881{
878882 int fd ;
879883 char mapfilename [MAXPGPATH ];
884+ char maptempfilename [MAXPGPATH ];
880885
881886 /*
882887 * Fill in the overhead fields and update CRC.
@@ -890,17 +895,47 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
890895 FIN_CRC32C (newmap -> crc );
891896
892897 /*
893- * Open the target file. We prefer to do this before entering the
894- * critical section, so that an open() failure need not force PANIC .
898+ * Construct filenames -- a temporary file that we'll create to write the
899+ * data initially, and then the permanent name to which we will rename it .
895900 */
896901 snprintf (mapfilename , sizeof (mapfilename ), "%s/%s" ,
897902 dbpath , RELMAPPER_FILENAME );
898- fd = OpenTransientFile (mapfilename , O_WRONLY | O_CREAT | PG_BINARY );
903+ snprintf (maptempfilename , sizeof (maptempfilename ), "%s/%s" ,
904+ dbpath , RELMAPPER_TEMP_FILENAME );
905+
906+ /*
907+ * Open a temporary file. If a file already exists with this name, it must
908+ * be left over from a previous crash, so we can overwrite it. Concurrent
909+ * calls to this function are not allowed.
910+ */
911+ fd = OpenTransientFile (maptempfilename ,
912+ O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY );
899913 if (fd < 0 )
900914 ereport (ERROR ,
901915 (errcode_for_file_access (),
902916 errmsg ("could not open file \"%s\": %m" ,
903- mapfilename )));
917+ maptempfilename )));
918+
919+ /* Write new data to the file. */
920+ pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_WRITE );
921+ if (write (fd , newmap , sizeof (RelMapFile )) != sizeof (RelMapFile ))
922+ {
923+ /* if write didn't set errno, assume problem is no disk space */
924+ if (errno == 0 )
925+ errno = ENOSPC ;
926+ ereport (ERROR ,
927+ (errcode_for_file_access (),
928+ errmsg ("could not write file \"%s\": %m" ,
929+ maptempfilename )));
930+ }
931+ pgstat_report_wait_end ();
932+
933+ /* And close the file. */
934+ if (CloseTransientFile (fd ) != 0 )
935+ ereport (ERROR ,
936+ (errcode_for_file_access (),
937+ errmsg ("could not close file \"%s\": %m" ,
938+ maptempfilename )));
904939
905940 if (write_wal )
906941 {
@@ -924,40 +959,17 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
924959 XLogFlush (lsn );
925960 }
926961
927- errno = 0 ;
928- pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_WRITE );
929- if (write (fd , newmap , sizeof (RelMapFile )) != sizeof (RelMapFile ))
930- {
931- /* if write didn't set errno, assume problem is no disk space */
932- if (errno == 0 )
933- errno = ENOSPC ;
934- ereport (ERROR ,
935- (errcode_for_file_access (),
936- errmsg ("could not write file \"%s\": %m" ,
937- mapfilename )));
938- }
939- pgstat_report_wait_end ();
940-
941962 /*
942- * We choose to fsync the data to disk before considering the task done.
943- * It would be possible to relax this if it turns out to be a performance
944- * issue, but it would complicate checkpointing --- see notes for
945- * CheckPointRelationMap.
963+ * durable_rename() does all the hard work of making sure that we rename
964+ * the temporary file into place in a crash-safe manner.
965+ *
966+ * NB: Although we instruct durable_rename() to use ERROR, we will often
967+ * be in a critical section at this point; if so, ERROR will become PANIC.
946968 */
947- pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_SYNC );
948- if (pg_fsync (fd ) != 0 )
949- ereport (data_sync_elevel (ERROR ),
950- (errcode_for_file_access (),
951- errmsg ("could not fsync file \"%s\": %m" ,
952- mapfilename )));
969+ pgstat_report_wait_start (WAIT_EVENT_RELATION_MAP_REPLACE );
970+ durable_rename (maptempfilename , mapfilename , ERROR );
953971 pgstat_report_wait_end ();
954972
955- if (CloseTransientFile (fd ) != 0 )
956- ereport (ERROR ,
957- (errcode_for_file_access (),
958- errmsg ("could not close file \"%s\": %m" ,
959- mapfilename )));
960-
961973 /*
962974 * Now that the file is safely on disk, send sinval message to let other
963975 * backends know to re-read it. We must do this inside the critical
0 commit comments