88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.122 2006/10/04 00:29:58 momjian Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
3535 * descriptors in its own descriptor pool. This is done to make it
3636 * easier to support relations that are larger than the operating
3737 * system's file size limit (often 2GBytes). In order to do that,
38- * we break relations up into chunks of < 2GBytes and store one chunk
39- * in each of several files that represent the relation. See the
40- * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
41- * All chunks except the last MUST have size exactly equal to RELSEG_SIZE
42- * blocks --- see mdnblocks() and mdtruncate().
38+ * we break relations up into "segment" files that are each shorter than
39+ * the OS file size limit. The segment size is set by the RELSEG_SIZE
40+ * configuration constant in pg_config_manual.h.
41+ *
42+ * On disk, a relation must consist of consecutively numbered segment
43+ * files in the pattern
44+ * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
45+ * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
46+ * -- Optionally, any number of inactive segments of size 0 blocks.
47+ * The full and partial segments are collectively the "active" segments.
48+ * Inactive segments are those that once contained data but are currently
49+ * not needed because of an mdtruncate() operation. The reason for leaving
50+ * them present at size zero, rather than unlinking them, is that other
51+ * backends and/or the bgwriter might be holding open file references to
52+ * such segments. If the relation expands again after mdtruncate(), such
53+ * that a deactivated segment becomes active again, it is important that
54+ * such file references still be valid --- else data might get written
55+ * out to an unlinked old copy of a segment file that will eventually
56+ * disappear.
4357 *
4458 * The file descriptor pointer (md_fd field) stored in the SMgrRelation
45- * cache is, therefore, just the head of a list of MdfdVec objects.
46- * But note the md_fd pointer can be NULL, indicating relation not open.
59+ * cache is, therefore, just the head of a list of MdfdVec objects, one
60+ * per segment. But note the md_fd pointer can be NULL, indicating
61+ * relation not open.
4762 *
48- * Note that mdfd_chain == NULL does not necessarily mean the relation
63+ * Also note that mdfd_chain == NULL does not necessarily mean the relation
4964 * doesn't have another segment after this one; we may just not have
5065 * opened the next segment yet. (We could not have "all segments are
5166 * in the chain" as an invariant anyway, since another backend could
52- * extend the relation when we weren't looking.)
67+ * extend the relation when we weren't looking.) We do not make chain
68+ * entries for inactive segments, however; as soon as we find a partial
69+ * segment, we assume that any subsequent segments are inactive.
5370 *
5471 * All MdfdVec objects are palloc'd in the MdCxt memory context.
72+ *
73+ * Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
74+ * for use on machines that support large files. Beware that this
75+ * code has not been tested in a long time and is probably bit-rotted.
5576 */
5677
5778typedef struct _MdfdVec
@@ -77,8 +98,6 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
7798 *
7899 * (Regular backends do not track pending operations locally, but forward
79100 * them to the bgwriter.)
80- *
81- * XXX for WIN32, may want to expand this to track pending deletes, too.
82101 */
83102typedef struct
84103{
@@ -222,12 +241,16 @@ mdunlink(RelFileNode rnode, bool isRedo)
222241 }
223242
224243#ifndef LET_OS_MANAGE_FILESIZE
225- /* Get the additional segments, if any */
244+ /* Delete the additional segments, if any */
226245 if (status )
227246 {
228247 char * segpath = (char * ) palloc (strlen (path ) + 12 );
229248 BlockNumber segno ;
230249
250+ /*
251+ * Note that because we loop until getting ENOENT, we will
252+ * correctly remove all inactive segments as well as active ones.
253+ */
231254 for (segno = 1 ;; segno ++ )
232255 {
233256 sprintf (segpath , "%s.%u" , path , segno );
@@ -257,15 +280,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
257280 *
258281 * The semantics are basically the same as mdwrite(): write at the
259282 * specified position. However, we are expecting to extend the
260- * relation (ie, blocknum is the current EOF), and so in case of
283+ * relation (ie, blocknum is >= the current EOF), and so in case of
261284 * failure we clean up by truncating.
262285 *
263286 * This routine returns true or false, with errno set as appropriate.
264- *
265- * Note: this routine used to call mdnblocks() to get the block position
266- * to write at, but that's pretty silly since the caller needs to know where
267- * the block will be written, and accordingly must have done mdnblocks()
268- * already. Might as well pass in the position and save a seek.
269287 */
270288bool
271289mdextend (SMgrRelation reln , BlockNumber blocknum , char * buffer , bool isTemp )
@@ -498,10 +516,10 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
498516/*
499517 * mdnblocks() -- Get the number of blocks stored in a relation.
500518 *
501- * Important side effect: all segments of the relation are opened
519+ * Important side effect: all active segments of the relation are opened
502520 * and added to the mdfd_chain list. If this routine has not been
503521 * called, then only segments up to the last one actually touched
504- * are present in the chain...
522+ * are present in the chain.
505523 *
506524 * Returns # of blocks, or InvalidBlockNumber on error.
507525 */
@@ -518,9 +536,13 @@ mdnblocks(SMgrRelation reln)
518536 * Skip through any segments that aren't the last one, to avoid redundant
519537 * seeks on them. We have previously verified that these segments are
520538 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
521- * (NOTE: this assumption could only be wrong if another backend has
539+ *
540+ * NOTE: this assumption could only be wrong if another backend has
522541 * truncated the relation. We rely on higher code levels to handle that
523- * scenario by closing and re-opening the md fd.)
542+ * scenario by closing and re-opening the md fd, which is handled via
543+ * relcache flush. (Since the bgwriter doesn't participate in relcache
544+ * flush, it could have segment chain entries for inactive segments;
545+ * that's OK because the bgwriter never needs to compute relation size.)
524546 */
525547 while (v -> mdfd_chain != NULL )
526548 {
@@ -546,8 +568,8 @@ mdnblocks(SMgrRelation reln)
546568 /*
547569 * Because we pass O_CREAT, we will create the next segment (with
548570 * zero length) immediately, if the last segment is of length
549- * REL_SEGSIZE . This is unnecessary but harmless, and testing for
550- * the case would take more cycles than it seems worth .
571+ * RELSEG_SIZE. While perhaps not strictly necessary, this keeps
572+ * the logic simple.
551573 */
552574 v -> mdfd_chain = _mdfd_openseg (reln , segno , O_CREAT );
553575 if (v -> mdfd_chain == NULL )
@@ -577,8 +599,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
577599#endif
578600
579601 /*
580- * NOTE: mdnblocks makes sure we have opened all existing segments, so
581- * that truncate/delete loop will get them all!
602+ * NOTE: mdnblocks makes sure we have opened all active segments, so
603+ * that the truncation loop will get them all!
582604 */
583605 curnblk = mdnblocks (reln );
584606 if (curnblk == InvalidBlockNumber )
@@ -599,14 +621,17 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
599621 if (priorblocks > nblocks )
600622 {
601623 /*
602- * This segment is no longer wanted at all (and has already been
603- * unlinked from the mdfd_chain). We truncate the file before
604- * deleting it because if other backends are holding the file
605- * open, the unlink will fail on some platforms. Better a
606- * zero-size file gets left around than a big file...
624+ * This segment is no longer active (and has already been
625+ * unlinked from the mdfd_chain). We truncate the file, but do
626+ * not delete it, for reasons explained in the header comments.
607627 */
608- FileTruncate (v -> mdfd_vfd , 0 );
609- FileUnlink (v -> mdfd_vfd );
628+ if (FileTruncate (v -> mdfd_vfd , 0 ) < 0 )
629+ return InvalidBlockNumber ;
630+ if (!isTemp )
631+ {
632+ if (!register_dirty_segment (reln , v ))
633+ return InvalidBlockNumber ;
634+ }
610635 v = v -> mdfd_chain ;
611636 Assert (ov != reln -> md_fd ); /* we never drop the 1st segment */
612637 pfree (ov );
@@ -618,8 +643,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
618643 * the right length, and clear chain link that points to any
619644 * remaining segments (which we shall zap). NOTE: if nblocks is
620645 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
621- * segment to 0 length but keep it. This is mainly so that the
622- * right thing happens if nblocks==0 .
646+ * segment to 0 length but keep it. This adheres to the invariant
647+ * given in the header comments.
623648 */
624649 BlockNumber lastsegblocks = nblocks - priorblocks ;
625650
@@ -669,7 +694,7 @@ mdimmedsync(SMgrRelation reln)
669694 BlockNumber curnblk ;
670695
671696 /*
672- * NOTE: mdnblocks makes sure we have opened all existing segments, so
697+ * NOTE: mdnblocks makes sure we have opened all active segments, so
673698 * that the fsync loop will get them all!
674699 */
675700 curnblk = mdnblocks (reln );
0 commit comments