2626 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
2727 * Portions Copyright (c) 1994, Regents of the University of California
2828 *
29- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.47 2008/08/01 13:16:08 alvherre Exp $
29+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.48 2008/10/20 19:18:18 alvherre Exp $
3030 *
3131 *-------------------------------------------------------------------------
3232 */
@@ -80,32 +80,182 @@ static int ZeroCLOGPage(int pageno, bool writeXlog);
8080static bool CLOGPagePrecedes (int page1 , int page2 );
8181static void WriteZeroPageXlogRec (int pageno );
8282static void WriteTruncateXlogRec (int pageno );
83+ static void TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
84+ TransactionId * subxids , XidStatus status ,
85+ XLogRecPtr lsn , int pageno );
86+ static void TransactionIdSetStatusBit (TransactionId xid , XidStatus status ,
87+ XLogRecPtr lsn , int slotno );
88+ static void set_status_by_pages (int nsubxids , TransactionId * subxids ,
89+ XidStatus status , XLogRecPtr lsn );
8390
8491
8592/*
86- * Record the final state of a transaction in the commit log.
93+ * TransactionIdSetTreeStatus
94+ *
95+ * Record the final state of transaction entries in the commit log for
96+ * a transaction and its subtransaction tree. Take care to ensure this is
97+ * efficient, and as atomic as possible.
98+ *
99+ * xid is a single xid to set status for. This will typically be
100+ * the top level transactionid for a top level commit or abort. It can
101+ * also be a subtransaction when we record transaction aborts.
102+ *
103+ * subxids is an array of xids of length nsubxids, representing subtransactions
104+ * in the tree of xid. In various cases nsubxids may be zero.
87105 *
88106 * lsn must be the WAL location of the commit record when recording an async
89107 * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
90108 * caller guarantees the commit record is already flushed in that case. It
91109 * should be InvalidXLogRecPtr for abort cases, too.
92110 *
111+ * In the commit case, atomicity is limited by whether all the subxids are in
112+ * the same CLOG page as xid. If they all are, then the lock will be grabbed
113+ * only once, and the status will be set to committed directly. Otherwise
114+ * we must
115+ * 1. set sub-committed all subxids that are not on the same page as the
116+ * main xid
117+ * 2. atomically set committed the main xid and the subxids on the same page
118+ * 3. go over the first bunch again and set them committed
119+ * Note that as far as concurrent checkers are concerned, main transaction
120+ * commit as a whole is still atomic.
121+ *
122+ * Example:
123+ * TransactionId t commits and has subxids t1, t2, t3, t4
124+ * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
125+ * 1. update pages2-3:
126+ * page2: set t2,t3 as sub-committed
127+ * page3: set t4 as sub-committed
128+ * 2. update page1:
129+ * set t1 as sub-committed,
130+ * then set t as committed,
131+ * then set t1 as committed
132+ * 3. update pages2-3:
133+ * page2: set t2,t3 as committed
134+ * page3: set t4 as committed
135+ *
93136 * NB: this is a low-level routine and is NOT the preferred entry point
94- * for most uses; TransactionLogUpdate() in transam.c is the intended caller.
137+ * for most uses; functions in transam.c are the intended callers.
138+ *
139+ * XXX Think about issuing FADVISE_WILLNEED on pages that we will need,
140+ * but aren't yet in cache, as well as hinting pages not to fall out of
141+ * cache yet.
95142 */
96143void
97- TransactionIdSetStatus (TransactionId xid , XidStatus status , XLogRecPtr lsn )
144+ TransactionIdSetTreeStatus (TransactionId xid , int nsubxids ,
145+ TransactionId * subxids , XidStatus status , XLogRecPtr lsn )
146+ {
147+ int pageno = TransactionIdToPage (xid ); /* get page of parent */
148+ int i ;
149+
150+ Assert (status == TRANSACTION_STATUS_COMMITTED ||
151+ status == TRANSACTION_STATUS_ABORTED );
152+
153+ /*
154+ * See how many subxids, if any, are on the same page as the parent, if any.
155+ */
156+ for (i = 0 ; i < nsubxids ; i ++ )
157+ {
158+ if (TransactionIdToPage (subxids [i ]) != pageno )
159+ break ;
160+ }
161+
162+ /*
163+ * Do all items fit on a single page?
164+ */
165+ if (i == nsubxids )
166+ {
167+ /*
168+ * Set the parent and all subtransactions in a single call
169+ */
170+ TransactionIdSetPageStatus (xid , nsubxids , subxids , status , lsn ,
171+ pageno );
172+ }
173+ else
174+ {
175+ int nsubxids_on_first_page = i ;
176+
177+ /*
178+ * If this is a commit then we care about doing this correctly (i.e.
179+ * using the subcommitted intermediate status). By here, we know we're
180+ * updating more than one page of clog, so we must mark entries that
181+ * are *not* on the first page so that they show as subcommitted before
182+ * we then return to update the status to fully committed.
183+ *
184+ * To avoid touching the first page twice, skip marking subcommitted
185+ * for the subxids on that first page.
186+ */
187+ if (status == TRANSACTION_STATUS_COMMITTED )
188+ set_status_by_pages (nsubxids - nsubxids_on_first_page ,
189+ subxids + nsubxids_on_first_page ,
190+ TRANSACTION_STATUS_SUB_COMMITTED , lsn );
191+
192+ /*
193+ * Now set the parent and subtransactions on same page as the parent,
194+ * if any
195+ */
196+ pageno = TransactionIdToPage (xid );
197+ TransactionIdSetPageStatus (xid , nsubxids_on_first_page , subxids , status ,
198+ lsn , pageno );
199+
200+ /*
201+ * Now work through the rest of the subxids one clog page at a time,
202+ * starting from the second page onwards, like we did above.
203+ */
204+ set_status_by_pages (nsubxids - nsubxids_on_first_page ,
205+ subxids + nsubxids_on_first_page ,
206+ status , lsn );
207+ }
208+ }
209+
210+ /*
211+ * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
212+ * transactions, chunking in the separate CLOG pages involved. We never
213+ * pass the whole transaction tree to this function, only subtransactions
214+ * that are on different pages to the top level transaction id.
215+ */
216+ static void
217+ set_status_by_pages (int nsubxids , TransactionId * subxids ,
218+ XidStatus status , XLogRecPtr lsn )
219+ {
220+ int pageno = TransactionIdToPage (subxids [0 ]);
221+ int offset = 0 ;
222+ int i = 0 ;
223+
224+ while (i < nsubxids )
225+ {
226+ int num_on_page = 0 ;
227+
228+ while (TransactionIdToPage (subxids [i ]) == pageno && i < nsubxids )
229+ {
230+ num_on_page ++ ;
231+ i ++ ;
232+ }
233+
234+ TransactionIdSetPageStatus (InvalidTransactionId ,
235+ num_on_page , subxids + offset ,
236+ status , lsn , pageno );
237+ offset = i ;
238+ pageno = TransactionIdToPage (subxids [offset ]);
239+ }
240+ }
241+
242+ /*
243+ * Record the final state of transaction entries in the commit log for
244+ * all entries on a single page. Atomic only on this page.
245+ *
246+ * Otherwise API is same as TransactionIdSetTreeStatus()
247+ */
248+ static void
249+ TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
250+ TransactionId * subxids , XidStatus status ,
251+ XLogRecPtr lsn , int pageno )
98252{
99- int pageno = TransactionIdToPage (xid );
100- int byteno = TransactionIdToByte (xid );
101- int bshift = TransactionIdToBIndex (xid ) * CLOG_BITS_PER_XACT ;
102253 int slotno ;
103- char * byteptr ;
104- char byteval ;
254+ int i ;
105255
106256 Assert (status == TRANSACTION_STATUS_COMMITTED ||
107257 status == TRANSACTION_STATUS_ABORTED ||
108- status == TRANSACTION_STATUS_SUB_COMMITTED );
258+ ( status == TRANSACTION_STATUS_SUB_COMMITTED && ! TransactionIdIsValid ( xid )) );
109259
110260 LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
111261
@@ -116,9 +266,62 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
116266 * mustn't let it reach disk until we've done the appropriate WAL flush.
117267 * But when lsn is invalid, it's OK to scribble on a page while it is
118268 * write-busy, since we don't care if the update reaches disk sooner than
119- * we think. Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
269+ * we think.
120270 */
121271 slotno = SimpleLruReadPage (ClogCtl , pageno , XLogRecPtrIsInvalid (lsn ), xid );
272+
273+ /*
274+ * Set the main transaction id, if any.
275+ *
276+ * If we update more than one xid on this page while it is being written
277+ * out, we might find that some of the bits go to disk and others don't.
278+ * If we are updating commits on the page with the top-level xid that could
279+ * break atomicity, so we subcommit the subxids first before we mark the
280+ * top-level commit.
281+ */
282+ if (TransactionIdIsValid (xid ))
283+ {
284+ /* Subtransactions first, if needed ... */
285+ if (status == TRANSACTION_STATUS_COMMITTED )
286+ {
287+ for (i = 0 ; i < nsubxids ; i ++ )
288+ {
289+ Assert (ClogCtl -> shared -> page_number [slotno ] == TransactionIdToPage (subxids [i ]));
290+ TransactionIdSetStatusBit (subxids [i ],
291+ TRANSACTION_STATUS_SUB_COMMITTED ,
292+ lsn , slotno );
293+ }
294+ }
295+
296+ /* ... then the main transaction */
297+ TransactionIdSetStatusBit (xid , status , lsn , slotno );
298+ }
299+
300+ /* Set the subtransactions */
301+ for (i = 0 ; i < nsubxids ; i ++ )
302+ {
303+ Assert (ClogCtl -> shared -> page_number [slotno ] == TransactionIdToPage (subxids [i ]));
304+ TransactionIdSetStatusBit (subxids [i ], status , lsn , slotno );
305+ }
306+
307+ ClogCtl -> shared -> page_dirty [slotno ] = true;
308+
309+ LWLockRelease (CLogControlLock );
310+ }
311+
312+ /*
313+ * Sets the commit status of a single transaction.
314+ *
315+ * Must be called with CLogControlLock held
316+ */
317+ static void
318+ TransactionIdSetStatusBit (TransactionId xid , XidStatus status , XLogRecPtr lsn , int slotno )
319+ {
320+ int byteno = TransactionIdToByte (xid );
321+ int bshift = TransactionIdToBIndex (xid ) * CLOG_BITS_PER_XACT ;
322+ char * byteptr ;
323+ char byteval ;
324+
122325 byteptr = ClogCtl -> shared -> page_buffer [slotno ] + byteno ;
123326
124327 /* Current state should be 0, subcommitted or target state */
@@ -132,8 +335,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
132335 byteval |= (status << bshift );
133336 * byteptr = byteval ;
134337
135- ClogCtl -> shared -> page_dirty [slotno ] = true;
136-
137338 /*
138339 * Update the group LSN if the transaction completion LSN is higher.
139340 *
@@ -149,8 +350,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
149350 if (XLByteLT (ClogCtl -> shared -> group_lsn [lsnindex ], lsn ))
150351 ClogCtl -> shared -> group_lsn [lsnindex ] = lsn ;
151352 }
152-
153- LWLockRelease (CLogControlLock );
154353}
155354
156355/*
0 commit comments