3535#include "access/clog.h"
3636#include "access/slru.h"
3737#include "access/transam.h"
38+ #include "access/twophase.h"
3839#include "access/xlog.h"
3940#include "access/xloginsert.h"
4041#include "access/xlogutils.h"
4142#include "miscadmin.h"
4243#include "pg_trace.h"
44+ #include "storage/proc.h"
4345
4446/*
4547 * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
@@ -86,11 +88,17 @@ static void WriteZeroPageXlogRec(int pageno);
8688static void WriteTruncateXlogRec (int pageno );
8789static void TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
8890 TransactionId * subxids , XidStatus status ,
89- XLogRecPtr lsn , int pageno );
91+ XLogRecPtr lsn , int pageno ,
92+ bool all_xact_same_page );
9093static void TransactionIdSetStatusBit (TransactionId xid , XidStatus status ,
9194 XLogRecPtr lsn , int slotno );
9295static void set_status_by_pages (int nsubxids , TransactionId * subxids ,
9396 XidStatus status , XLogRecPtr lsn );
97+ static bool TransactionGroupUpdateXidStatus (TransactionId xid , XidStatus status ,
98+ XLogRecPtr lsn , int pageno );
99+ static void TransactionIdSetPageStatusInternal (TransactionId xid , int nsubxids ,
100+ TransactionId * subxids , XidStatus status ,
101+ XLogRecPtr lsn , int pageno );
94102
95103
96104/*
@@ -173,7 +181,7 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
173181 * Set the parent and all subtransactions in a single call
174182 */
175183 TransactionIdSetPageStatus (xid , nsubxids , subxids , status , lsn ,
176- pageno );
184+ pageno , true );
177185 }
178186 else
179187 {
@@ -200,7 +208,7 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
200208 */
201209 pageno = TransactionIdToPage (xid );
202210 TransactionIdSetPageStatus (xid , nsubxids_on_first_page , subxids , status ,
203- lsn , pageno );
211+ lsn , pageno , false );
204212
205213 /*
206214 * Now work through the rest of the subxids one clog page at a time,
@@ -238,7 +246,7 @@ set_status_by_pages(int nsubxids, TransactionId *subxids,
238246
239247 TransactionIdSetPageStatus (InvalidTransactionId ,
240248 num_on_page , subxids + offset ,
241- status , lsn , pageno );
249+ status , lsn , pageno , false );
242250 offset = i ;
243251 pageno = TransactionIdToPage (subxids [offset ]);
244252 }
@@ -248,21 +256,78 @@ set_status_by_pages(int nsubxids, TransactionId *subxids,
248256 * Record the final state of transaction entries in the commit log for
249257 * all entries on a single page. Atomic only on this page.
250258 *
251- * Otherwise API is same as TransactionIdSetTreeStatus()
259+ * When there is contention on CLogControlLock, we try to group multiple
260+ * updates; a single leader process will perform transaction status updates
261+ * for multiple backends so that the number of times CLogControlLock needs
262+ * to be acquired is reduced. We don't try to do this if a process has
263+ * overflowed the subxids array in its PGPROC, since in that case we
264+ * don't have a complete list of XIDs for it. We also skip it if a process
265+ * has XIDs on more than one CLOG page, or on a different CLOG page than
266+ * processes already waiting for a group update. This latter condition
267+ * has a race condition (see TransactionGroupUpdateXidStatus) but the
268+ * worst thing that happens if we mess up is a small loss of efficiency;
269+ * the intent is to avoid having the leader access pages it wouldn't
270+ * otherwise need to touch. Finally, we skip it for prepared transactions,
271+ * which don't have the semaphore we would need for this optimization,
272+ * and which are anyway probably not all that common.
252273 */
253274static void
254275TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
255276 TransactionId * subxids , XidStatus status ,
256- XLogRecPtr lsn , int pageno )
277+ XLogRecPtr lsn , int pageno ,
278+ bool all_xact_same_page )
279+ {
280+ if (all_xact_same_page &&
281+ nsubxids < PGPROC_MAX_CACHED_SUBXIDS &&
282+ !IsGXactActive ())
283+ {
284+ /*
285+ * If we can immediately acquire CLogControlLock, we update the status
286+ * of our own XID and release the lock. If not, try use group XID
287+ * update. If that doesn't work out, fall back to waiting for the
288+ * lock to perform an update for this transaction only.
289+ */
290+ if (LWLockConditionalAcquire (CLogControlLock , LW_EXCLUSIVE ))
291+ {
292+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
293+ LWLockRelease (CLogControlLock );
294+ }
295+ else if (!TransactionGroupUpdateXidStatus (xid , status , lsn , pageno ))
296+ {
297+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
298+
299+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
300+
301+ LWLockRelease (CLogControlLock );
302+ }
303+ }
304+ else
305+ {
306+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
307+
308+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
309+
310+ LWLockRelease (CLogControlLock );
311+ }
312+ }
313+
314+ /*
315+ * Record the final state of transaction entry in the commit log
316+ *
317+ * We don't do any locking here; caller must handle that.
318+ */
319+ static void
320+ TransactionIdSetPageStatusInternal (TransactionId xid , int nsubxids ,
321+ TransactionId * subxids , XidStatus status ,
322+ XLogRecPtr lsn , int pageno )
257323{
258324 int slotno ;
259325 int i ;
260326
261327 Assert (status == TRANSACTION_STATUS_COMMITTED ||
262328 status == TRANSACTION_STATUS_ABORTED ||
263329 (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid (xid )));
264-
265- LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
330+ Assert (LWLockHeldByMeInMode (CLogControlLock , LW_EXCLUSIVE ));
266331
267332 /*
268333 * If we're doing an async commit (ie, lsn is valid), then we must wait
@@ -310,8 +375,166 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
310375 }
311376
312377 ClogCtl -> shared -> page_dirty [slotno ] = true;
378+ }
313379
380+ /*
381+ * When we cannot immediately acquire CLogControlLock in exclusive mode at
382+ * commit time, add ourselves to a list of processes that need their XID
383+ * status updated. The first process to add itself to the list will acquire
384+ * CLogControlLock in exclusive mode and set transaction status as required
385+ * on behalf of all group members. This avoids a great deal of contention
386+ * around CLogControlLock when many processes are trying to commit at once,
387+ * since the lock need not be repeatedly handed off from one committing
388+ * process to the next.
389+ *
390+ * Returns true when transaction status has been updated in clog; returns
391+ * false if we decided against applying the optimization because the page
392+ * number we need to update differs from those processes already waiting.
393+ */
394+ static bool
395+ TransactionGroupUpdateXidStatus (TransactionId xid , XidStatus status ,
396+ XLogRecPtr lsn , int pageno )
397+ {
398+ volatile PROC_HDR * procglobal = ProcGlobal ;
399+ PGPROC * proc = MyProc ;
400+ uint32 nextidx ;
401+ uint32 wakeidx ;
402+
403+ /* We should definitely have an XID whose status needs to be updated. */
404+ Assert (TransactionIdIsValid (xid ));
405+
406+ /*
407+ * Add ourselves to the list of processes needing a group XID status
408+ * update.
409+ */
410+ proc -> clogGroupMember = true;
411+ proc -> clogGroupMemberXid = xid ;
412+ proc -> clogGroupMemberXidStatus = status ;
413+ proc -> clogGroupMemberPage = pageno ;
414+ proc -> clogGroupMemberLsn = lsn ;
415+
416+ nextidx = pg_atomic_read_u32 (& procglobal -> clogGroupFirst );
417+
418+ while (true)
419+ {
420+ /*
421+ * Add the proc to list, if the clog page where we need to update the
422+ * current transaction status is same as group leader's clog page.
423+ *
424+ * There is a race condition here, which is that after doing the below
425+ * check and before adding this proc's clog update to a group, the
426+ * group leader might have already finished the group update for this
427+ * page and becomes group leader of another group. This will lead to a
428+ * situation where a single group can have different clog page
429+ * updates. This isn't likely and will still work, just maybe a bit
430+ * less efficiently.
431+ */
432+ if (nextidx != INVALID_PGPROCNO &&
433+ ProcGlobal -> allProcs [nextidx ].clogGroupMemberPage != proc -> clogGroupMemberPage )
434+ {
435+ proc -> clogGroupMember = false;
436+ return false;
437+ }
438+
439+ pg_atomic_write_u32 (& proc -> clogGroupNext , nextidx );
440+
441+ if (pg_atomic_compare_exchange_u32 (& procglobal -> clogGroupFirst ,
442+ & nextidx ,
443+ (uint32 ) proc -> pgprocno ))
444+ break ;
445+ }
446+
447+ /*
448+ * If the list was not empty, the leader will update the status of our
449+ * XID. It is impossible to have followers without a leader because the
450+ * first process that has added itself to the list will always have
451+ * nextidx as INVALID_PGPROCNO.
452+ */
453+ if (nextidx != INVALID_PGPROCNO )
454+ {
455+ int extraWaits = 0 ;
456+
457+ /* Sleep until the leader updates our XID status. */
458+ for (;;)
459+ {
460+ /* acts as a read barrier */
461+ PGSemaphoreLock (proc -> sem );
462+ if (!proc -> clogGroupMember )
463+ break ;
464+ extraWaits ++ ;
465+ }
466+
467+ Assert (pg_atomic_read_u32 (& proc -> clogGroupNext ) == INVALID_PGPROCNO );
468+
469+ /* Fix semaphore count for any absorbed wakeups */
470+ while (extraWaits -- > 0 )
471+ PGSemaphoreUnlock (proc -> sem );
472+ return true;
473+ }
474+
475+ /* We are the leader. Acquire the lock on behalf of everyone. */
476+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
477+
478+ /*
479+ * Now that we've got the lock, clear the list of processes waiting for
480+ * group XID status update, saving a pointer to the head of the list.
481+ * Trying to pop elements one at a time could lead to an ABA problem.
482+ */
483+ nextidx = pg_atomic_exchange_u32 (& procglobal -> clogGroupFirst , INVALID_PGPROCNO );
484+
485+ /* Remember head of list so we can perform wakeups after dropping lock. */
486+ wakeidx = nextidx ;
487+
488+ /* Walk the list and update the status of all XIDs. */
489+ while (nextidx != INVALID_PGPROCNO )
490+ {
491+ PGPROC * proc = & ProcGlobal -> allProcs [nextidx ];
492+ PGXACT * pgxact = & ProcGlobal -> allPgXact [nextidx ];
493+
494+ /*
495+ * Overflowed transactions should not use group XID status update
496+ * mechanism.
497+ */
498+ Assert (!pgxact -> overflowed );
499+
500+ TransactionIdSetPageStatusInternal (proc -> clogGroupMemberXid ,
501+ pgxact -> nxids ,
502+ proc -> subxids .xids ,
503+ proc -> clogGroupMemberXidStatus ,
504+ proc -> clogGroupMemberLsn ,
505+ proc -> clogGroupMemberPage );
506+
507+ /* Move to next proc in list. */
508+ nextidx = pg_atomic_read_u32 (& proc -> clogGroupNext );
509+ }
510+
511+ /* We're done with the lock now. */
314512 LWLockRelease (CLogControlLock );
513+
514+ /*
515+ * Now that we've released the lock, go back and wake everybody up. We
516+ * don't do this under the lock so as to keep lock hold times to a
517+ * minimum. The system calls we need to perform to wake other processes
518+ * up are probably slower and can cause performance slowdown if done under
519+ * lock.
520+ */
521+ while (wakeidx != INVALID_PGPROCNO )
522+ {
523+ PGPROC * proc = & ProcGlobal -> allProcs [wakeidx ];
524+
525+ wakeidx = pg_atomic_read_u32 (& proc -> clogGroupNext );
526+ pg_atomic_write_u32 (& proc -> clogGroupNext , INVALID_PGPROCNO );
527+
528+ /* ensure all previous writes are visible before follower continues. */
529+ pg_write_barrier ();
530+
531+ proc -> clogGroupMember = false;
532+
533+ if (proc != MyProc )
534+ PGSemaphoreUnlock (proc -> sem );
535+ }
536+
537+ return true;
315538}
316539
317540/*
0 commit comments