  */
 #include "postgres.h"
 
+#include "port/atomics.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/proc.h"
+
+#define INT_ACCESS_ONCE(var)    ((int) (*((volatile int *) &(var))))
 
 
 /*
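
The INT_ACCESS_ONCE() macro added above forces exactly one load from shared memory: casting through a volatile-qualified pointer stops the compiler from caching the value in a register or rereading it later. A minimal standalone sketch of the same idiom; the shared_flag variable is illustrative, not from the patch (on the platforms PostgreSQL supports, an aligned int load is a single untorn read):

#include <stdio.h>

/* read (var) exactly once, via a volatile-qualified pointer */
#define INT_ACCESS_ONCE(var) ((int) (*((volatile int *) &(var))))

static int shared_flag = 42;    /* stand-in for a field in shared memory */

int
main(void)
{
    int snapshot = INT_ACCESS_ONCE(shared_flag);    /* one load; reuse the snapshot */

    printf("saw %d\n", snapshot);
    return 0;
}
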
@@ -27,8 +31,12 @@ typedef struct
     /* Spinlock: protects the values below */
     slock_t     buffer_strategy_lock;
 
-    /* Clock sweep hand: index of next buffer to consider grabbing */
-    int         nextVictimBuffer;
+    /*
+     * Clock sweep hand: index of next buffer to consider grabbing. Note that
+     * this isn't a concrete buffer - we only ever increase the value. So, to
+     * get an actual buffer, it needs to be used modulo NBuffers.
+     */
+    pg_atomic_uint32 nextVictimBuffer;
 
     int         firstFreeBuffer;    /* Head of list of unused buffers */
     int         lastFreeBuffer;     /* Tail of list of unused buffers */
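
Since the new hand only ever increases, the mapping described in the comment above is easy to see in a toy loop: the counter modulo NBuffers is the buffer id, and the integer quotient counts completed sweeps. Illustration only, not patch code (NBuffers shrunk to 4):

#include <stdio.h>

int
main(void)
{
    const unsigned n_buffers = 4;   /* illustrative stand-in for NBuffers */

    /* ten ticks of a monotonically increasing clock hand */
    for (unsigned hand = 0; hand < 10; hand++)
        printf("hand=%u -> buffer %u, full passes %u\n",
               hand, hand % n_buffers, hand / n_buffers);
    return 0;
}
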
@@ -42,13 +50,14 @@ typedef struct
      * Statistics.  These counters should be wide enough that they can't
      * overflow during a single bgwriter cycle.
      */
-    uint32      completePasses; /* Complete cycles of the clock sweep */
-    uint32      numBufferAllocs;    /* Buffers allocated since last reset */
+    uint32      completePasses; /* Complete cycles of the clock sweep */
+    pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */
 
     /*
-     * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
+     * Bgwriter process to be notified upon activity, or -1 if none. See
+     * StrategyNotifyBgWriter.
      */
-    Latch      *bgwriterLatch;
+    int         bgwprocno;
 } BufferStrategyControl;
 
 /* Pointers to shared state */
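
Replacing the Latch pointer with a PGPROC index changes the failure mode of a stale read: an index into ProcGlobal->allProcs, which is allocated once and never freed, can at worst wake the wrong process, whereas a dangling pointer could touch freed memory. A toy model of that property (all names here are made up for illustration):

#include <stdio.h>

typedef struct
{
    int latch_is_set;
} FakeProc;

static FakeProc all_procs[8];   /* lives for the whole program, like allProcs */

static void
notify(int procno)
{
    /* a stale but in-range procno is memory-safe; -1 means nobody to wake */
    if (procno != -1)
        all_procs[procno].latch_is_set = 1;
}

int
main(void)
{
    notify(3);
    printf("slot 3 set: %d\n", all_procs[3].latch_is_set);
    return 0;
}
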
@@ -93,6 +102,70 @@ static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
 static void AddBufferToRing(BufferAccessStrategy strategy,
                 volatile BufferDesc *buf);
 
+/*
+ * ClockSweepTick - Helper routine for StrategyGetBuffer()
+ *
+ * Move the clock hand one buffer ahead of its current position and return the
+ * id of the buffer now under the hand.
+ */
+static inline uint32
+ClockSweepTick(void)
+{
+    uint32      victim;
+
+    /*
+     * Atomically move the hand ahead one buffer - if there are several
+     * processes doing this, buffers can be returned slightly out of apparent
+     * order.
+     */
+    victim =
+        pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+
+    if (victim >= NBuffers)
+    {
+        uint32      originalVictim = victim;
+
+        /* always wrap what we look up in BufferDescriptors */
+        victim = victim % NBuffers;
+
+        /*
+         * If we're the one that just caused a wraparound, force
+         * completePasses to be incremented while holding the spinlock. We
+         * need the spinlock so StrategySyncStart() can return a consistent
+         * value consisting of nextVictimBuffer and completePasses.
+         */
+        if (victim == 0)
+        {
+            uint32      expected;
+            uint32      wrapped;
+            bool        success = false;
+
+            expected = originalVictim + 1;
+
+            while (!success)
+            {
+                /*
+                 * Acquire the spinlock while increasing completePasses. That
+                 * allows other readers to read nextVictimBuffer and
+                 * completePasses in a consistent manner, which is required
+                 * for StrategySyncStart().  In theory delaying the increment
+                 * could lead to an overflow of nextVictimBuffer, but that's
+                 * highly unlikely and wouldn't be particularly harmful.
+                 */
+                SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+                wrapped = expected % NBuffers;
+
+                success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+                                                         &expected, wrapped);
+                if (success)
+                    StrategyControl->completePasses++;
+                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+            }
+        }
+    }
+    return victim;
+}
 
 /*
  * StrategyGetBuffer
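
The wraparound dance in ClockSweepTick() is easier to follow as a compact standalone model. The sketch below uses C11 <stdatomic.h> and a pthread mutex in place of the pg_atomic_* API and the buffer_strategy_lock spinlock; all names and the driver in main() are made up for illustration:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define N_BUFFERS 128           /* stand-in for NBuffers */

static _Atomic uint32_t next_victim;    /* monotonically increasing hand */
static uint32_t complete_passes;        /* protected by passes_lock */
static pthread_mutex_t passes_lock = PTHREAD_MUTEX_INITIALIZER;

static uint32_t
clock_sweep_tick(void)
{
    /* fetch_add returns the pre-increment value: our ticket */
    uint32_t victim = atomic_fetch_add(&next_victim, 1);

    if (victim >= N_BUFFERS)
    {
        uint32_t expected = victim + 1; /* counter value our increment produced */

        victim %= N_BUFFERS;

        /*
         * The process whose ticket sits exactly on a multiple of N_BUFFERS
         * folds the counter back into range and bumps complete_passes, both
         * under the lock so readers see the pair consistently.  The CAS
         * fails if other ticks advanced the counter meanwhile; `expected`
         * is then refreshed to the current value and we retry.
         */
        if (victim == 0)
        {
            bool success = false;

            while (!success)
            {
                pthread_mutex_lock(&passes_lock);
                success = atomic_compare_exchange_strong(&next_victim,
                                                         &expected,
                                                         expected % N_BUFFERS);
                if (success)
                    complete_passes++;
                pthread_mutex_unlock(&passes_lock);
            }
        }
    }
    return victim;
}

int
main(void)
{
    for (int i = 0; i < 300; i++)
        clock_sweep_tick();
    printf("hand %u, passes %u\n",
           (unsigned) atomic_load(&next_victim),
           (unsigned) complete_passes);  /* prints: hand 44, passes 2 */
    return 0;
}
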
@@ -110,7 +183,7 @@ volatile BufferDesc *
 StrategyGetBuffer(BufferAccessStrategy strategy)
 {
     volatile BufferDesc *buf;
-    Latch      *bgwriterLatch;
+    int         bgwprocno;
     int         trycounter;
 
     /*
@@ -124,86 +197,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
         return buf;
     }
 
-    /* Nope, so lock the freelist */
-    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+    /*
+     * If asked, we need to wake the bgwriter. Since we don't want to rely on
+     * a spinlock for this we force a read from shared memory once, and then
+     * set the latch based on that value. We need to go to these lengths
+     * because otherwise bgwprocno might be reset while/after we check, as
+     * the compiler might just reread the value from memory.
+     *
+     * This can possibly set the latch of the wrong process if the bgwriter
+     * dies in the wrong moment. But since PGPROC->procLatch is never
+     * deallocated the worst consequence of that is that we set the latch of
+     * some arbitrary process.
+     */
+    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
+    if (bgwprocno != -1)
+    {
+        /* reset bgwprocno first, before setting the latch */
+        StrategyControl->bgwprocno = -1;
+        pg_write_barrier();
+
+        /*
+         * Not acquiring ProcArrayLock here is slightly icky, but it's fine
+         * because procLatch isn't ever freed, so at worst we set the wrong
+         * process' (or no process') latch.
+         */
+        SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+    }
 
     /*
      * We count buffer allocation requests so that the bgwriter can estimate
      * the rate of buffer consumption.  Note that buffers recycled by a
      * strategy object are intentionally not counted here.
      */
-    StrategyControl->numBufferAllocs++;
+    pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
     /*
-     * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-     * not do so while holding buffer_strategy_lock; so release and re-grab.
-     * This is annoyingly tedious, but it happens at most once per bgwriter
-     * cycle, so the performance hit is minimal.
+     * First check, without acquiring the lock, whether there are buffers in
+     * the freelist. Since we otherwise don't require the spinlock in every
+     * StrategyGetBuffer() invocation, it would be a shame to acquire it here -
+     * uselessly in most cases. That obviously leaves a race where a buffer is
+     * put on the freelist but we don't see the store yet - but that's pretty
+     * harmless, the buffer will just get used during the next allocation.
+     *
+     * If there are buffers on the freelist, acquire the spinlock to pop one
+     * buffer off the freelist. Then check whether that buffer is usable and
+     * repeat if not.
+     *
+     * Note that the freeNext fields are considered to be protected by the
+     * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
+     * to manipulate them without holding the buffer spinlock.
      */
-    bgwriterLatch = StrategyControl->bgwriterLatch;
-    if (bgwriterLatch)
+    if (StrategyControl->firstFreeBuffer >= 0)
     {
-        StrategyControl->bgwriterLatch = NULL;
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-        SetLatch(bgwriterLatch);
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    }
+        while (true)
+        {
+            /* Acquire the spinlock to remove element from the freelist */
+            SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-    /*
-     * Try to get a buffer from the freelist.  Note that the freeNext fields
-     * are considered to be protected by the buffer_strategy_lock not the
-     * individual buffer spinlocks, so it's OK to manipulate them without
-     * holding the spinlock.
-     */
-    while (StrategyControl->firstFreeBuffer >= 0)
-    {
-        buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-        Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+            if (StrategyControl->firstFreeBuffer < 0)
+            {
+                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+                break;
+            }
 
-        /* Unconditionally remove buffer from freelist */
-        StrategyControl->firstFreeBuffer = buf->freeNext;
-        buf->freeNext = FREENEXT_NOT_IN_LIST;
+            buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+            Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
-        /*
-         * Release the lock so someone else can access the freelist (or run
-         * the clocksweep) while we check out this buffer.
-         */
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+            /* Unconditionally remove buffer from freelist */
+            StrategyControl->firstFreeBuffer = buf->freeNext;
+            buf->freeNext = FREENEXT_NOT_IN_LIST;
 
-        /*
-         * If the buffer is pinned or has a nonzero usage_count, we cannot use
-         * it; discard it and retry.  (This can only happen if VACUUM put a
-         * valid buffer in the freelist and then someone else used it before
-         * we got to it.  It's probably impossible altogether as of 8.3, but
-         * we'd better check anyway.)
-         */
-        LockBufHdr(buf);
-        if (buf->refcount == 0 && buf->usage_count == 0)
-        {
-            if (strategy != NULL)
-                AddBufferToRing(strategy, buf);
-            return buf;
-        }
-        UnlockBufHdr(buf);
+            /*
+             * Release the lock so someone else can access the freelist while
+             * we check out this buffer.
+             */
+            SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 
-        /* Reacquire the lock and go around for another pass. */
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+            /*
+             * If the buffer is pinned or has a nonzero usage_count, we cannot
+             * use it; discard it and retry.  (This can only happen if VACUUM
+             * put a valid buffer in the freelist and then someone else used
+             * it before we got to it.  It's probably impossible altogether as
+             * of 8.3, but we'd better check anyway.)
+             */
+            LockBufHdr(buf);
+            if (buf->refcount == 0 && buf->usage_count == 0)
+            {
+                if (strategy != NULL)
+                    AddBufferToRing(strategy, buf);
+                return buf;
+            }
+            UnlockBufHdr(buf);
+        }
     }
 
     /* Nothing on the freelist, so run the "clock sweep" algorithm */
     trycounter = NBuffers;
     for (;;)
     {
-        buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
-
-        if (++StrategyControl->nextVictimBuffer >= NBuffers)
-        {
-            StrategyControl->nextVictimBuffer = 0;
-            StrategyControl->completePasses++;
-        }
 
-        /* Release the lock before manipulating the candidate buffer. */
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+        buf = &BufferDescriptors[ClockSweepTick()];
 
         /*
          * If the buffer is pinned or has a nonzero usage_count, we cannot use
@@ -238,9 +332,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
             elog(ERROR, "no unpinned buffers available");
         }
         UnlockBufHdr(buf);
-
-        /* Reacquire the lock and get a new candidate buffer. */
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     }
 }
 
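
In the rewritten fast path above, StrategyGetBuffer() first peeks at firstFreeBuffer without the spinlock and only locks when the freelist looks non-empty, rechecking after acquisition. A simplified standalone sketch of that peek-then-lock-then-recheck pattern, with a pthread mutex modeling the spinlock and the retry-on-unusable-buffer part omitted (names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define INT_ACCESS_ONCE(var) ((int) (*((volatile int *) &(var))))

static int first_free = -1;     /* head of freelist, -1 when empty */
static int free_next[16];       /* links, protected by list_lock */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int
pop_free(void)
{
    int buf;

    /* unlocked peek: missing a concurrent push is harmless, the element
     * is simply found by the next caller */
    if (INT_ACCESS_ONCE(first_free) < 0)
        return -1;

    pthread_mutex_lock(&list_lock);

    /* recheck: the list may have been drained before we got the lock */
    if (first_free < 0)
    {
        pthread_mutex_unlock(&list_lock);
        return -1;
    }

    buf = first_free;
    first_free = free_next[buf];
    pthread_mutex_unlock(&list_lock);
    return buf;
}

int
main(void)
{
    int a, b;

    /* push a single element so the pop has something to find */
    free_next[5] = -1;
    first_free = 5;

    a = pop_free();
    b = pop_free();
    printf("popped %d, then %d\n", a, b);   /* prints: popped 5, then -1 */
    return 0;
}
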
@@ -281,16 +372,26 @@ StrategyFreeBuffer(volatile BufferDesc *buf)
 int
 StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 {
+    uint32      nextVictimBuffer;
     int         result;
 
     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    result = StrategyControl->nextVictimBuffer;
+    nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
+    result = nextVictimBuffer % NBuffers;
+
     if (complete_passes)
+    {
         *complete_passes = StrategyControl->completePasses;
+        /*
+         * Additionally add the number of wraparounds that happened before
+         * completePasses could be incremented. Cf. ClockSweepTick().
+         */
+        *complete_passes += nextVictimBuffer / NBuffers;
+    }
+
     if (num_buf_alloc)
     {
-        *num_buf_alloc = StrategyControl->numBufferAllocs;
-        StrategyControl->numBufferAllocs = 0;
+        *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
     }
     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     return result;
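
The division in the new StrategySyncStart() accounts for wraparounds whose completePasses increment is still pending: the counter is folded back below NBuffers by ClockSweepTick(), so any excess encodes not-yet-counted passes. A worked toy example with made-up numbers (NBuffers = 128):

#include <stdio.h>

int
main(void)
{
    const unsigned n_buffers = 128;     /* illustrative NBuffers */
    unsigned next_victim = 300;         /* snapshot of the atomic counter */
    unsigned complete_passes = 7;       /* value read under the spinlock */

    /* hand position, plus passes implied by the not-yet-folded counter */
    printf("victim = %u\n", next_victim % n_buffers);   /* 44 */
    printf("passes = %u\n", complete_passes + next_victim / n_buffers); /* 9 */
    return 0;
}
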
@@ -305,15 +406,15 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
  * from hibernation, and is not meant for anybody else to use.
  */
 void
-StrategyNotifyBgWriter(Latch *bgwriterLatch)
+StrategyNotifyBgWriter(int bgwprocno)
 {
     /*
      * We acquire buffer_strategy_lock just to ensure that the store appears
      * atomic to StrategyGetBuffer.  The bgwriter should call this rather
      * infrequently, so there's no performance penalty from being safe.
      */
     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    StrategyControl->bgwriterLatch = bgwriterLatch;
+    StrategyControl->bgwprocno = bgwprocno;
     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 }
 
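
On the caller side the bgwriter now hands over its own PGPROC index instead of a latch pointer. A sketch of the presumed usage around hibernation, based on how bgwriter.c drives this API (a fragment for illustration, not part of this diff):

/* before sleeping: ask to be woken at the next buffer allocation */
StrategyNotifyBgWriter(MyProc->pgprocno);

/* ... WaitLatch(...) ... */

/* after waking: withdraw the request in case nobody consumed it */
StrategyNotifyBgWriter(-1);
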
@@ -389,14 +490,14 @@ StrategyInitialize(bool init)
         StrategyControl->lastFreeBuffer = NBuffers - 1;
 
         /* Initialize the clock sweep pointer */
-        StrategyControl->nextVictimBuffer = 0;
+        pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
         /* Clear statistics */
         StrategyControl->completePasses = 0;
-        StrategyControl->numBufferAllocs = 0;
+        pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
         /* No pending notification */
-        StrategyControl->bgwriterLatch = NULL;
+        StrategyControl->bgwprocno = -1;
     }
     else
         Assert(!init);