#define RELS_BSEARCH_THRESHOLD		20

/*
 * This is the size (in the number of blocks) above which we scan the
 * entire buffer pool to remove the buffers for all the pages of relation
 * being dropped. For the relations with size below this threshold, we find
 * the buffers by doing lookups in BufMapping table.
 */
#define BUF_DROP_FULL_SCAN_THRESHOLD		(uint32) (NBuffers / 32)
7381typedef struct PrivateRefCountEntry
7482{
7583 Buffer buffer ;
@@ -473,6 +481,10 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
473481 BufferAccessStrategy strategy ,
474482 bool * foundPtr );
475483static void FlushBuffer (BufferDesc * buf , SMgrRelation reln );
484+ static void FindAndDropRelFileNodeBuffers (RelFileNode rnode ,
485+ ForkNumber forkNum ,
486+ BlockNumber nForkBlock ,
487+ BlockNumber firstDelBlock );
476488static void AtProcExit_Buffers (int code , Datum arg );
477489static void CheckForBufferLeaks (void );
478490static int rnode_comparator (const void * p1 , const void * p2 );
@@ -2965,19 +2977,19 @@ BufferGetLSNAtomic(Buffer buffer)
29652977 * later. It is also the responsibility of higher-level code to ensure
29662978 * that no other process could be trying to load more pages of the
29672979 * relation into buffers.
2968- *
2969- * XXX currently it sequentially searches the buffer pool, should be
2970- * changed to more clever ways of searching. However, this routine
2971- * is used only in code paths that aren't very performance-critical,
2972- * and we shouldn't slow down the hot paths to make it faster ...
29732980 * --------------------------------------------------------------------
29742981 */
29752982void
2976- DropRelFileNodeBuffers (RelFileNodeBackend rnode , ForkNumber * forkNum ,
2983+ DropRelFileNodeBuffers (SMgrRelation smgr_reln , ForkNumber * forkNum ,
29772984 int nforks , BlockNumber * firstDelBlock )
29782985{
29792986 int i ;
29802987 int j ;
2988+ RelFileNodeBackend rnode ;
2989+ BlockNumber nForkBlock [MAX_FORKNUM ];
2990+ BlockNumber nBlocksToInvalidate = 0 ;
2991+
2992+ rnode = smgr_reln -> smgr_rnode ;
29812993
29822994 /* If it's a local relation, it's localbuf.c's problem. */
29832995 if (RelFileNodeBackendIsTemp (rnode ))
@@ -2991,6 +3003,56 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
29913003 return ;
29923004 }
29933005
3006+ /*
3007+ * To remove all the pages of the specified relation forks from the buffer
3008+ * pool, we need to scan the entire buffer pool but we can optimize it by
3009+ * finding the buffers from BufMapping table provided we know the exact
3010+ * size of each fork of the relation. The exact size is required to ensure
3011+ * that we don't leave any buffer for the relation being dropped as
3012+ * otherwise the background writer or checkpointer can lead to a PANIC
3013+ * error while flushing buffers corresponding to files that don't exist.
3014+ *
3015+ * To know the exact size, we rely on the size cached for each fork by us
3016+ * during recovery which limits the optimization to recovery and on
3017+ * standbys but we can easily extend it once we have shared cache for
3018+ * relation size.
3019+ *
3020+ * In recovery, we cache the value returned by the first lseek(SEEK_END)
3021+ * and the future writes keeps the cached value up-to-date. See
3022+ * smgrextend. It is possible that the value of the first lseek is smaller
3023+ * than the actual number of existing blocks in the file due to buggy
3024+ * Linux kernels that might not have accounted for the recent write. But
3025+ * that should be fine because there must not be any buffers after that
3026+ * file size.
3027+ */
3028+ for (i = 0 ; i < nforks ; i ++ )
3029+ {
3030+ /* Get the number of blocks for a relation's fork */
3031+ nForkBlock [i ] = smgrnblocks_cached (smgr_reln , forkNum [i ]);
3032+
3033+ if (nForkBlock [i ] == InvalidBlockNumber )
3034+ {
3035+ nBlocksToInvalidate = InvalidBlockNumber ;
3036+ break ;
3037+ }
3038+
3039+ /* calculate the number of blocks to be invalidated */
3040+ nBlocksToInvalidate += (nForkBlock [i ] - firstDelBlock [i ]);
3041+ }
3042+
3043+ /*
3044+ * We apply the optimization iff the total number of blocks to invalidate
3045+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3046+ */
3047+ if (BlockNumberIsValid (nBlocksToInvalidate ) &&
3048+ nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD )
3049+ {
3050+ for (j = 0 ; j < nforks ; j ++ )
3051+ FindAndDropRelFileNodeBuffers (rnode .node , forkNum [j ],
3052+ nForkBlock [j ], firstDelBlock [j ]);
3053+ return ;
3054+ }
3055+
29943056 for (i = 0 ; i < NBuffers ; i ++ )
29953057 {
29963058 BufferDesc * bufHdr = GetBufferDescriptor (i );
@@ -3133,6 +3195,65 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
31333195 pfree (nodes );
31343196}
31353197
3198+ /* ---------------------------------------------------------------------
3199+ * FindAndDropRelFileNodeBuffers
3200+ *
3201+ * This function performs look up in BufMapping table and removes from the
3202+ * buffer pool all the pages of the specified relation fork that has block
3203+ * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
3204+ * pages are removed.)
3205+ * --------------------------------------------------------------------
3206+ */
3207+ static void
3208+ FindAndDropRelFileNodeBuffers (RelFileNode rnode , ForkNumber forkNum ,
3209+ BlockNumber nForkBlock ,
3210+ BlockNumber firstDelBlock )
3211+ {
3212+ BlockNumber curBlock ;
3213+
3214+ for (curBlock = firstDelBlock ; curBlock < nForkBlock ; curBlock ++ )
3215+ {
3216+ uint32 bufHash ; /* hash value for tag */
3217+ BufferTag bufTag ; /* identity of requested block */
3218+ LWLock * bufPartitionLock ; /* buffer partition lock for it */
3219+ int buf_id ;
3220+ BufferDesc * bufHdr ;
3221+ uint32 buf_state ;
3222+
3223+ /* create a tag so we can lookup the buffer */
3224+ INIT_BUFFERTAG (bufTag , rnode , forkNum , curBlock );
3225+
3226+ /* determine its hash code and partition lock ID */
3227+ bufHash = BufTableHashCode (& bufTag );
3228+ bufPartitionLock = BufMappingPartitionLock (bufHash );
3229+
3230+ /* Check that it is in the buffer pool. If not, do nothing. */
3231+ LWLockAcquire (bufPartitionLock , LW_SHARED );
3232+ buf_id = BufTableLookup (& bufTag , bufHash );
3233+ LWLockRelease (bufPartitionLock );
3234+
3235+ if (buf_id < 0 )
3236+ continue ;
3237+
3238+ bufHdr = GetBufferDescriptor (buf_id );
3239+
3240+ /*
3241+ * We need to lock the buffer header and recheck if the buffer is
3242+ * still associated with the same block because the buffer could be
3243+ * evicted by some other backend loading blocks for a different
3244+ * relation after we release lock on the BufMapping table.
3245+ */
3246+ buf_state = LockBufHdr (bufHdr );
3247+
3248+ if (RelFileNodeEquals (bufHdr -> tag .rnode , rnode ) &&
3249+ bufHdr -> tag .forkNum == forkNum &&
3250+ bufHdr -> tag .blockNum >= firstDelBlock )
3251+ InvalidateBuffer (bufHdr ); /* releases spinlock */
3252+ else
3253+ UnlockBufHdr (bufHdr , buf_state );
3254+ }
3255+ }
3256+
31363257/* ---------------------------------------------------------------------
31373258 * DropDatabaseBuffers
31383259 *
@@ -3245,8 +3366,7 @@ PrintPinnedBufs(void)
32453366 * XXX currently it sequentially searches the buffer pool, should be
32463367 * changed to more clever ways of searching. This routine is not
32473368 * used in any performance-critical code paths, so it's not worth
3248- * adding additional overhead to normal paths to make it go faster;
3249- * but see also DropRelFileNodeBuffers.
3369+ * adding additional overhead to normal paths to make it go faster.
32503370 * --------------------------------------------------------------------
32513371 */
32523372void
0 commit comments