103103
104104#include "access/brin.h"
105105#include "access/gin.h"
106- #include "access/htup_details.h"
107- #include "access/sysattr.h"
108106#include "access/table.h"
109107#include "access/tableam.h"
110- #include "catalog/index .h"
108+ #include "access/visibilitymap .h"
111109#include "catalog/pg_am.h"
112110#include "catalog/pg_collation.h"
113111#include "catalog/pg_operator.h"
114112#include "catalog/pg_statistic.h"
115113#include "catalog/pg_statistic_ext.h"
116- #include "executor/executor.h"
117114#include "executor/nodeAgg.h"
118115#include "miscadmin.h"
119116#include "nodes/makefuncs.h"
127124#include "parser/parse_clause.h"
128125#include "parser/parsetree.h"
129126#include "statistics/statistics.h"
127+ #include "storage/bufmgr.h"
130128#include "utils/builtins.h"
131129#include "utils/date.h"
132130#include "utils/datum.h"
133131#include "utils/fmgroids.h"
134132#include "utils/index_selfuncs.h"
135133#include "utils/lsyscache.h"
134+ #include "utils/memutils.h"
136135#include "utils/pg_locale.h"
137136#include "utils/rel.h"
138137#include "utils/selfuncs.h"
@@ -198,6 +197,15 @@ static bool get_actual_variable_range(PlannerInfo *root,
198197 VariableStatData * vardata ,
199198 Oid sortop ,
200199 Datum * min , Datum * max );
200+ static bool get_actual_variable_endpoint (Relation heapRel ,
201+ Relation indexRel ,
202+ ScanDirection indexscandir ,
203+ ScanKey scankeys ,
204+ int16 typLen ,
205+ bool typByVal ,
206+ TupleTableSlot * tableslot ,
207+ MemoryContext outercontext ,
208+ Datum * endpointDatum );
201209static RelOptInfo * find_join_input_rel (PlannerInfo * root , Relids relids );
202210
203211
@@ -5180,30 +5188,23 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
51805188 }
51815189
51825190 /*
5183- * Found a suitable index to extract data from. We'll need an EState
5184- * and a bunch of other infrastructure .
5191+ * Found a suitable index to extract data from. Set up some data that
5192+ * can be used by both invocations of get_actual_variable_endpoint .
51855193 */
51865194 {
5187- EState * estate ;
5188- ExprContext * econtext ;
51895195 MemoryContext tmpcontext ;
51905196 MemoryContext oldcontext ;
51915197 Relation heapRel ;
51925198 Relation indexRel ;
5193- IndexInfo * indexInfo ;
51945199 TupleTableSlot * slot ;
51955200 int16 typLen ;
51965201 bool typByVal ;
51975202 ScanKeyData scankeys [1 ];
5198- IndexScanDesc index_scan ;
5199- Datum values [INDEX_MAX_KEYS ];
5200- bool isnull [INDEX_MAX_KEYS ];
5201- SnapshotData SnapshotNonVacuumable ;
5202-
5203- estate = CreateExecutorState ();
5204- econtext = GetPerTupleExprContext (estate );
5205- /* Make sure any cruft is generated in the econtext's memory */
5206- tmpcontext = econtext -> ecxt_per_tuple_memory ;
5203+
5204+ /* Make sure any cruft gets recycled when we're done */
5205+ tmpcontext = AllocSetContextCreate (CurrentMemoryContext ,
5206+ "get_actual_variable_range workspace" ,
5207+ ALLOCSET_DEFAULT_SIZES );
52075208 oldcontext = MemoryContextSwitchTo (tmpcontext );
52085209
52095210 /*
@@ -5213,14 +5214,9 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
52135214 heapRel = table_open (rte -> relid , NoLock );
52145215 indexRel = index_open (index -> indexoid , NoLock );
52155216
5216- /* extract index key information from the index's pg_index info */
5217- indexInfo = BuildIndexInfo (indexRel );
5218-
5219- /* some other stuff */
5217+ /* build some stuff needed for indexscan execution */
52205218 slot = table_slot_create (heapRel , NULL );
5221- econtext -> ecxt_scantuple = slot ;
52225219 get_typlenbyval (vardata -> atttype , & typLen , & typByVal );
5223- InitNonVacuumableSnapshot (SnapshotNonVacuumable , RecentGlobalXmin );
52245220
52255221 /* set up an IS NOT NULL scan key so that we ignore nulls */
52265222 ScanKeyEntryInitialize (& scankeys [0 ],
@@ -5232,94 +5228,38 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
52325228 InvalidOid , /* no reg proc for this */
52335229 (Datum ) 0 ); /* constant */
52345230
5235- have_data = true;
5236-
52375231 /* If min is requested ... */
52385232 if (min )
52395233 {
5240- /*
5241- * In principle, we should scan the index with our current
5242- * active snapshot, which is the best approximation we've got
5243- * to what the query will see when executed. But that won't
5244- * be exact if a new snap is taken before running the query,
5245- * and it can be very expensive if a lot of recently-dead or
5246- * uncommitted rows exist at the beginning or end of the index
5247- * (because we'll laboriously fetch each one and reject it).
5248- * Instead, we use SnapshotNonVacuumable. That will accept
5249- * recently-dead and uncommitted rows as well as normal
5250- * visible rows. On the other hand, it will reject known-dead
5251- * rows, and thus not give a bogus answer when the extreme
5252- * value has been deleted (unless the deletion was quite
5253- * recent); that case motivates not using SnapshotAny here.
5254- *
5255- * A crucial point here is that SnapshotNonVacuumable, with
5256- * RecentGlobalXmin as horizon, yields the inverse of the
5257- * condition that the indexscan will use to decide that index
5258- * entries are killable (see heap_hot_search_buffer()).
5259- * Therefore, if the snapshot rejects a tuple and we have to
5260- * continue scanning past it, we know that the indexscan will
5261- * mark that index entry killed. That means that the next
5262- * get_actual_variable_range() call will not have to visit
5263- * that heap entry. In this way we avoid repetitive work when
5264- * this function is used a lot during planning.
5265- */
5266- index_scan = index_beginscan (heapRel , indexRel ,
5267- & SnapshotNonVacuumable ,
5268- 1 , 0 );
5269- index_rescan (index_scan , scankeys , 1 , NULL , 0 );
5270-
5271- /* Fetch first tuple in sortop's direction */
5272- if (index_getnext_slot (index_scan , indexscandir , slot ))
5273- {
5274- /* Extract the index column values from the slot */
5275- FormIndexDatum (indexInfo , slot , estate ,
5276- values , isnull );
5277-
5278- /* Shouldn't have got a null, but be careful */
5279- if (isnull [0 ])
5280- elog (ERROR , "found unexpected null value in index \"%s\"" ,
5281- RelationGetRelationName (indexRel ));
5282-
5283- /* Copy the index column value out to caller's context */
5284- MemoryContextSwitchTo (oldcontext );
5285- * min = datumCopy (values [0 ], typByVal , typLen );
5286- MemoryContextSwitchTo (tmpcontext );
5287- }
5288- else
5289- have_data = false;
5290-
5291- index_endscan (index_scan );
5234+ have_data = get_actual_variable_endpoint (heapRel ,
5235+ indexRel ,
5236+ indexscandir ,
5237+ scankeys ,
5238+ typLen ,
5239+ typByVal ,
5240+ slot ,
5241+ oldcontext ,
5242+ min );
5243+ }
5244+ else
5245+ {
5246+ /* If min not requested, assume index is nonempty */
5247+ have_data = true;
52925248 }
52935249
52945250 /* If max is requested, and we didn't find the index is empty */
52955251 if (max && have_data )
52965252 {
5297- index_scan = index_beginscan (heapRel , indexRel ,
5298- & SnapshotNonVacuumable ,
5299- 1 , 0 );
5300- index_rescan (index_scan , scankeys , 1 , NULL , 0 );
5301-
5302- /* Fetch first tuple in reverse direction */
5303- if (index_getnext_slot (index_scan , - indexscandir , slot ))
5304- {
5305- /* Extract the index column values from the slot */
5306- FormIndexDatum (indexInfo , slot , estate ,
5307- values , isnull );
5308-
5309- /* Shouldn't have got a null, but be careful */
5310- if (isnull [0 ])
5311- elog (ERROR , "found unexpected null value in index \"%s\"" ,
5312- RelationGetRelationName (indexRel ));
5313-
5314- /* Copy the index column value out to caller's context */
5315- MemoryContextSwitchTo (oldcontext );
5316- * max = datumCopy (values [0 ], typByVal , typLen );
5317- MemoryContextSwitchTo (tmpcontext );
5318- }
5319- else
5320- have_data = false;
5321-
5322- index_endscan (index_scan );
5253+ /* scan in the opposite direction; all else is the same */
5254+ have_data = get_actual_variable_endpoint (heapRel ,
5255+ indexRel ,
5256+ - indexscandir ,
5257+ scankeys ,
5258+ typLen ,
5259+ typByVal ,
5260+ slot ,
5261+ oldcontext ,
5262+ max );
53235263 }
53245264
53255265 /* Clean everything up */
@@ -5329,7 +5269,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
53295269 table_close (heapRel , NoLock );
53305270
53315271 MemoryContextSwitchTo (oldcontext );
5332- FreeExecutorState ( estate );
5272+ MemoryContextDelete ( tmpcontext );
53335273
53345274 /* And we're done */
53355275 break ;
@@ -5339,6 +5279,139 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
53395279 return have_data ;
53405280}
53415281
5282+ /*
5283+ * Get one endpoint datum (min or max depending on indexscandir) from the
5284+ * specified index. Return true if successful, false if index is empty.
5285+ * On success, endpoint value is stored to *endpointDatum (and copied into
5286+ * outercontext).
5287+ *
5288+ * scankeys is a 1-element scankey array set up to reject nulls.
5289+ * typLen/typByVal describe the datatype of the index's first column.
5290+ * tableslot is a slot suitable to hold table tuples, in case we need
5291+ * to probe the heap.
5292+ * (We could compute these values locally, but that would mean computing them
5293+ * twice when get_actual_variable_range needs both the min and the max.)
5294+ */
5295+ static bool
5296+ get_actual_variable_endpoint (Relation heapRel ,
5297+ Relation indexRel ,
5298+ ScanDirection indexscandir ,
5299+ ScanKey scankeys ,
5300+ int16 typLen ,
5301+ bool typByVal ,
5302+ TupleTableSlot * tableslot ,
5303+ MemoryContext outercontext ,
5304+ Datum * endpointDatum )
5305+ {
5306+ bool have_data = false;
5307+ SnapshotData SnapshotNonVacuumable ;
5308+ IndexScanDesc index_scan ;
5309+ Buffer vmbuffer = InvalidBuffer ;
5310+ ItemPointer tid ;
5311+ Datum values [INDEX_MAX_KEYS ];
5312+ bool isnull [INDEX_MAX_KEYS ];
5313+ MemoryContext oldcontext ;
5314+
5315+ /*
5316+ * We use the index-only-scan machinery for this. With mostly-static
5317+ * tables that's a win because it avoids a heap visit. It's also a win
5318+ * for dynamic data, but the reason is less obvious; read on for details.
5319+ *
5320+ * In principle, we should scan the index with our current active
5321+ * snapshot, which is the best approximation we've got to what the query
5322+ * will see when executed. But that won't be exact if a new snap is taken
5323+ * before running the query, and it can be very expensive if a lot of
5324+ * recently-dead or uncommitted rows exist at the beginning or end of the
5325+ * index (because we'll laboriously fetch each one and reject it).
5326+ * Instead, we use SnapshotNonVacuumable. That will accept recently-dead
5327+ * and uncommitted rows as well as normal visible rows. On the other
5328+ * hand, it will reject known-dead rows, and thus not give a bogus answer
5329+ * when the extreme value has been deleted (unless the deletion was quite
5330+ * recent); that case motivates not using SnapshotAny here.
5331+ *
5332+ * A crucial point here is that SnapshotNonVacuumable, with
5333+ * RecentGlobalXmin as horizon, yields the inverse of the condition that
5334+ * the indexscan will use to decide that index entries are killable (see
5335+ * heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple
5336+ * (or more precisely, all tuples of a HOT chain) and we have to continue
5337+ * scanning past it, we know that the indexscan will mark that index entry
5338+ * killed. That means that the next get_actual_variable_endpoint() call
5339+ * will not have to re-consider that index entry. In this way we avoid
5340+ * repetitive work when this function is used a lot during planning.
5341+ *
5342+ * But using SnapshotNonVacuumable creates a hazard of its own. In a
5343+ * recently-created index, some index entries may point at "broken" HOT
5344+ * chains in which not all the tuple versions contain data matching the
5345+ * index entry. The live tuple version(s) certainly do match the index,
5346+ * but SnapshotNonVacuumable can accept recently-dead tuple versions that
5347+ * don't match. Hence, if we took data from the selected heap tuple, we
5348+ * might get a bogus answer that's not close to the index extremal value,
5349+ * or could even be NULL. We avoid this hazard because we take the data
5350+ * from the index entry not the heap.
5351+ */
5352+ InitNonVacuumableSnapshot (SnapshotNonVacuumable , RecentGlobalXmin );
5353+
5354+ index_scan = index_beginscan (heapRel , indexRel ,
5355+ & SnapshotNonVacuumable ,
5356+ 1 , 0 );
5357+ /* Set it up for index-only scan */
5358+ index_scan -> xs_want_itup = true;
5359+ index_rescan (index_scan , scankeys , 1 , NULL , 0 );
5360+
5361+ /* Fetch first/next tuple in specified direction */
5362+ while ((tid = index_getnext_tid (index_scan , indexscandir )) != NULL )
5363+ {
5364+ if (!VM_ALL_VISIBLE (heapRel ,
5365+ ItemPointerGetBlockNumber (tid ),
5366+ & vmbuffer ))
5367+ {
5368+ /* Rats, we have to visit the heap to check visibility */
5369+ if (!index_fetch_heap (index_scan , tableslot ))
5370+ continue ; /* no visible tuple, try next index entry */
5371+
5372+ /* We don't actually need the heap tuple for anything */
5373+ ExecClearTuple (tableslot );
5374+
5375+ /*
5376+ * We don't care whether there's more than one visible tuple in
5377+ * the HOT chain; if any are visible, that's good enough.
5378+ */
5379+ }
5380+
5381+ /*
5382+ * We expect that btree will return data in IndexTuple not HeapTuple
5383+ * format. It's not lossy either.
5384+ */
5385+ if (!index_scan -> xs_itup )
5386+ elog (ERROR , "no data returned for index-only scan" );
5387+ if (index_scan -> xs_recheck )
5388+ elog (ERROR , "unexpected recheck indication from btree" );
5389+
5390+ /* OK to deconstruct the index tuple */
5391+ index_deform_tuple (index_scan -> xs_itup ,
5392+ index_scan -> xs_itupdesc ,
5393+ values , isnull );
5394+
5395+ /* Shouldn't have got a null, but be careful */
5396+ if (isnull [0 ])
5397+ elog (ERROR , "found unexpected null value in index \"%s\"" ,
5398+ RelationGetRelationName (indexRel ));
5399+
5400+ /* Copy the index column value out to caller's context */
5401+ oldcontext = MemoryContextSwitchTo (outercontext );
5402+ * endpointDatum = datumCopy (values [0 ], typByVal , typLen );
5403+ MemoryContextSwitchTo (oldcontext );
5404+ have_data = true;
5405+ break ;
5406+ }
5407+
5408+ if (vmbuffer != InvalidBuffer )
5409+ ReleaseBuffer (vmbuffer );
5410+ index_endscan (index_scan );
5411+
5412+ return have_data ;
5413+ }
5414+
53425415/*
53435416 * find_join_input_rel
53445417 * Look up the input relation for a join.
0 commit comments