@@ -155,15 +155,17 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
155155 numattrs ,
156156 ngroups ,
157157 nitems ;
158-
159- AttrNumber * attnums = build_attnums_array (attrs , & numattrs );
160-
158+ AttrNumber * attnums ;
159+ double mincount ;
161160 SortItem * items ;
162161 SortItem * groups ;
163162 MCVList * mcvlist = NULL ;
163+ MultiSortSupport mss ;
164+
165+ attnums = build_attnums_array (attrs , & numattrs );
164166
165167 /* comparator for all the columns */
166- MultiSortSupport mss = build_mss (stats , numattrs );
168+ mss = build_mss (stats , numattrs );
167169
168170 /* sort the rows */
169171 items = build_sorted_items (numrows , & nitems , rows , stats [0 ]-> tupDesc ,
@@ -196,33 +198,28 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
196198 * per-column frequencies, as if the columns were independent).
197199 *
198200 * Using the same algorithm might exclude items that are close to the
199- * "average" frequency. But it does not say whether the frequency is
200- * close to base frequency or not. We also need to consider unexpectedly
201- * uncommon items (compared to base frequency), and the single-column
202- * algorithm ignores that entirely .
201+ * "average" frequency of the sample. But that does not say whether the
202+ * observed frequency is close to the base frequency or not. We also
203+ * need to consider unexpectedly uncommon items (again, compared to the
204+ * base frequency), and the single-column algorithm does not have to.
203205 *
204- * If we can fit all the items onto the MCV list, do that. Otherwise
205- * use get_mincount_for_mcv_list to decide which items to keep in the
206- * MCV list, based on the number of occurrences in the sample .
206+ * We simply decide how many items to keep by computing the minimum count
207+ * using get_mincount_for_mcv_list() and then keep all items that seem
208+ * to be more common than that.
207209 */
208- if (ngroups > nitems )
209- {
210- double mincount ;
210+ mincount = get_mincount_for_mcv_list (numrows , totalrows );
211211
212- mincount = get_mincount_for_mcv_list ( numrows , totalrows );
213-
214- /*
215- * Walk the groups until we find the first group with a count below
216- * the mincount threshold (the index of that group is the number of
217- * groups we want to keep).
218- */
219- for ( i = 0 ; i < nitems ; i ++ )
212+ /*
213+ * Walk the groups until we find the first group with a count below
214+ * the mincount threshold (the index of that group is the number of
215+ * groups we want to keep).
216+ */
217+ for ( i = 0 ; i < nitems ; i ++ )
218+ {
219+ if ( groups [ i ]. count < mincount )
220220 {
221- if (groups [i ].count < mincount )
222- {
223- nitems = i ;
224- break ;
225- }
221+ nitems = i ;
222+ break ;
226223 }
227224 }
228225
@@ -469,11 +466,12 @@ statext_mcv_load(Oid mvoid)
469466 * Each attribute has to be processed separately, as we may be mixing different
470467 * datatypes, with different sort operators, etc.
471468 *
472- * We use uint16 values for the indexes in step (3), as we currently don't allow
473- * more than 8k MCV items anyway, although that's mostly arbitrary limit. We might
474- * increase this to 65k and still fit into uint16. Furthermore, this limit is on
475- * the number of distinct values per column, and we usually have few of those
476- * (and various combinations of them for the those MCV list). So uint16 seems fine.
469+ * We use uint16 values for the indexes in step (3), as the number of MCV items
470+ * is limited by the statistics target (which is capped to 10k at the moment).
471+ * We might increase this to 65k and still fit into uint16, so there's a bit of
472+ * slack. Furthermore, this limit is on the number of distinct values per column,
473+ * and we usually have few of those (and various combinations of them for the
474+ * MCV list). So uint16 seems fine for now.
477475 *
478476 * We don't really expect the serialization to save as much space as for
479477 * histograms, as we are not doing any bucket splits (which is the source
@@ -1322,7 +1320,7 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
13221320 * somewhat wasteful as we could do with just a single bit, thus reducing
13231321 * the size to ~1/8. It would also allow us to combine bitmaps simply using
13241322 * & and |, which should be faster than min/max. The bitmaps are fairly
1325- * small, though (as we cap the MCV list size to 8k items ).
1323+ * small, though (thanks to the cap on the MCV list size).
13261324 */
13271325static bool *
13281326mcv_get_match_bitmap (PlannerInfo * root , List * clauses ,
0 commit comments