@@ -1769,6 +1769,12 @@ static void compute_scalar_stats(VacAttrStatsP stats,
17691769 double totalrows );
17701770static int compare_scalars (const void * a , const void * b , void * arg );
17711771static int compare_mcvs (const void * a , const void * b );
1772+ static int analyze_mcv_list (int * mcv_counts ,
1773+ int num_mcv ,
1774+ double stadistinct ,
1775+ double stanullfrac ,
1776+ int samplerows ,
1777+ double totalrows );
17721778
17731779
17741780/*
@@ -2187,9 +2193,7 @@ compute_distinct_stats(VacAttrStatsP stats,
21872193 * we are able to generate a complete MCV list (all the values in the
21882194 * sample will fit, and we think these are all the ones in the table),
21892195 * then do so. Otherwise, store only those values that are
2190- * significantly more common than the (estimated) average. We set the
2191- * threshold rather arbitrarily at 25% more than average, with at
2192- * least 2 instances in the sample.
2196+ * significantly more common than the values not in the list.
21932197 *
21942198 * Note: the first of these cases is meant to address columns with
21952199 * small, fixed sets of possible values, such as boolean or enum
@@ -2198,8 +2202,7 @@ compute_distinct_stats(VacAttrStatsP stats,
21982202 * so and thus provide the planner with complete information. But if
21992203 * the MCV list is not complete, it's generally worth being more
22002204 * selective, and not just filling it all the way up to the stats
2201- * target. So for an incomplete list, we try to take only MCVs that
2202- * are significantly more common than average.
2205+ * target.
22032206 */
22042207 if (track_cnt < track_max && toowide_cnt == 0 &&
22052208 stats -> stadistinct > 0 &&
@@ -2210,28 +2213,22 @@ compute_distinct_stats(VacAttrStatsP stats,
22102213 }
22112214 else
22122215 {
2213- double ndistinct_table = stats -> stadistinct ;
2214- double avgcount ,
2215- mincount ;
2216-
2217- /* Re-extract estimate of # distinct nonnull values in table */
2218- if (ndistinct_table < 0 )
2219- ndistinct_table = - ndistinct_table * totalrows ;
2220- /* estimate # occurrences in sample of a typical nonnull value */
2221- avgcount = (double ) nonnull_cnt / ndistinct_table ;
2222- /* set minimum threshold count to store a value */
2223- mincount = avgcount * 1.25 ;
2224- if (mincount < 2 )
2225- mincount = 2 ;
2216+ int * mcv_counts ;
2217+
2218+ /* Incomplete list; decide how many values are worth keeping */
22262219 if (num_mcv > track_cnt )
22272220 num_mcv = track_cnt ;
2228- for (i = 0 ; i < num_mcv ; i ++ )
2221+
2222+ if (num_mcv > 0 )
22292223 {
2230- if (track [i ].count < mincount )
2231- {
2232- num_mcv = i ;
2233- break ;
2234- }
2224+ mcv_counts = (int * ) palloc (num_mcv * sizeof (int ));
2225+ for (i = 0 ; i < num_mcv ; i ++ )
2226+ mcv_counts [i ] = track [i ].count ;
2227+
2228+ num_mcv = analyze_mcv_list (mcv_counts , num_mcv ,
2229+ stats -> stadistinct ,
2230+ stats -> stanullfrac ,
2231+ samplerows , totalrows );
22352232 }
22362233 }
22372234
@@ -2561,14 +2558,7 @@ compute_scalar_stats(VacAttrStatsP stats,
25612558 * we are able to generate a complete MCV list (all the values in the
25622559 * sample will fit, and we think these are all the ones in the table),
25632560 * then do so. Otherwise, store only those values that are
2564- * significantly more common than the (estimated) average. We set the
2565- * threshold rather arbitrarily at 25% more than average, with at
2566- * least 2 instances in the sample. Also, we won't suppress values
2567- * that have a frequency of at least 1/K where K is the intended
2568- * number of histogram bins; such values might otherwise cause us to
2569- * emit duplicate histogram bin boundaries. (We might end up with
2570- * duplicate histogram entries anyway, if the distribution is skewed;
2571- * but we prefer to treat such values as MCVs if at all possible.)
2561+ * significantly more common than the values not in the list.
25722562 *
25732563 * Note: the first of these cases is meant to address columns with
25742564 * small, fixed sets of possible values, such as boolean or enum
@@ -2577,8 +2567,7 @@ compute_scalar_stats(VacAttrStatsP stats,
25772567 * so and thus provide the planner with complete information. But if
25782568 * the MCV list is not complete, it's generally worth being more
25792569 * selective, and not just filling it all the way up to the stats
2580- * target. So for an incomplete list, we try to take only MCVs that
2581- * are significantly more common than average.
2570+ * target.
25822571 */
25832572 if (track_cnt == ndistinct && toowide_cnt == 0 &&
25842573 stats -> stadistinct > 0 &&
@@ -2589,33 +2578,22 @@ compute_scalar_stats(VacAttrStatsP stats,
25892578 }
25902579 else
25912580 {
2592- double ndistinct_table = stats -> stadistinct ;
2593- double avgcount ,
2594- mincount ,
2595- maxmincount ;
2596-
2597- /* Re-extract estimate of # distinct nonnull values in table */
2598- if (ndistinct_table < 0 )
2599- ndistinct_table = - ndistinct_table * totalrows ;
2600- /* estimate # occurrences in sample of a typical nonnull value */
2601- avgcount = (double ) nonnull_cnt / ndistinct_table ;
2602- /* set minimum threshold count to store a value */
2603- mincount = avgcount * 1.25 ;
2604- if (mincount < 2 )
2605- mincount = 2 ;
2606- /* don't let threshold exceed 1/K, however */
2607- maxmincount = (double ) values_cnt / (double ) num_bins ;
2608- if (mincount > maxmincount )
2609- mincount = maxmincount ;
2581+ int * mcv_counts ;
2582+
2583+ /* Incomplete list; decide how many values are worth keeping */
26102584 if (num_mcv > track_cnt )
26112585 num_mcv = track_cnt ;
2612- for (i = 0 ; i < num_mcv ; i ++ )
2586+
2587+ if (num_mcv > 0 )
26132588 {
2614- if (track [i ].count < mincount )
2615- {
2616- num_mcv = i ;
2617- break ;
2618- }
2589+ mcv_counts = (int * ) palloc (num_mcv * sizeof (int ));
2590+ for (i = 0 ; i < num_mcv ; i ++ )
2591+ mcv_counts [i ] = track [i ].count ;
2592+
2593+ num_mcv = analyze_mcv_list (mcv_counts , num_mcv ,
2594+ stats -> stadistinct ,
2595+ stats -> stanullfrac ,
2596+ samplerows , totalrows );
26192597 }
26202598 }
26212599
@@ -2881,3 +2859,125 @@ compare_mcvs(const void *a, const void *b)
28812859
28822860 return da - db ;
28832861}
2862+
2863+ /*
2864+ * Analyze the list of common values in the sample and decide how many are
2865+ * worth storing in the table's MCV list.
2866+ *
2867+ * mcv_counts is assumed to be a list of the counts of the most common values
2868+ * seen in the sample, starting with the most common. The return value is the
2869+ * number that are significantly more common than the values not in the list,
2870+ * and which are therefore deemed worth storing in the table's MCV list.
2871+ */
2872+ static int
2873+ analyze_mcv_list (int * mcv_counts ,
2874+ int num_mcv ,
2875+ double stadistinct ,
2876+ double stanullfrac ,
2877+ int samplerows ,
2878+ double totalrows )
2879+ {
2880+ double ndistinct_table ;
2881+ double sumcount ;
2882+ int i ;
2883+
2884+ /*
2885+ * If the entire table was sampled, keep the whole list. This also
2886+ * protects us against division by zero in the code below.
2887+ */
2888+ if (samplerows == totalrows || totalrows <= 1.0 )
2889+ return num_mcv ;
2890+
2891+ /* Re-extract the estimated number of distinct nonnull values in table */
2892+ ndistinct_table = stadistinct ;
2893+ if (ndistinct_table < 0 )
2894+ ndistinct_table = - ndistinct_table * totalrows ;
2895+
2896+ /*
2897+ * Exclude the least common values from the MCV list, if they are not
2898+ * significantly more common than the estimated selectivity they would
2899+ * have if they weren't in the list. All non-MCV values are assumed to be
2900+ * equally common, after taking into account the frequencies of all the
2901+ * the values in the MCV list and the number of nulls (c.f. eqsel()).
2902+ *
2903+ * Here sumcount tracks the total count of all but the last (least common)
2904+ * value in the MCV list, allowing us to determine the effect of excluding
2905+ * that value from the list.
2906+ *
2907+ * Note that we deliberately do this by removing values from the full
2908+ * list, rather than starting with an empty list and adding values,
2909+ * because the latter approach can fail to add any values if all the most
2910+ * common values have around the same frequency and make up the majority
2911+ * of the table, so that the overall average frequency of all values is
2912+ * roughly the same as that of the common values. This would lead to any
2913+ * uncommon values being significantly overestimated.
2914+ */
2915+ sumcount = 0.0 ;
2916+ for (i = 0 ; i < num_mcv - 1 ; i ++ )
2917+ sumcount += mcv_counts [i ];
2918+
2919+ while (num_mcv > 0 )
2920+ {
2921+ double selec ,
2922+ otherdistinct ,
2923+ N ,
2924+ n ,
2925+ K ,
2926+ variance ,
2927+ stddev ;
2928+
2929+ /*
2930+ * Estimated selectivity the least common value would have if it
2931+ * wasn't in the MCV list (c.f. eqsel()).
2932+ */
2933+ selec = 1.0 - sumcount / samplerows - stanullfrac ;
2934+ if (selec < 0.0 )
2935+ selec = 0.0 ;
2936+ if (selec > 1.0 )
2937+ selec = 1.0 ;
2938+ otherdistinct = ndistinct_table - (num_mcv - 1 );
2939+ if (otherdistinct > 1 )
2940+ selec /= otherdistinct ;
2941+
2942+ /*
2943+ * If the value is kept in the MCV list, its population frequency is
2944+ * assumed to equal its sample frequency. We use the lower end of a
2945+ * textbook continuity-corrected Wald-type confidence interval to
2946+ * determine if that is significantly more common than the non-MCV
2947+ * frequency --- specifically we assume the population frequency is
2948+ * highly likely to be within around 2 standard errors of the sample
2949+ * frequency, which equates to an interval of 2 standard deviations
2950+ * either side of the sample count, plus an additional 0.5 for the
2951+ * continuity correction. Since we are sampling without replacement,
2952+ * this is a hypergeometric distribution.
2953+ *
2954+ * XXX: Empirically, this approach seems to work quite well, but it
2955+ * may be worth considering more advanced techniques for estimating
2956+ * the confidence interval of the hypergeometric distribution.
2957+ */
2958+ N = totalrows ;
2959+ n = samplerows ;
2960+ K = N * mcv_counts [num_mcv - 1 ] / n ;
2961+ variance = n * K * (N - K ) * (N - n ) / (N * N * (N - 1 ));
2962+ stddev = sqrt (variance );
2963+
2964+ if (mcv_counts [num_mcv - 1 ] > selec * samplerows + 2 * stddev + 0.5 )
2965+ {
2966+ /*
2967+ * The value is significantly more common than the non-MCV
2968+ * selectivity would suggest. Keep it, and all the other more
2969+ * common values in the list.
2970+ */
2971+ break ;
2972+ }
2973+ else
2974+ {
2975+ /* Discard this value and consider the next least common value */
2976+ num_mcv -- ;
2977+ if (num_mcv == 0 )
2978+ break ;
2979+ sumcount -= mcv_counts [num_mcv - 1 ];
2980+ }
2981+ }
2982+ return num_mcv ;
2983+ }
0 commit comments