Improve ANALYZE's strategy for finding MCVs.

deanrasheed · deanrasheed · commit b5db1d93d2a6 · 2018-03-22T09:37:36.000Z
Previously, a value was included in the MCV list if its frequency was 25% larger than the estimated average frequency of all nonnull values in the table. For uniform distributions, that can lead to values being included in the MCV list and significantly overestimated on the basis of relatively few (sometimes just 2) instances being seen in the sample. For non-uniform distributions, it can lead to too few values being included in the MCV list, since the overall average frequency may be dominated by a small number of very common values, while the remaining values may still have a large spread of frequencies, causing both substantial overestimation and underestimation of the remaining values. Furthermore, increasing the statistics target may have little effect because the overall average frequency will remain relatively unchanged. Instead, populate the MCV list with the largest set of common values that are statistically significantly more common than the average frequency of the remaining values. This takes into account the variance of the sample counts, which depends on the counts themselves and on the proportion of the table that was sampled. As a result, it constrains the relative standard error of estimates based on the frequencies of values in the list, reducing the chances of too many values being included. At the same time, it allows more values to be included, since the MCVs need only be more common than the remaining non-MCVs, rather than the overall average. Thus it tends to produce fewer MCVs than the previous code for uniform distributions, and more for non-uniform distributions, reducing estimation errors in both cases. In addition, the algorithm responds better to increasing the statistics target, allowing more values to be included in the MCV list when more of the table is sampled. Jeff Janes, substantially modified by me. Reviewed by John Naylor and Tomas Vondra. Discussion: https://postgr.es/m/CAMkU=1yvdGvW9TmiLAhz2erFnvnPFYHbOZuO+a=4DVkzpuQ2tw@mail.gmail.com
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
@@ -1769,6 +1769,12 @@ static void compute_scalar_stats(VacAttrStatsP stats,
 					 double totalrows);
 static int	compare_scalars(const void *a, const void *b, void *arg);
 static int	compare_mcvs(const void *a, const void *b);
+static int analyze_mcv_list(int *mcv_counts,
+				 int num_mcv,
+				 double stadistinct,
+				 double stanullfrac,
+				 int samplerows,
+				 double totalrows);
 
 
 /*
@@ -2187,9 +2193,7 @@ compute_distinct_stats(VacAttrStatsP stats,
 		 * we are able to generate a complete MCV list (all the values in the
 		 * sample will fit, and we think these are all the ones in the table),
 		 * then do so.  Otherwise, store only those values that are
-		 * significantly more common than the (estimated) average. We set the
-		 * threshold rather arbitrarily at 25% more than average, with at
-		 * least 2 instances in the sample.
+		 * significantly more common than the values not in the list.
 		 *
 		 * Note: the first of these cases is meant to address columns with
 		 * small, fixed sets of possible values, such as boolean or enum
@@ -2198,8 +2202,7 @@ compute_distinct_stats(VacAttrStatsP stats,
 		 * so and thus provide the planner with complete information.  But if
 		 * the MCV list is not complete, it's generally worth being more
 		 * selective, and not just filling it all the way up to the stats
-		 * target.  So for an incomplete list, we try to take only MCVs that
-		 * are significantly more common than average.
+		 * target.
 		 */
 		if (track_cnt < track_max && toowide_cnt == 0 &&
 			stats->stadistinct > 0 &&
@@ -2210,28 +2213,22 @@ compute_distinct_stats(VacAttrStatsP stats,
 		}
 		else
 		{
-			double		ndistinct_table = stats->stadistinct;
-			double		avgcount,
-						mincount;
-
-			/* Re-extract estimate of # distinct nonnull values in table */
-			if (ndistinct_table < 0)
-				ndistinct_table = -ndistinct_table * totalrows;
-			/* estimate # occurrences in sample of a typical nonnull value */
-			avgcount = (double) nonnull_cnt / ndistinct_table;
-			/* set minimum threshold count to store a value */
-			mincount = avgcount * 1.25;
-			if (mincount < 2)
-				mincount = 2;
+			int		   *mcv_counts;
+
+			/* Incomplete list; decide how many values are worth keeping */
 			if (num_mcv > track_cnt)
 				num_mcv = track_cnt;
-			for (i = 0; i < num_mcv; i++)
+
+			if (num_mcv > 0)
 			{
-				if (track[i].count < mincount)
-				{
-					num_mcv = i;
-					break;
-				}
+				mcv_counts = (int *) palloc(num_mcv * sizeof(int));
+				for (i = 0; i < num_mcv; i++)
+					mcv_counts[i] = track[i].count;
+
+				num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
+										   stats->stadistinct,
+										   stats->stanullfrac,
+										   samplerows, totalrows);
 			}
 		}
 
@@ -2561,14 +2558,7 @@ compute_scalar_stats(VacAttrStatsP stats,
 		 * we are able to generate a complete MCV list (all the values in the
 		 * sample will fit, and we think these are all the ones in the table),
 		 * then do so.  Otherwise, store only those values that are
-		 * significantly more common than the (estimated) average. We set the
-		 * threshold rather arbitrarily at 25% more than average, with at
-		 * least 2 instances in the sample.  Also, we won't suppress values
-		 * that have a frequency of at least 1/K where K is the intended
-		 * number of histogram bins; such values might otherwise cause us to
-		 * emit duplicate histogram bin boundaries.  (We might end up with
-		 * duplicate histogram entries anyway, if the distribution is skewed;
-		 * but we prefer to treat such values as MCVs if at all possible.)
+		 * significantly more common than the values not in the list.
 		 *
 		 * Note: the first of these cases is meant to address columns with
 		 * small, fixed sets of possible values, such as boolean or enum
@@ -2577,8 +2567,7 @@ compute_scalar_stats(VacAttrStatsP stats,
 		 * so and thus provide the planner with complete information.  But if
 		 * the MCV list is not complete, it's generally worth being more
 		 * selective, and not just filling it all the way up to the stats
-		 * target.  So for an incomplete list, we try to take only MCVs that
-		 * are significantly more common than average.
+		 * target.
 		 */
 		if (track_cnt == ndistinct && toowide_cnt == 0 &&
 			stats->stadistinct > 0 &&
@@ -2589,33 +2578,22 @@ compute_scalar_stats(VacAttrStatsP stats,
 		}
 		else
 		{
-			double		ndistinct_table = stats->stadistinct;
-			double		avgcount,
-						mincount,
-						maxmincount;
-
-			/* Re-extract estimate of # distinct nonnull values in table */
-			if (ndistinct_table < 0)
-				ndistinct_table = -ndistinct_table * totalrows;
-			/* estimate # occurrences in sample of a typical nonnull value */
-			avgcount = (double) nonnull_cnt / ndistinct_table;
-			/* set minimum threshold count to store a value */
-			mincount = avgcount * 1.25;
-			if (mincount < 2)
-				mincount = 2;
-			/* don't let threshold exceed 1/K, however */
-			maxmincount = (double) values_cnt / (double) num_bins;
-			if (mincount > maxmincount)
-				mincount = maxmincount;
+			int		   *mcv_counts;
+
+			/* Incomplete list; decide how many values are worth keeping */
 			if (num_mcv > track_cnt)
 				num_mcv = track_cnt;
-			for (i = 0; i < num_mcv; i++)
+
+			if (num_mcv > 0)
 			{
-				if (track[i].count < mincount)
-				{
-					num_mcv = i;
-					break;
-				}
+				mcv_counts = (int *) palloc(num_mcv * sizeof(int));
+				for (i = 0; i < num_mcv; i++)
+					mcv_counts[i] = track[i].count;
+
+				num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
+										   stats->stadistinct,
+										   stats->stanullfrac,
+										   samplerows, totalrows);
 			}
 		}
 
@@ -2881,3 +2859,125 @@ compare_mcvs(const void *a, const void *b)
 
 	return da - db;
 }
+
+/*
+ * Analyze the list of common values in the sample and decide how many are
+ * worth storing in the table's MCV list.
+ *
+ * mcv_counts is assumed to be a list of the counts of the most common values
+ * seen in the sample, starting with the most common.  The return value is the
+ * number that are significantly more common than the values not in the list,
+ * and which are therefore deemed worth storing in the table's MCV list.
+ */
+static int
+analyze_mcv_list(int *mcv_counts,
+				 int num_mcv,
+				 double stadistinct,
+				 double stanullfrac,
+				 int samplerows,
+				 double totalrows)
+{
+	double		ndistinct_table;
+	double		sumcount;
+	int			i;
+
+	/*
+	 * If the entire table was sampled, keep the whole list.  This also
+	 * protects us against division by zero in the code below.
+	 */
+	if (samplerows == totalrows || totalrows <= 1.0)
+		return num_mcv;
+
+	/* Re-extract the estimated number of distinct nonnull values in table */
+	ndistinct_table = stadistinct;
+	if (ndistinct_table < 0)
+		ndistinct_table = -ndistinct_table * totalrows;
+
+	/*
+	 * Exclude the least common values from the MCV list, if they are not
+	 * significantly more common than the estimated selectivity they would
+	 * have if they weren't in the list.  All non-MCV values are assumed to be
+	 * equally common, after taking into account the frequencies of all the
+	 * the values in the MCV list and the number of nulls (c.f. eqsel()).
+	 *
+	 * Here sumcount tracks the total count of all but the last (least common)
+	 * value in the MCV list, allowing us to determine the effect of excluding
+	 * that value from the list.
+	 *
+	 * Note that we deliberately do this by removing values from the full
+	 * list, rather than starting with an empty list and adding values,
+	 * because the latter approach can fail to add any values if all the most
+	 * common values have around the same frequency and make up the majority
+	 * of the table, so that the overall average frequency of all values is
+	 * roughly the same as that of the common values.  This would lead to any
+	 * uncommon values being significantly overestimated.
+	 */
+	sumcount = 0.0;
+	for (i = 0; i < num_mcv - 1; i++)
+		sumcount += mcv_counts[i];
+
+	while (num_mcv > 0)
+	{
+		double		selec,
+					otherdistinct,
+					N,
+					n,
+					K,
+					variance,
+					stddev;
+
+		/*
+		 * Estimated selectivity the least common value would have if it
+		 * wasn't in the MCV list (c.f. eqsel()).
+		 */
+		selec = 1.0 - sumcount / samplerows - stanullfrac;
+		if (selec < 0.0)
+			selec = 0.0;
+		if (selec > 1.0)
+			selec = 1.0;
+		otherdistinct = ndistinct_table - (num_mcv - 1);
+		if (otherdistinct > 1)
+			selec /= otherdistinct;
+
+		/*
+		 * If the value is kept in the MCV list, its population frequency is
+		 * assumed to equal its sample frequency.  We use the lower end of a
+		 * textbook continuity-corrected Wald-type confidence interval to
+		 * determine if that is significantly more common than the non-MCV
+		 * frequency --- specifically we assume the population frequency is
+		 * highly likely to be within around 2 standard errors of the sample
+		 * frequency, which equates to an interval of 2 standard deviations
+		 * either side of the sample count, plus an additional 0.5 for the
+		 * continuity correction.  Since we are sampling without replacement,
+		 * this is a hypergeometric distribution.
+		 *
+		 * XXX: Empirically, this approach seems to work quite well, but it
+		 * may be worth considering more advanced techniques for estimating
+		 * the confidence interval of the hypergeometric distribution.
+		 */
+		N = totalrows;
+		n = samplerows;
+		K = N * mcv_counts[num_mcv - 1] / n;
+		variance = n * K * (N - K) * (N - n) / (N * N * (N - 1));
+		stddev = sqrt(variance);
+
+		if (mcv_counts[num_mcv - 1] > selec * samplerows + 2 * stddev + 0.5)
+		{
+			/*
+			 * The value is significantly more common than the non-MCV
+			 * selectivity would suggest.  Keep it, and all the other more
+			 * common values in the list.
+			 */
+			break;
+		}
+		else
+		{
+			/* Discard this value and consider the next least common value */
+			num_mcv--;
+			if (num_mcv == 0)
+				break;
+			sumcount -= mcv_counts[num_mcv - 1];
+		}
+	}
+	return num_mcv;
+}