@@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats,
20552055 /*
20562056 * Our track list includes every value in the sample, and every
20572057 * value appeared more than once. Assume the column has just
2058- * these values.
2058+ * these values. (This case is meant to address columns with
2059+ * small, fixed sets of possible values, such as boolean or enum
2060+ * columns. If there are any values that appear just once in the
2061+ * sample, including too-wide values, we should assume that this
2062+ * is not the kind of column we're dealing with.)
20592063 */
20602064 stats -> stadistinct = track_cnt ;
20612065 }
@@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats,
21232127 * significantly more common than the (estimated) average. We set the
21242128 * threshold rather arbitrarily at 25% more than average, with at
21252129 * least 2 instances in the sample.
2130+ *
2131+ * Note: the first of these cases is meant to address columns with
2132+ * small, fixed sets of possible values, such as boolean or enum
2133+ * columns. If we can *completely* represent the column population by
2134+ * an MCV list that will fit into the stats target, then we should do
2135+ * so and thus provide the planner with complete information. But if
2136+ * the MCV list is not complete, it's generally worth being more
2137+ * selective, and not just filling it all the way up to the stats
2138+ * target. So for an incomplete list, we try to take only MCVs that
2139+ * are significantly more common than average.
21262140 */
21272141 if (track_cnt < track_max && toowide_cnt == 0 &&
21282142 stats -> stadistinct > 0 &&
@@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats,
24162430 {
24172431 /*
24182432 * Every value in the sample appeared more than once. Assume the
2419- * column has just these values.
2433+ * column has just these values. (This case is meant to address
2434+ * columns with small, fixed sets of possible values, such as
2435+ * boolean or enum columns. If there are any values that appear
2436+ * just once in the sample, including too-wide values, we should
2437+ * assume that this is not the kind of column we're dealing with.)
24202438 */
24212439 stats -> stadistinct = ndistinct ;
24222440 }
@@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats,
24852503 * emit duplicate histogram bin boundaries. (We might end up with
24862504 * duplicate histogram entries anyway, if the distribution is skewed;
24872505 * but we prefer to treat such values as MCVs if at all possible.)
2506+ *
2507+ * Note: the first of these cases is meant to address columns with
2508+ * small, fixed sets of possible values, such as boolean or enum
2509+ * columns. If we can *completely* represent the column population by
2510+ * an MCV list that will fit into the stats target, then we should do
2511+ * so and thus provide the planner with complete information. But if
2512+ * the MCV list is not complete, it's generally worth being more
2513+ * selective, and not just filling it all the way up to the stats
2514+ * target. So for an incomplete list, we try to take only MCVs that
2515+ * are significantly more common than average.
24882516 */
24892517 if (track_cnt == ndistinct && toowide_cnt == 0 &&
24902518 stats -> stadistinct > 0 &&
0 commit comments