@@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
7878static SortItem * build_distinct_groups (int numrows , SortItem * items ,
7979 MultiSortSupport mss , int * ndistinct );
8080
81+ static SortItem * * build_column_frequencies (SortItem * groups , int ngroups ,
82+ MultiSortSupport mss , int * ncounts );
83+
8184static int count_distinct_groups (int numrows , SortItem * items ,
8285 MultiSortSupport mss );
8386
@@ -242,6 +245,20 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
242245 if (nitems > 0 )
243246 {
244247 int j ;
248+ SortItem key ;
249+ MultiSortSupport tmp ;
250+
251+ /* frequencies for values in each attribute */
252+ SortItem * * freqs ;
253+ int * nfreqs ;
254+
255+ /* used to search values */
256+ tmp = (MultiSortSupport ) palloc (offsetof(MultiSortSupportData , ssup )
257+ + sizeof (SortSupportData ));
258+
259+ /* compute frequencies for values in each column */
260+ nfreqs = (int * ) palloc0 (sizeof (int ) * numattrs );
261+ freqs = build_column_frequencies (groups , ngroups , mss , nfreqs );
245262
246263 /*
247264 * Allocate the MCV list structure, set the global parameters.
@@ -281,18 +298,26 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
281298 item -> base_frequency = 1.0 ;
282299 for (j = 0 ; j < numattrs ; j ++ )
283300 {
284- int count = 0 ;
285- int k ;
301+ SortItem * freq ;
286302
287- for (k = 0 ; k < ngroups ; k ++ )
288- {
289- if (multi_sort_compare_dim (j , & groups [i ], & groups [k ], mss ) == 0 )
290- count += groups [k ].count ;
291- }
303+ /* single dimension */
304+ tmp -> ndims = 1 ;
305+ tmp -> ssup [0 ] = mss -> ssup [j ];
292306
293- item -> base_frequency *= (double ) count / numrows ;
307+ /* fill search key */
308+ key .values = & groups [i ].values [j ];
309+ key .isnull = & groups [i ].isnull [j ];
310+
311+ freq = (SortItem * ) bsearch_arg (& key , freqs [j ], nfreqs [j ],
312+ sizeof (SortItem ),
313+ multi_sort_compare , tmp );
314+
315+ item -> base_frequency *= ((double ) freq -> count ) / numrows ;
294316 }
295317 }
318+
319+ pfree (nfreqs );
320+ pfree (freqs );
296321 }
297322
298323 pfree (items );
@@ -419,6 +444,95 @@ build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
419444 return groups ;
420445}
421446
447+ /* compare sort items (single dimension) */
448+ static int
449+ sort_item_compare (const void * a , const void * b , void * arg )
450+ {
451+ SortSupport ssup = (SortSupport ) arg ;
452+ SortItem * ia = (SortItem * ) a ;
453+ SortItem * ib = (SortItem * ) b ;
454+
455+ return ApplySortComparator (ia -> values [0 ], ia -> isnull [0 ],
456+ ib -> values [0 ], ib -> isnull [0 ],
457+ ssup );
458+ }
459+
460+ /*
461+ * build_column_frequencies
462+ * compute frequencies of values in each column
463+ *
464+ * This returns an array of SortItems for each attibute the MCV is built
465+ * on, with a frequency (number of occurrences) for each value. This is
466+ * then used to compute "base" frequency of MCV items.
467+ *
468+ * All the memory is allocated in a single chunk, so that a single pfree
469+ * is enough to release it. We do not allocate space for values/isnull
470+ * arrays in the SortItems, because we can simply point into the input
471+ * groups directly.
472+ */
473+ static SortItem * *
474+ build_column_frequencies (SortItem * groups , int ngroups ,
475+ MultiSortSupport mss , int * ncounts )
476+ {
477+ int i ,
478+ dim ;
479+ SortItem * * result ;
480+ char * ptr ;
481+
482+ Assert (groups );
483+ Assert (ncounts );
484+
485+ /* allocate arrays for all columns as a single chunk */
486+ ptr = palloc (MAXALIGN (sizeof (SortItem * ) * mss -> ndims ) +
487+ mss -> ndims * MAXALIGN (sizeof (SortItem ) * ngroups ));
488+
489+ /* initial array of pointers */
490+ result = (SortItem * * ) ptr ;
491+ ptr += MAXALIGN (sizeof (SortItem * ) * mss -> ndims );
492+
493+ for (dim = 0 ; dim < mss -> ndims ; dim ++ )
494+ {
495+ SortSupport ssup = & mss -> ssup [dim ];
496+
497+ /* array of values for a single column */
498+ result [dim ] = (SortItem * ) ptr ;
499+ ptr += MAXALIGN (sizeof (SortItem ) * ngroups );
500+
501+ /* extract data for the dimension */
502+ for (i = 0 ; i < ngroups ; i ++ )
503+ {
504+ /* point into the input groups */
505+ result [dim ][i ].values = & groups [i ].values [dim ];
506+ result [dim ][i ].isnull = & groups [i ].isnull [dim ];
507+ result [dim ][i ].count = groups [i ].count ;
508+ }
509+
510+ /* sort the values, deduplicate */
511+ qsort_arg ((void * ) result [dim ], ngroups , sizeof (SortItem ),
512+ sort_item_compare , ssup );
513+
514+ /*
515+ * Identify distinct values, compute frequency (there might be
516+ * multiple MCV items containing this value, so we need to sum
517+ * counts from all of them.
518+ */
519+ ncounts [dim ] = 1 ;
520+ for (i = 1 ; i < ngroups ; i ++ )
521+ {
522+ if (sort_item_compare (& result [dim ][i - 1 ], & result [dim ][i ], ssup ) == 0 )
523+ {
524+ result [dim ][ncounts [dim ]- 1 ].count += result [dim ][i ].count ;
525+ continue ;
526+ }
527+
528+ result [dim ][ncounts [dim ]] = result [dim ][i ];
529+
530+ ncounts [dim ]++ ;
531+ }
532+ }
533+
534+ return result ;
535+ }
422536
423537/*
424538 * statext_mcv_load
0 commit comments