From: Tom Lane Date: Wed, 19 Nov 2025 18:22:12 +0000 (-0500) Subject: Speed up eqjoinsel() with lots of MCV entries. X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=057012b205a082ec930cd7bb7f6663516be011e2;p=postgresql.git Speed up eqjoinsel() with lots of MCV entries. If both sides of the operator have most-common-value statistics, eqjoinsel wants to check which MCVs have matches on the other side. Formerly it did this with a dumb compare-all-the-entries loop, which had O(N^2) behavior for long MCV lists. When that code was written, twenty-plus years ago, that seemed tolerable; but nowadays people frequently use much larger statistics targets, so that the O(N^2) behavior can hurt quite a bit. To add insult to injury, when asked for semijoin semantics, the entire comparison loop was done over, even though we frequently know that it will yield exactly the same results. To improve matters, switch to using a hash table to perform the matching. Testing suggests that depending on the data type, we may need up to about 100 MCVs on each side to amortize the extra costs of setting up the hash table and performing hash-value computations; so continue to use the old looping method when there are fewer MCVs than that. Also, refactor so that we don't repeat the matching work unless we really need to, which occurs only in the uncommon case where eqjoinsel_semi decides to truncate the set of inner MCVs it considers. The refactoring also got rid of the need to use the presented operator's commutator. Real-world operators that are using eqjoinsel should pretty much always have commutators, but at the very least this saves a few syscache lookups. Author: Ilia Evdokimov Co-authored-by: David Geier Reviewed-by: Tom Lane Discussion: https://postgr.es/m/20ea8bf5-3569-4e46-92ef-ebb2666debf6@tantorlabs.com --- diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index cb23ad52782..540aa9628d7 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -143,26 +143,78 @@ #define DEFAULT_PAGE_CPU_MULTIPLIER 50.0 +/* + * In production builds, switch to hash-based MCV matching when the lists are + * large enough to amortize hash setup cost. (This threshold is compared to + * the sum of the lengths of the two MCV lists. This is simplistic but seems + * to work well enough.) In debug builds, we use a smaller threshold so that + * the regression tests cover both paths well. + */ +#ifndef USE_ASSERT_CHECKING +#define EQJOINSEL_MCV_HASH_THRESHOLD 200 +#else +#define EQJOINSEL_MCV_HASH_THRESHOLD 20 +#endif + +/* Entries in the simplehash hash table used by eqjoinsel_find_matches */ +typedef struct MCVHashEntry +{ + Datum value; /* the value represented by this entry */ + int index; /* its index in the relevant AttStatsSlot */ + uint32 hash; /* hash code for the Datum */ + char status; /* status code used by simplehash.h */ +} MCVHashEntry; + +/* private_data for the simplehash hash table */ +typedef struct MCVHashContext +{ + FunctionCallInfo equal_fcinfo; /* the equality join operator */ + FunctionCallInfo hash_fcinfo; /* the hash function to use */ + bool op_is_reversed; /* equality compares hash type to probe type */ + bool insert_mode; /* doing inserts or lookups? */ + bool hash_typbyval; /* typbyval of hashed data type */ + int16 hash_typlen; /* typlen of hashed data type */ +} MCVHashContext; + +/* forward reference */ +typedef struct MCVHashTable_hash MCVHashTable_hash; + /* Hooks for plugins to get control when we ask for stats */ get_relation_stats_hook_type get_relation_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL; static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); -static double eqjoinsel_inner(Oid opfuncoid, Oid collation, +static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, - bool have_mcvs1, bool have_mcvs2); -static double eqjoinsel_semi(Oid opfuncoid, Oid collation, + bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches); +static double eqjoinsel_semi(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, RelOptInfo *inner_rel); +static void eqjoinsel_find_matches(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + int nvalues1, int nvalues2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, double *p_matchprodfreq); +static uint32 hash_mcv(MCVHashTable_hash *tab, Datum key); +static bool mcvs_equal(MCVHashTable_hash *tab, Datum key0, Datum key1); static bool estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, List **varinfos, double *ndistinct); static bool convert_to_scalar(Datum value, Oid valuetypid, Oid collid, @@ -218,6 +270,20 @@ static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids); static double btcost_correlation(IndexOptInfo *index, VariableStatData *vardata); +/* Define support routines for MCV hash tables */ +#define SH_PREFIX MCVHashTable +#define SH_ELEMENT_TYPE MCVHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY value +#define SH_HASH_KEY(tab,key) hash_mcv(tab, key) +#define SH_EQUAL(tab,key0,key1) mcvs_equal(tab, key0, key1) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tab,ent) (ent)->hash +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + /* * eqsel - Selectivity of "=" for any data types. @@ -2304,12 +2370,18 @@ eqjoinsel(PG_FUNCTION_ARGS) bool isdefault1; bool isdefault2; Oid opfuncoid; + FmgrInfo eqproc; + Oid hashLeft = InvalidOid; + Oid hashRight = InvalidOid; AttStatsSlot sslot1; AttStatsSlot sslot2; Form_pg_statistic stats1 = NULL; Form_pg_statistic stats2 = NULL; bool have_mcvs1 = false; bool have_mcvs2 = false; + bool *hasmatch1 = NULL; + bool *hasmatch2 = NULL; + int nmatches = 0; bool get_mcv_stats; bool join_is_reversed; RelOptInfo *inner_rel; @@ -2360,14 +2432,34 @@ eqjoinsel(PG_FUNCTION_ARGS) ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); } + /* Prepare info usable by both eqjoinsel_inner and eqjoinsel_semi */ + if (have_mcvs1 && have_mcvs2) + { + fmgr_info(opfuncoid, &eqproc); + hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2.nvalues * sizeof(bool)); + + /* + * If the MCV lists are long enough to justify hashing, try to look up + * hash functions for the join operator. + */ + if ((sslot1.nvalues + sslot2.nvalues) >= EQJOINSEL_MCV_HASH_THRESHOLD) + (void) get_op_hash_functions(operator, &hashLeft, &hashRight); + } + else + memset(&eqproc, 0, sizeof(eqproc)); /* silence uninit-var warnings */ + /* We need to compute the inner-join selectivity in all cases */ - selec_inner = eqjoinsel_inner(opfuncoid, collation, + selec_inner = eqjoinsel_inner(&eqproc, collation, + hashLeft, hashRight, &vardata1, &vardata2, nd1, nd2, isdefault1, isdefault2, &sslot1, &sslot2, stats1, stats2, - have_mcvs1, have_mcvs2); + have_mcvs1, have_mcvs2, + hasmatch1, hasmatch2, + &nmatches); switch (sjinfo->jointype) { @@ -2388,28 +2480,31 @@ eqjoinsel(PG_FUNCTION_ARGS) inner_rel = find_join_input_rel(root, sjinfo->min_righthand); if (!join_is_reversed) - selec = eqjoinsel_semi(opfuncoid, collation, + selec = eqjoinsel_semi(&eqproc, collation, + hashLeft, hashRight, + false, &vardata1, &vardata2, nd1, nd2, isdefault1, isdefault2, &sslot1, &sslot2, stats1, stats2, have_mcvs1, have_mcvs2, + hasmatch1, hasmatch2, + &nmatches, inner_rel); else - { - Oid commop = get_commutator(operator); - Oid commopfuncoid = OidIsValid(commop) ? get_opcode(commop) : InvalidOid; - - selec = eqjoinsel_semi(commopfuncoid, collation, + selec = eqjoinsel_semi(&eqproc, collation, + hashLeft, hashRight, + true, &vardata2, &vardata1, nd2, nd1, isdefault2, isdefault1, &sslot2, &sslot1, stats2, stats1, have_mcvs2, have_mcvs1, + hasmatch2, hasmatch1, + &nmatches, inner_rel); - } /* * We should never estimate the output of a semijoin to be more @@ -2437,6 +2532,11 @@ eqjoinsel(PG_FUNCTION_ARGS) ReleaseVariableStats(vardata1); ReleaseVariableStats(vardata2); + if (hasmatch1) + pfree(hasmatch1); + if (hasmatch2) + pfree(hasmatch2); + CLAMP_PROBABILITY(selec); PG_RETURN_FLOAT8((float8) selec); @@ -2445,17 +2545,24 @@ eqjoinsel(PG_FUNCTION_ARGS) /* * eqjoinsel_inner --- eqjoinsel for normal inner join * + * In addition to computing the selectivity estimate, this will fill + * hasmatch1[], hasmatch2[], and *p_nmatches (if have_mcvs1 && have_mcvs2). + * We may be able to re-use that data in eqjoinsel_semi. + * * We also use this for LEFT/FULL outer joins; it's not presently clear * that it's worth trying to distinguish them here. */ static double -eqjoinsel_inner(Oid opfuncoid, Oid collation, +eqjoinsel_inner(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, - bool have_mcvs1, bool have_mcvs2) + bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches) { double selec; @@ -2473,10 +2580,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, * results", Technical Report 1018, Computer Science Dept., University * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu). */ - LOCAL_FCINFO(fcinfo, 2); - FmgrInfo eqproc; - bool *hasmatch1; - bool *hasmatch2; double nullfrac1 = stats1->stanullfrac; double nullfrac2 = stats2->stanullfrac; double matchprodfreq, @@ -2491,55 +2594,17 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, int i, nmatches; - fmgr_info(opfuncoid, &eqproc); - - /* - * Save a few cycles by setting up the fcinfo struct just once. Using - * FunctionCallInvoke directly also avoids failure if the eqproc - * returns NULL, though really equality functions should never do - * that. - */ - InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, - NULL, NULL); - fcinfo->args[0].isnull = false; - fcinfo->args[1].isnull = false; - - hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); - hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool)); - - /* - * Note we assume that each MCV will match at most one member of the - * other MCV list. If the operator isn't really equality, there could - * be multiple matches --- but we don't look for them, both for speed - * and because the math wouldn't add up... - */ - matchprodfreq = 0.0; - nmatches = 0; - for (i = 0; i < sslot1->nvalues; i++) - { - int j; - - fcinfo->args[0].value = sslot1->values[i]; - - for (j = 0; j < sslot2->nvalues; j++) - { - Datum fresult; - - if (hasmatch2[j]) - continue; - fcinfo->args[1].value = sslot2->values[j]; - fcinfo->isnull = false; - fresult = FunctionCallInvoke(fcinfo); - if (!fcinfo->isnull && DatumGetBool(fresult)) - { - hasmatch1[i] = hasmatch2[j] = true; - matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; - nmatches++; - break; - } - } - } + /* Fill the match arrays */ + eqjoinsel_find_matches(eqproc, collation, + hashLeft, hashRight, + false, + sslot1, sslot2, + sslot1->nvalues, sslot2->nvalues, + hasmatch1, hasmatch2, + p_nmatches, &matchprodfreq); + nmatches = *p_nmatches; CLAMP_PROBABILITY(matchprodfreq); + /* Sum up frequencies of matched and unmatched MCVs */ matchfreq1 = unmatchfreq1 = 0.0; for (i = 0; i < sslot1->nvalues; i++) @@ -2561,8 +2626,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, } CLAMP_PROBABILITY(matchfreq2); CLAMP_PROBABILITY(unmatchfreq2); - pfree(hasmatch1); - pfree(hasmatch2); /* * Compute total frequency of non-null values that are not in the MCV @@ -2642,17 +2705,24 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation, * eqjoinsel_semi --- eqjoinsel for semi join * * (Also used for anti join, which we are supposed to estimate the same way.) - * Caller has ensured that vardata1 is the LHS variable. - * Unlike eqjoinsel_inner, we have to cope with opfuncoid being InvalidOid. + * Caller has ensured that vardata1 is the LHS variable; however, eqproc + * is for the original join operator, which might now need to have the inputs + * swapped in order to apply correctly. Also, if have_mcvs1 && have_mcvs2 + * then hasmatch1[], hasmatch2[], and *p_nmatches were filled by + * eqjoinsel_inner. */ static double -eqjoinsel_semi(Oid opfuncoid, Oid collation, +eqjoinsel_semi(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, VariableStatData *vardata1, VariableStatData *vardata2, double nd1, double nd2, bool isdefault1, bool isdefault2, AttStatsSlot *sslot1, AttStatsSlot *sslot2, Form_pg_statistic stats1, Form_pg_statistic stats2, bool have_mcvs1, bool have_mcvs2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, RelOptInfo *inner_rel) { double selec; @@ -2690,7 +2760,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, isdefault2 = false; } - if (have_mcvs1 && have_mcvs2 && OidIsValid(opfuncoid)) + if (have_mcvs1 && have_mcvs2) { /* * We have most-common-value lists for both relations. Run through @@ -2700,12 +2770,9 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, * lists. We still have to estimate for the remaining population, but * in a skewed distribution this gives us a big leg up in accuracy. */ - LOCAL_FCINFO(fcinfo, 2); - FmgrInfo eqproc; - bool *hasmatch1; - bool *hasmatch2; double nullfrac1 = stats1->stanullfrac; - double matchfreq1, + double matchprodfreq, + matchfreq1, uncertainfrac, uncertain; int i, @@ -2721,52 +2788,32 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, */ clamped_nvalues2 = Min(sslot2->nvalues, nd2); - fmgr_info(opfuncoid, &eqproc); - - /* - * Save a few cycles by setting up the fcinfo struct just once. Using - * FunctionCallInvoke directly also avoids failure if the eqproc - * returns NULL, though really equality functions should never do - * that. - */ - InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, - NULL, NULL); - fcinfo->args[0].isnull = false; - fcinfo->args[1].isnull = false; - - hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); - hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool)); - /* - * Note we assume that each MCV will match at most one member of the - * other MCV list. If the operator isn't really equality, there could - * be multiple matches --- but we don't look for them, both for speed - * and because the math wouldn't add up... + * If we did not set clamped_nvalues2 to less than sslot2->nvalues, + * then the hasmatch1[] and hasmatch2[] match flags computed by + * eqjoinsel_inner are still perfectly applicable, so we need not + * re-do the matching work. Note that it does not matter if + * op_is_reversed: we'd get the same answers. + * + * If we did clamp, then a different set of sslot2 values is to be + * compared, so we have to re-do the matching. */ - nmatches = 0; - for (i = 0; i < sslot1->nvalues; i++) + if (clamped_nvalues2 != sslot2->nvalues) { - int j; - - fcinfo->args[0].value = sslot1->values[i]; - - for (j = 0; j < clamped_nvalues2; j++) - { - Datum fresult; - - if (hasmatch2[j]) - continue; - fcinfo->args[1].value = sslot2->values[j]; - fcinfo->isnull = false; - fresult = FunctionCallInvoke(fcinfo); - if (!fcinfo->isnull && DatumGetBool(fresult)) - { - hasmatch1[i] = hasmatch2[j] = true; - nmatches++; - break; - } - } + /* Must re-zero the arrays */ + memset(hasmatch1, 0, sslot1->nvalues * sizeof(bool)); + memset(hasmatch2, 0, clamped_nvalues2 * sizeof(bool)); + /* Re-fill the match arrays */ + eqjoinsel_find_matches(eqproc, collation, + hashLeft, hashRight, + op_is_reversed, + sslot1, sslot2, + sslot1->nvalues, clamped_nvalues2, + hasmatch1, hasmatch2, + p_nmatches, &matchprodfreq); } + nmatches = *p_nmatches; + /* Sum up frequencies of matched MCVs */ matchfreq1 = 0.0; for (i = 0; i < sslot1->nvalues; i++) @@ -2775,8 +2822,6 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, matchfreq1 += sslot1->numbers[i]; } CLAMP_PROBABILITY(matchfreq1); - pfree(hasmatch1); - pfree(hasmatch2); /* * Now we need to estimate the fraction of relation 1 that has at @@ -2830,6 +2875,273 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation, return selec; } +/* + * Identify matching MCVs for eqjoinsel_inner or eqjoinsel_semi. + * + * Inputs: + * eqproc: FmgrInfo for equality function to use (might be reversed) + * collation: OID of collation to use + * hashLeft, hashRight: OIDs of hash functions associated with equality op, + * or InvalidOid if we're not to use hashing + * op_is_reversed: indicates that eqproc compares right type to left type + * sslot1, sslot2: MCV values for the lefthand and righthand inputs + * nvalues1, nvalues2: number of values to be considered (can be less than + * sslotN->nvalues, but not more) + * Outputs: + * hasmatch1[], hasmatch2[]: pre-zeroed arrays of lengths nvalues1, nvalues2; + * entries are set to true if that MCV has a match on the other side + * *p_nmatches: receives number of MCV pairs that match + * *p_matchprodfreq: receives sum(sslot1->numbers[i] * sslot2->numbers[j]) + * for matching MCVs + * + * Note that hashLeft is for the eqproc's left-hand input type, hashRight + * for its right, regardless of op_is_reversed. + * + * Note we assume that each MCV will match at most one member of the other + * MCV list. If the operator isn't really equality, there could be multiple + * matches --- but we don't look for them, both for speed and because the + * math wouldn't add up... + */ +static void +eqjoinsel_find_matches(FmgrInfo *eqproc, Oid collation, + Oid hashLeft, Oid hashRight, + bool op_is_reversed, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + int nvalues1, int nvalues2, + bool *hasmatch1, bool *hasmatch2, + int *p_nmatches, double *p_matchprodfreq) +{ + LOCAL_FCINFO(fcinfo, 2); + double matchprodfreq = 0.0; + int nmatches = 0; + + /* + * Save a few cycles by setting up the fcinfo struct just once. Using + * FunctionCallInvoke directly also avoids failure if the eqproc returns + * NULL, though really equality functions should never do that. + */ + InitFunctionCallInfoData(*fcinfo, eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + if (OidIsValid(hashLeft) && OidIsValid(hashRight)) + { + /* Use a hash table to speed up the matching */ + LOCAL_FCINFO(hash_fcinfo, 1); + FmgrInfo hash_proc; + MCVHashContext hashContext; + MCVHashTable_hash *hashTable; + AttStatsSlot *statsProbe; + AttStatsSlot *statsHash; + bool *hasMatchProbe; + bool *hasMatchHash; + int nvaluesProbe; + int nvaluesHash; + + /* Make sure we build the hash table on the smaller array. */ + if (sslot1->nvalues >= sslot2->nvalues) + { + statsProbe = sslot1; + statsHash = sslot2; + hasMatchProbe = hasmatch1; + hasMatchHash = hasmatch2; + nvaluesProbe = nvalues1; + nvaluesHash = nvalues2; + } + else + { + /* We'll have to reverse the direction of use of the operator. */ + op_is_reversed = !op_is_reversed; + statsProbe = sslot2; + statsHash = sslot1; + hasMatchProbe = hasmatch2; + hasMatchHash = hasmatch1; + nvaluesProbe = nvalues2; + nvaluesHash = nvalues1; + } + + /* + * Build the hash table on the smaller array, using the appropriate + * hash function for its data type. + */ + fmgr_info(op_is_reversed ? hashLeft : hashRight, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.op_is_reversed = op_is_reversed; + hashContext.insert_mode = true; + get_typlenbyval(statsHash->valuetype, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + hashTable = MCVHashTable_create(CurrentMemoryContext, + nvaluesHash, + &hashContext); + + for (int i = 0; i < nvaluesHash; i++) + { + bool found = false; + MCVHashEntry *entry = MCVHashTable_insert(hashTable, + statsHash->values[i], + &found); + + /* + * MCVHashTable_insert will only report "found" if the new value + * is equal to some previous one per datum_image_eq(). That + * probably shouldn't happen, since we're not expecting duplicates + * in the MCV list. If we do find a dup, just ignore it, leaving + * the hash entry's index pointing at the first occurrence. That + * matches the behavior that the non-hashed code path would have. + */ + if (likely(!found)) + entry->index = i; + } + + /* + * Prepare to probe the hash table. If the probe values are of a + * different data type, then we need to change hash functions. (This + * code relies on the assumption that since we defined SH_STORE_HASH, + * simplehash.h will never need to compute hash values for existing + * hash table entries.) + */ + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(op_is_reversed ? hashRight : hashLeft, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* Look up each probe value in turn. */ + for (int i = 0; i < nvaluesProbe; i++) + { + MCVHashEntry *entry = MCVHashTable_lookup(hashTable, + statsProbe->values[i]); + + /* As in the other code path, skip already-matched hash entries */ + if (entry != NULL && !hasMatchHash[entry->index]) + { + hasMatchHash[entry->index] = hasMatchProbe[i] = true; + nmatches++; + matchprodfreq += statsHash->numbers[entry->index] * statsProbe->numbers[i]; + } + } + + MCVHashTable_destroy(hashTable); + } + else + { + /* We're not to use hashing, so do it the O(N^2) way */ + int index1, + index2; + + /* Set up to supply the values in the order the operator expects */ + if (op_is_reversed) + { + index1 = 1; + index2 = 0; + } + else + { + index1 = 0; + index2 = 1; + } + + for (int i = 0; i < nvalues1; i++) + { + fcinfo->args[index1].value = sslot1->values[i]; + + for (int j = 0; j < nvalues2; j++) + { + Datum fresult; + + if (hasmatch2[j]) + continue; + fcinfo->args[index2].value = sslot2->values[j]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; + nmatches++; + break; + } + } + } + } + + *p_nmatches = nmatches; + *p_matchprodfreq = matchprodfreq; +} + +/* + * Support functions for the hash tables used by eqjoinsel_find_matches + */ +static uint32 +hash_mcv(MCVHashTable_hash *tab, Datum key) +{ + MCVHashContext *context = (MCVHashContext *) tab->private_data; + FunctionCallInfo fcinfo = context->hash_fcinfo; + Datum fresult; + + fcinfo->args[0].value = key; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + Assert(!fcinfo->isnull); + return DatumGetUInt32(fresult); +} + +static bool +mcvs_equal(MCVHashTable_hash *tab, Datum key0, Datum key1) +{ + MCVHashContext *context = (MCVHashContext *) tab->private_data; + + if (context->insert_mode) + { + /* + * During the insertion step, any comparisons will be between two + * Datums of the hash table's data type, so if the given operator is + * cross-type it will be the wrong thing to use. Fortunately, we can + * use datum_image_eq instead. The MCV values should all be distinct + * anyway, so it's mostly pro-forma to compare them at all. + */ + return datum_image_eq(key0, key1, + context->hash_typbyval, context->hash_typlen); + } + else + { + FunctionCallInfo fcinfo = context->equal_fcinfo; + Datum fresult; + + /* + * Apply the operator the correct way around. Although simplehash.h + * doesn't document this explicitly, during lookups key0 is from the + * hash table while key1 is the probe value, so we should compare them + * in that order only if op_is_reversed. + */ + if (context->op_is_reversed) + { + fcinfo->args[0].value = key0; + fcinfo->args[1].value = key1; + } + else + { + fcinfo->args[0].value = key1; + fcinfo->args[1].value = key0; + } + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + return (!fcinfo->isnull && DatumGetBool(fresult)); + } +} + /* * neqjoinsel - Join selectivity of "!=" */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 23bce72ae64..57f2a9ccdc5 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1649,6 +1649,9 @@ LtreeGistOptions LtreeSignature MAGIC MBuf +MCVHashContext +MCVHashEntry +MCVHashTable_hash MCVItem MCVList MEMORY_BASIC_INFORMATION