1515 *
1616 *
1717 * IDENTIFICATION
18- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
18+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
1919 *
2020 *-------------------------------------------------------------------------
2121 */
@@ -940,9 +940,7 @@ Datum
940940eqjoinsel (PG_FUNCTION_ARGS )
941941{
942942 Query * root = (Query * ) PG_GETARG_POINTER (0 );
943- #ifdef NOT_USED /* see neqjoinsel() before removing me! */
944943 Oid operator = PG_GETARG_OID (1 );
945- #endif
946944 List * args = (List * ) PG_GETARG_POINTER (2 );
947945 Var * var1 ;
948946 Var * var2 ;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
958956 HeapTuple statsTuple2 = NULL ;
959957 Form_pg_statistic stats1 = NULL ;
960958 Form_pg_statistic stats2 = NULL ;
961- double nd1 ,
962- nd2 ;
963-
964- if (var1 == NULL )
965- {
966- nd1 = DEFAULT_NUM_DISTINCT ;
967- }
968- else
959+ double nd1 = DEFAULT_NUM_DISTINCT ;
960+ double nd2 = DEFAULT_NUM_DISTINCT ;
961+ bool have_mcvs1 = false;
962+ Datum * values1 = NULL ;
963+ int nvalues1 = 0 ;
964+ float4 * numbers1 = NULL ;
965+ int nnumbers1 = 0 ;
966+ bool have_mcvs2 = false;
967+ Datum * values2 = NULL ;
968+ int nvalues2 = 0 ;
969+ float4 * numbers2 = NULL ;
970+ int nnumbers2 = 0 ;
971+
972+ if (var1 != NULL )
969973 {
970974 /* get stats for the attribute, if available */
971975 Oid relid1 = getrelid (var1 -> varno , root -> rtable );
972976
973- if (relid1 == InvalidOid )
974- nd1 = DEFAULT_NUM_DISTINCT ;
975- else
977+ if (relid1 != InvalidOid )
976978 {
977979 statsTuple1 = SearchSysCache (STATRELATT ,
978980 ObjectIdGetDatum (relid1 ),
979981 Int16GetDatum (var1 -> varattno ),
980982 0 , 0 );
981983 if (HeapTupleIsValid (statsTuple1 ))
984+ {
982985 stats1 = (Form_pg_statistic ) GETSTRUCT (statsTuple1 );
986+ have_mcvs1 = get_attstatsslot (statsTuple1 ,
987+ var1 -> vartype ,
988+ var1 -> vartypmod ,
989+ STATISTIC_KIND_MCV ,
990+ InvalidOid ,
991+ & values1 , & nvalues1 ,
992+ & numbers1 , & nnumbers1 );
993+ }
983994
984995 nd1 = get_att_numdistinct (root , var1 , stats1 );
985996 }
986997 }
987998
988- if (var2 == NULL )
989- {
990- nd2 = DEFAULT_NUM_DISTINCT ;
991- }
992- else
999+ if (var2 != NULL )
9931000 {
9941001 /* get stats for the attribute, if available */
9951002 Oid relid2 = getrelid (var2 -> varno , root -> rtable );
9961003
997- if (relid2 == InvalidOid )
998- nd2 = DEFAULT_NUM_DISTINCT ;
999- else
1004+ if (relid2 != InvalidOid )
10001005 {
10011006 statsTuple2 = SearchSysCache (STATRELATT ,
10021007 ObjectIdGetDatum (relid2 ),
10031008 Int16GetDatum (var2 -> varattno ),
10041009 0 , 0 );
10051010 if (HeapTupleIsValid (statsTuple2 ))
1011+ {
10061012 stats2 = (Form_pg_statistic ) GETSTRUCT (statsTuple2 );
1013+ have_mcvs2 = get_attstatsslot (statsTuple2 ,
1014+ var2 -> vartype ,
1015+ var2 -> vartypmod ,
1016+ STATISTIC_KIND_MCV ,
1017+ InvalidOid ,
1018+ & values2 , & nvalues2 ,
1019+ & numbers2 , & nnumbers2 );
1020+ }
10071021
10081022 nd2 = get_att_numdistinct (root , var2 , stats2 );
10091023 }
10101024 }
10111025
1012- /*
1013- * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
1014- * (can we produce any theory for this)?
1015- *
1016- * XXX possibility to do better: if both attributes have histograms
1017- * then we could determine the exact join selectivity between the
1018- * MCV sets, and only have to assume the join behavior of the non-MCV
1019- * values. This could be a big win when the MCVs cover a large part
1020- * of the population.
1021- *
1022- * XXX what about nulls?
1023- */
1024- selec = 1.0 / sqrt (nd1 * nd2 );
1025- if (selec > 1.0 )
1026- selec = 1.0 ;
1026+ if (have_mcvs1 && have_mcvs2 )
1027+ {
1028+ /*
1029+ * We have most-common-value lists for both relations. Run
1030+ * through the lists to see which MCVs actually join to each
1031+ * other with the given operator. This allows us to determine
1032+ * the exact join selectivity for the portion of the relations
1033+ * represented by the MCV lists. We still have to estimate for
1034+ * the remaining population, but in a skewed distribution this
1035+ * gives us a big leg up in accuracy. For motivation see the
1036+ * analysis in Y. Ioannidis and S. Christodoulakis, "On the
1037+ * propagation of errors in the size of join results", Technical
1038+ * Report 1018, Computer Science Dept., University of Wisconsin,
1039+ * Madison, March 1991 (available from ftp.cs.wisc.edu).
1040+ */
1041+ FmgrInfo eqproc ;
1042+ bool * hasmatch1 ;
1043+ bool * hasmatch2 ;
1044+ double matchprodfreq ,
1045+ matchfreq1 ,
1046+ matchfreq2 ,
1047+ unmatchfreq1 ,
1048+ unmatchfreq2 ,
1049+ otherfreq1 ,
1050+ otherfreq2 ,
1051+ totalsel1 ,
1052+ totalsel2 ;
1053+ int i ,
1054+ nmatches ;
1055+
1056+ fmgr_info (get_opcode (operator ), & eqproc );
1057+ hasmatch1 = (bool * ) palloc (nvalues1 * sizeof (bool ));
1058+ memset (hasmatch1 , 0 , nvalues1 * sizeof (bool ));
1059+ hasmatch2 = (bool * ) palloc (nvalues2 * sizeof (bool ));
1060+ memset (hasmatch2 , 0 , nvalues2 * sizeof (bool ));
1061+ /*
1062+ * Note we assume that each MCV will match at most one member of
1063+ * the other MCV list. If the operator isn't really equality,
1064+ * there could be multiple matches --- but we don't look for them,
1065+ * both for speed and because the math wouldn't add up...
1066+ */
1067+ matchprodfreq = 0.0 ;
1068+ nmatches = 0 ;
1069+ for (i = 0 ; i < nvalues1 ; i ++ )
1070+ {
1071+ int j ;
10271072
1073+ for (j = 0 ; j < nvalues2 ; j ++ )
1074+ {
1075+ if (hasmatch2 [j ])
1076+ continue ;
1077+ if (DatumGetBool (FunctionCall2 (& eqproc ,
1078+ values1 [i ],
1079+ values2 [j ])))
1080+ {
1081+ hasmatch1 [i ] = hasmatch2 [j ] = true;
1082+ matchprodfreq += numbers1 [i ] * numbers2 [j ];
1083+ nmatches ++ ;
1084+ break ;
1085+ }
1086+ }
1087+ }
1088+ /* Sum up frequencies of matched and unmatched MCVs */
1089+ matchfreq1 = unmatchfreq1 = 0.0 ;
1090+ for (i = 0 ; i < nvalues1 ; i ++ )
1091+ {
1092+ if (hasmatch1 [i ])
1093+ matchfreq1 += numbers1 [i ];
1094+ else
1095+ unmatchfreq1 += numbers1 [i ];
1096+ }
1097+ matchfreq2 = unmatchfreq2 = 0.0 ;
1098+ for (i = 0 ; i < nvalues2 ; i ++ )
1099+ {
1100+ if (hasmatch2 [i ])
1101+ matchfreq2 += numbers2 [i ];
1102+ else
1103+ unmatchfreq2 += numbers2 [i ];
1104+ }
1105+ pfree (hasmatch1 );
1106+ pfree (hasmatch2 );
1107+ /*
1108+ * Compute total frequency of non-null values that are not in
1109+ * the MCV lists.
1110+ */
1111+ otherfreq1 = 1.0 - stats1 -> stanullfrac - matchfreq1 - unmatchfreq1 ;
1112+ otherfreq2 = 1.0 - stats2 -> stanullfrac - matchfreq2 - unmatchfreq2 ;
1113+ /*
1114+ * We can estimate the total selectivity from the point of view
1115+ * of relation 1 as: the known selectivity for matched MCVs, plus
1116+ * unmatched MCVs that are assumed to match against random members
1117+ * of relation 2's non-MCV population, plus non-MCV values that
1118+ * are assumed to match against random members of relation 2's
1119+ * unmatched MCVs plus non-MCV values.
1120+ */
1121+ totalsel1 = matchprodfreq ;
1122+ if (nd2 > nvalues2 )
1123+ totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2 );
1124+ if (nd2 > nmatches )
1125+ totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2 ) /
1126+ (nd2 - nmatches );
1127+ /* Same estimate from the point of view of relation 2. */
1128+ totalsel2 = matchprodfreq ;
1129+ if (nd1 > nvalues1 )
1130+ totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1 );
1131+ if (nd1 > nmatches )
1132+ totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1 ) /
1133+ (nd1 - nmatches );
1134+ /*
1135+ * For robustness, we average the two estimates. (Can a case
1136+ * be made for taking the min or max instead?)
1137+ */
1138+ selec = (totalsel1 + totalsel2 ) * 0.5 ;
1139+ }
1140+ else
1141+ {
1142+ /*
1143+ * We do not have MCV lists for both sides. Estimate the
1144+ * join selectivity as MIN(1/nd1, 1/nd2). This is plausible
1145+ * if we assume that the values are about equally distributed:
1146+ * a given tuple of rel1 will join to either 0 or N2/nd2 rows
1147+ * of rel2, so total join rows are at most N1*N2/nd2 giving
1148+ * a join selectivity of not more than 1/nd2. By the same logic
1149+ * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1150+ * bound. Using the MIN() means we estimate from the point of
1151+ * view of the relation with smaller nd (since the larger nd is
1152+ * determining the MIN). It is reasonable to assume that most
1153+ * tuples in this rel will have join partners, so the bound is
1154+ * probably reasonably tight and should be taken as-is.
1155+ *
1156+ * XXX Can we be smarter if we have an MCV list for just one side?
1157+ * It seems that if we assume equal distribution for the other
1158+ * side, we end up with the same answer anyway.
1159+ */
1160+ if (nd1 > nd2 )
1161+ selec = 1.0 / nd1 ;
1162+ else
1163+ selec = 1.0 / nd2 ;
1164+ }
1165+
1166+ if (have_mcvs1 )
1167+ free_attstatsslot (var1 -> vartype , values1 , nvalues1 ,
1168+ numbers1 , nnumbers1 );
1169+ if (have_mcvs2 )
1170+ free_attstatsslot (var2 -> vartype , values2 , nvalues2 ,
1171+ numbers2 , nnumbers2 );
10281172 if (HeapTupleIsValid (statsTuple1 ))
10291173 ReleaseSysCache (statsTuple1 );
10301174 if (HeapTupleIsValid (statsTuple2 ))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
10391183Datum
10401184neqjoinsel (PG_FUNCTION_ARGS )
10411185{
1186+ Query * root = (Query * ) PG_GETARG_POINTER (0 );
1187+ Oid operator = PG_GETARG_OID (1 );
1188+ List * args = (List * ) PG_GETARG_POINTER (2 );
1189+ Oid eqop ;
10421190 float8 result ;
10431191
10441192 /*
1045- * XXX we skip looking up the negator operator here because we know
1046- * eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
1047- * look, this routine will need to look more like neqsel() does.
1193+ * We want 1 - eqjoinsel() where the equality operator is the one
1194+ * associated with this != operator, that is, its negator.
10481195 */
1049- result = DatumGetFloat8 (eqjoinsel (fcinfo ));
1196+ eqop = get_negator (operator );
1197+ if (eqop )
1198+ {
1199+ result = DatumGetFloat8 (DirectFunctionCall3 (eqjoinsel ,
1200+ PointerGetDatum (root ),
1201+ ObjectIdGetDatum (eqop ),
1202+ PointerGetDatum (args )));
1203+
1204+ }
1205+ else
1206+ {
1207+ /* Use default selectivity (should we raise an error instead?) */
1208+ result = DEFAULT_EQ_SEL ;
1209+ }
10501210 result = 1.0 - result ;
10511211 PG_RETURN_FLOAT8 (result );
10521212}
0 commit comments