Fix eqjoinsel() to make use of new statistics.

tglsfdc · tglsfdc · commit 73d1040bd924 · 2001-05-27T17:37:48.000Z
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -940,9 +940,7 @@ Datum
 eqjoinsel(PG_FUNCTION_ARGS)
 {
 	Query	   *root = (Query *) PG_GETARG_POINTER(0);
-#ifdef NOT_USED					/* see neqjoinsel() before removing me! */
 	Oid			operator = PG_GETARG_OID(1);
-#endif
 	List	   *args = (List *) PG_GETARG_POINTER(2);
 	Var		   *var1;
 	Var		   *var2;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
 		HeapTuple	statsTuple2 = NULL;
 		Form_pg_statistic stats1 = NULL;
 		Form_pg_statistic stats2 = NULL;
-		double		nd1,
-					nd2;
-
-		if (var1 == NULL)
-		{
-			nd1 = DEFAULT_NUM_DISTINCT;
-		}
-		else
+		double		nd1 = DEFAULT_NUM_DISTINCT;
+		double		nd2 = DEFAULT_NUM_DISTINCT;
+		bool		have_mcvs1 = false;
+		Datum	   *values1 = NULL;
+		int			nvalues1 = 0;
+		float4	   *numbers1 = NULL;
+		int			nnumbers1 = 0;
+		bool		have_mcvs2 = false;
+		Datum	   *values2 = NULL;
+		int			nvalues2 = 0;
+		float4	   *numbers2 = NULL;
+		int			nnumbers2 = 0;
+
+		if (var1 != NULL)
 		{
 			/* get stats for the attribute, if available */
 			Oid		relid1 = getrelid(var1->varno, root->rtable);
 
-			if (relid1 == InvalidOid)
-				nd1 = DEFAULT_NUM_DISTINCT;
-			else
+			if (relid1 != InvalidOid)
 			{
 				statsTuple1 = SearchSysCache(STATRELATT,
 											 ObjectIdGetDatum(relid1),
 											 Int16GetDatum(var1->varattno),
 											 0, 0);
 				if (HeapTupleIsValid(statsTuple1))
+				{
 					stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
+					have_mcvs1 = get_attstatsslot(statsTuple1,
+												  var1->vartype,
+												  var1->vartypmod,
+												  STATISTIC_KIND_MCV,
+												  InvalidOid,
+												  &values1, &nvalues1,
+												  &numbers1, &nnumbers1);
+				}
 
 				nd1 = get_att_numdistinct(root, var1, stats1);
 			}
 		}
 
-		if (var2 == NULL)
-		{
-			nd2 = DEFAULT_NUM_DISTINCT;
-		}
-		else
+		if (var2 != NULL)
 		{
 			/* get stats for the attribute, if available */
 			Oid		relid2 = getrelid(var2->varno, root->rtable);
 
-			if (relid2 == InvalidOid)
-				nd2 = DEFAULT_NUM_DISTINCT;
-			else
+			if (relid2 != InvalidOid)
 			{
 				statsTuple2 = SearchSysCache(STATRELATT,
 											 ObjectIdGetDatum(relid2),
 											 Int16GetDatum(var2->varattno),
 											 0, 0);
 				if (HeapTupleIsValid(statsTuple2))
+				{
 					stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
+					have_mcvs2 = get_attstatsslot(statsTuple2,
+												  var2->vartype,
+												  var2->vartypmod,
+												  STATISTIC_KIND_MCV,
+												  InvalidOid,
+												  &values2, &nvalues2,
+												  &numbers2, &nnumbers2);
+				}
 
 				nd2 = get_att_numdistinct(root, var2, stats2);
 			}
 		}
 
-		/*
-		 * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
-		 * (can we produce any theory for this)?
-		 *
-		 * XXX possibility to do better: if both attributes have histograms
-		 * then we could determine the exact join selectivity between the
-		 * MCV sets, and only have to assume the join behavior of the non-MCV
-		 * values.  This could be a big win when the MCVs cover a large part
-		 * of the population.
-		 *
-		 * XXX what about nulls?
-		 */
-		selec = 1.0 / sqrt(nd1 * nd2);
-		if (selec > 1.0)
-			selec = 1.0;
+		if (have_mcvs1 && have_mcvs2)
+		{
+			/*
+			 * We have most-common-value lists for both relations.  Run
+			 * through the lists to see which MCVs actually join to each
+			 * other with the given operator.  This allows us to determine
+			 * the exact join selectivity for the portion of the relations
+			 * represented by the MCV lists.  We still have to estimate for
+			 * the remaining population, but in a skewed distribution this
+			 * gives us a big leg up in accuracy.  For motivation see the
+			 * analysis in Y. Ioannidis and S. Christodoulakis, "On the
+			 * propagation of errors in the size of join results", Technical
+			 * Report 1018, Computer Science Dept., University of Wisconsin,
+			 * Madison, March 1991 (available from ftp.cs.wisc.edu).
+			 */
+			FmgrInfo	eqproc;
+			bool	   *hasmatch1;
+			bool	   *hasmatch2;
+			double		matchprodfreq,
+						matchfreq1,
+						matchfreq2,
+						unmatchfreq1,
+						unmatchfreq2,
+						otherfreq1,
+						otherfreq2,
+						totalsel1,
+						totalsel2;
+			int			i,
+						nmatches;
+
+			fmgr_info(get_opcode(operator), &eqproc);
+			hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
+			memset(hasmatch1, 0, nvalues1 * sizeof(bool));
+			hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
+			memset(hasmatch2, 0, nvalues2 * sizeof(bool));
+			/*
+			 * Note we assume that each MCV will match at most one member of
+			 * the other MCV list.  If the operator isn't really equality,
+			 * there could be multiple matches --- but we don't look for them,
+			 * both for speed and because the math wouldn't add up...
+			 */
+			matchprodfreq = 0.0;
+			nmatches = 0;
+			for (i = 0; i < nvalues1; i++)
+			{
+				int		j;
 
+				for (j = 0; j < nvalues2; j++)
+				{
+					if (hasmatch2[j])
+						continue;
+					if (DatumGetBool(FunctionCall2(&eqproc,
+												   values1[i],
+												   values2[j])))
+					{
+						hasmatch1[i] = hasmatch2[j] = true;
+						matchprodfreq += numbers1[i] * numbers2[j];
+						nmatches++;
+						break;
+					}
+				}
+			}
+			/* Sum up frequencies of matched and unmatched MCVs */
+			matchfreq1 = unmatchfreq1 = 0.0;
+			for (i = 0; i < nvalues1; i++)
+			{
+				if (hasmatch1[i])
+					matchfreq1 += numbers1[i];
+				else
+					unmatchfreq1 += numbers1[i];
+			}
+			matchfreq2 = unmatchfreq2 = 0.0;
+			for (i = 0; i < nvalues2; i++)
+			{
+				if (hasmatch2[i])
+					matchfreq2 += numbers2[i];
+				else
+					unmatchfreq2 += numbers2[i];
+			}
+			pfree(hasmatch1);
+			pfree(hasmatch2);
+			/*
+			 * Compute total frequency of non-null values that are not in
+			 * the MCV lists.
+			 */
+			otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
+			otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
+			/*
+			 * We can estimate the total selectivity from the point of view
+			 * of relation 1 as: the known selectivity for matched MCVs, plus
+			 * unmatched MCVs that are assumed to match against random members
+			 * of relation 2's non-MCV population, plus non-MCV values that
+			 * are assumed to match against random members of relation 2's
+			 * unmatched MCVs plus non-MCV values.
+			 */
+			totalsel1 = matchprodfreq;
+			if (nd2 > nvalues2)
+				totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
+			if (nd2 > nmatches)
+				totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
+					(nd2 - nmatches);
+			/* Same estimate from the point of view of relation 2. */
+			totalsel2 = matchprodfreq;
+			if (nd1 > nvalues1)
+				totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
+			if (nd1 > nmatches)
+				totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
+					(nd1 - nmatches);
+			/*
+			 * For robustness, we average the two estimates.  (Can a case
+			 * be made for taking the min or max instead?)
+			 */
+			selec = (totalsel1 + totalsel2) * 0.5;
+		}
+		else
+		{
+			/*
+			 * We do not have MCV lists for both sides.  Estimate the
+			 * join selectivity as MIN(1/nd1, 1/nd2).  This is plausible
+			 * if we assume that the values are about equally distributed:
+			 * a given tuple of rel1 will join to either 0 or N2/nd2 rows
+			 * of rel2, so total join rows are at most N1*N2/nd2 giving
+			 * a join selectivity of not more than 1/nd2.  By the same logic
+			 * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
+			 * bound.  Using the MIN() means we estimate from the point of
+			 * view of the relation with smaller nd (since the larger nd is
+			 * determining the MIN).  It is reasonable to assume that most
+			 * tuples in this rel will have join partners, so the bound is
+			 * probably reasonably tight and should be taken as-is.
+			 *
+			 * XXX Can we be smarter if we have an MCV list for just one side?
+			 * It seems that if we assume equal distribution for the other
+			 * side, we end up with the same answer anyway.
+			 */
+			if (nd1 > nd2)
+				selec = 1.0 / nd1;
+			else
+				selec = 1.0 / nd2;
+		}
+
+		if (have_mcvs1)
+			free_attstatsslot(var1->vartype, values1, nvalues1,
+							  numbers1, nnumbers1);
+		if (have_mcvs2)
+			free_attstatsslot(var2->vartype, values2, nvalues2,
+							  numbers2, nnumbers2);
 		if (HeapTupleIsValid(statsTuple1))
 			ReleaseSysCache(statsTuple1);
 		if (HeapTupleIsValid(statsTuple2))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
 Datum
 neqjoinsel(PG_FUNCTION_ARGS)
 {
+	Query	   *root = (Query *) PG_GETARG_POINTER(0);
+	Oid			operator = PG_GETARG_OID(1);
+	List	   *args = (List *) PG_GETARG_POINTER(2);
+	Oid			eqop;
 	float8		result;
 
 	/*
-	 * XXX we skip looking up the negator operator here because we know
-	 * eqjoinsel() won't look at it anyway.  If eqjoinsel() ever does
-	 * look, this routine will need to look more like neqsel() does.
+	 * We want 1 - eqjoinsel(), calling eqjoinsel() with the equality operator
+	 * associated with this != operator, that is, its negator (get_negator).
 	 */
-	result = DatumGetFloat8(eqjoinsel(fcinfo));
+	eqop = get_negator(operator);
+	if (eqop)
+	{
+		result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
+											 PointerGetDatum(root),
+											 ObjectIdGetDatum(eqop),
+											 PointerGetDatum(args)));
+
+	}
+	else
+	{
+		/* No negator: assume DEFAULT_EQ_SEL (should we raise an error instead?) */
+		result = DEFAULT_EQ_SEL;
+	}
 	result = 1.0 - result;
 	PG_RETURN_FLOAT8(result);
 }