postgrespro
diff --git a/‎contrib/file_fdw/file_fdw.c‎
Lines changed: 1 addition & 0 deletions b/‎contrib/file_fdw/file_fdw.c‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎contrib/postgres_fdw/postgres_fdw.c‎
Lines changed: 4 additions & 1 deletion b/‎contrib/postgres_fdw/postgres_fdw.c‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎doc/src/sgml/catalogs.sgml‎
Lines changed: 9 additions & 0 deletions b/‎doc/src/sgml/catalogs.sgml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎doc/src/sgml/planstats.sgml‎
Lines changed: 154 additions & 0 deletions b/‎doc/src/sgml/planstats.sgml‎
Lines changed: 154 additions & 0 deletions
diff --git a/‎doc/src/sgml/ref/create_statistics.sgml‎
Lines changed: 39 additions & 3 deletions b/‎doc/src/sgml/ref/create_statistics.sgml‎
Lines changed: 39 additions & 3 deletions
diff --git a/‎src/backend/catalog/system_views.sql‎
Lines changed: 2 additions & 1 deletion b/‎src/backend/catalog/system_views.sql‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/backend/commands/statscmds.c‎
Lines changed: 15 additions & 2 deletions b/‎src/backend/commands/statscmds.c‎
Lines changed: 15 additions & 2 deletions
@@ -1013,6 +1013,7 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel,
 							   baserel->baserestrictinfo,
 							   0,
 							   JOIN_INNER,
+							   NULL,
 							   NULL);
 
 	nrows = clamp_row_est(nrows);
 
@@ -591,6 +591,7 @@ postgresGetForeignRelSize(PlannerInfo *root,
 													 fpinfo->local_conds,
 													 baserel->relid,
 													 JOIN_INNER,
+													 NULL,
 													 NULL);
 
 	cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
@@ -2572,6 +2573,7 @@ estimate_path_cost_size(PlannerInfo *root,
 										   local_param_join_conds,
 										   foreignrel->relid,
 										   JOIN_INNER,
+										   NULL,
 										   NULL);
 		local_sel *= fpinfo->local_conds_sel;
 
@@ -4455,6 +4457,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
 													 fpinfo->local_conds,
 													 0,
 													 JOIN_INNER,
+													 NULL,
 													 NULL);
 	cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
 
@@ -4465,7 +4468,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
 	if (!fpinfo->use_remote_estimate)
 		fpinfo->joinclause_sel = clauselist_selectivity(root, fpinfo->joinclauses,
 														0, fpinfo->jointype,
-														extra->sjinfo);
+														extra->sjinfo, NULL);
 
 	/* Estimate costs for bare join relation */
 	estimate_path_cost_size(root, joinrel, NIL, NIL, &rows,
 
@@ -4339,6 +4339,15 @@
       </entry>
      </row>
 
+     <row>
+      <entry><structfield>stadependencies</structfield></entry>
+      <entry><type>pg_dependencies</type></entry>
+      <entry></entry>
+      <entry>
+       Functional dependencies, serialized as <structname>pg_dependencies</> type.
+      </entry>
+     </row>
+
     </tbody>
    </tgroup>
   </table>
 
@@ -446,6 +446,160 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
    in <filename>src/backend/utils/adt/selfuncs.c</filename>.
   </para>
 
+  <sect2 id="functional-dependencies">
+   <title>Functional Dependencies</title>
+
+   <para>
+    The simplest type of extended statistics are functional dependencies,
+    used in definitions of database normal forms. When simplified, saying that
+    <literal>b</> is functionally dependent on <literal>a</> means that
+    knowledge of value of <literal>a</> is sufficient to determine value of
+    <literal>b</>.
+   </para>
+
+   <para>
+    In normalized databases, only functional dependencies on primary keys
+    and superkeys are allowed. However, in practice, many data sets are not
+    fully normalized, for example, due to intentional denormalization for
+    performance reasons.
+   </para>
+
+   <para>
+    Functional dependencies directly affect accuracy of the estimates, as
+    conditions on the dependent column(s) do not restrict the result set,
+    resulting in underestimates.
+   </para>
+
+   <para>
+    To inform the planner about the functional dependencies, we collect
+    measurements of dependency during <command>ANALYZE</>. Assessing
+    dependency between all sets of columns would be prohibitively
+    expensive, so we limit our search to potential dependencies defined
+    using the <command>CREATE STATISTICS</> command.
+
+<programlisting>
+CREATE TABLE t (a INT, b INT);
+INSERT INTO t SELECT i/100, i/100 FROM generate_series(1,10000) s(i);
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;
+ANALYZE t;
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+                                          QUERY PLAN                                             
+-------------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual time=0.095..3.118 rows=100 loops=1)
+   Filter: ((a = 1) AND (b = 1))
+   Rows Removed by Filter: 9900
+ Planning time: 0.367 ms
+ Execution time: 3.380 ms
+(5 rows)
+</programlisting>
+
+    The planner is now aware of the functional dependencies and considers
+    them when computing the selectivity of the second condition.  Running
+    the query without the statistics would lead to quite different estimates.
+
+<programlisting>
+DROP STATISTICS s1;
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+                                          QUERY PLAN                                           
+-----------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual time=0.000..6.379 rows=100 loops=1)
+   Filter: ((a = 1) AND (b = 1))
+   Rows Removed by Filter: 9900
+ Planning time: 0.000 ms
+ Execution time: 6.379 ms
+(5 rows)
+</programlisting>
+   </para>
+
+   <para>
+    If no dependency exists, the collected statistics do not influence the
+    query plan. The only effect is to slow down <command>ANALYZE</>. Should
+    partial dependencies exist these will also be stored and applied
+    during planning.
+   </para>
+
+   <para>
+    Similarly to per-column statistics, extended statistics are stored in
+    a system catalog called <structname>pg_statistic_ext</structname>, but
+    there is also a more convenient view <structname>pg_stats_ext</structname>.
+    To inspect the statistics <literal>s1</literal> defined above,
+    you may do this:
+
+<programlisting>
+SELECT tablename, staname, attnums, depsbytes
+  FROM pg_stats_ext WHERE staname = 's1';
+ tablename | staname | attnums | depsbytes 
+-----------+---------+---------+-----------
+ t         | s1      | 1 2     |        40
+(1 row)
+</programlisting>
+
+     This shows that the statistics are defined on table <structname>t</>,
+     <structfield>attnums</structfield> lists attribute numbers of columns
+     (references <structname>pg_attribute</structname>). It also shows
+     the length in bytes of the functional dependencies, as found by
+     <command>ANALYZE</> when serialized into a <literal>bytea</> column.
+   </para>
+
+   <para>
+    When computing the selectivity, the planner inspects all conditions and
+    attempts to identify which conditions are already implied by other
+    conditions.  The selectivity estimates from any redundant conditions are
+    ignored from a selectivity point of view. In the example query above,
+    the selectivity estimates for either of the conditions may be eliminated,
+    thus improving the overall estimate.
+   </para>
+
+    <sect3 id="functional-dependencies-limitations">
+     <title>Limitations of functional dependencies</title>
+
+     <para>
+      Functional dependencies are a very simple type of statistics, and
+      as such have several limitations. The first limitation is that they
+      only work with simple equality conditions, comparing columns and constant
+      values. It's not possible to use them to eliminate equality conditions
+      comparing two columns or a column to an expression, range clauses,
+      <literal>LIKE</> or any other type of conditions.
+     </para>
+
+     <para>
+      When eliminating the implied conditions, the planner assumes that the
+      conditions are compatible. Consider the following example, violating
+      this assumption:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10;
+                                          QUERY PLAN
+-----------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual time=2.992..2.992 rows=0 loops=1)
+   Filter: ((a = 1) AND (b = 10))
+   Rows Removed by Filter: 10000
+ Planning time: 0.232 ms
+ Execution time: 3.033 ms
+(5 rows)
+</programlisting>
+
+      While there are no rows with such combination of values, the planner
+      is unable to verify whether the values match - it only knows that
+      the columns are functionally dependent.
+     </para>
+
+     <para>
+      This assumption is more about queries executed on the database - in many
+      cases, it's actually satisfied (e.g. when the GUI only allows selecting
+      compatible values). But if that's not the case, functional dependencies
+      may not be a viable option.
+     </para>
+
+     <para>
+      For additional information about functional dependencies, see
+      <filename>src/backend/statistics/README.dependencies</>.
+     </para>
+
+    </sect3>
+
+  </sect2>
+
  </sect1>
 
 </chapter>
@@ -21,8 +21,9 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
-CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (
-  <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable>
+  WITH ( <replaceable class="PARAMETER">option</replaceable> [= <replaceable class="PARAMETER">value</replaceable>] [, ... ] )
+  ON ( <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
   FROM <replaceable class="PARAMETER">table_name</replaceable>
 </synopsis>
 
@@ -94,6 +95,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
 
   </variablelist>
 
+  <refsect2 id="SQL-CREATESTATISTICS-parameters">
+   <title id="SQL-CREATESTATISTICS-parameters-title">Parameters</title>
+
+ <indexterm zone="sql-createstatistics-parameters">
+  <primary>statistics parameters</primary>
+ </indexterm>
+
+   <para>
+    The <literal>WITH</> clause can specify <firstterm>options</>
+    for the statistics. Available options are listed below.
+   </para>
+
+   <variablelist>
+
+   <varlistentry>
+    <term><literal>dependencies</> (<type>boolean</>)</term>
+    <listitem>
+     <para>
+      Enables functional dependencies for the statistics.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><literal>ndistinct</> (<type>boolean</>)</term>
+    <listitem>
+     <para>
+      Enables ndistinct coefficients for the statistics.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   </variablelist>
+
+  </refsect2>
  </refsect1>
 
  <refsect1>
@@ -122,7 +158,7 @@ CREATE TABLE t1 (
 INSERT INTO t1 SELECT i/100, i/500
                  FROM generate_series(1,1000000) s(i);
 
-CREATE STATISTICS s1 ON (a, b) FROM t1;
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t1;
 
 ANALYZE t1;
 
 
@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS
         C.relname AS tablename,
         S.staname AS staname,
         S.stakeys AS attnums,
-        length(s.standistinct) AS ndistbytes
+        length(s.standistinct::bytea) AS ndistbytes,
+        length(S.stadependencies::bytea) AS depsbytes
     FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
         LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
 
 
@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 	Oid			relid;
 	ObjectAddress parentobject,
 				childobject;
-	Datum		types[1];		/* only ndistinct defined now */
+	Datum		types[2];		/* one for each possible type of statistics */
 	int			ntypes;
 	ArrayType  *staenabled;
 	bool		build_ndistinct;
+	bool		build_dependencies;
 	bool		requested_type = false;
 
 	Assert(IsA(stmt, CreateStatsStmt));
@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 				 errmsg("statistics require at least 2 columns")));
 
 	/*
-	 * Sort the attnums, which makes detecting duplicies somewhat easier, and
+	 * Sort the attnums, which makes detecting duplicities somewhat easier, and
 	 * it does not hurt (it does not affect the efficiency, unlike for
 	 * indexes, for example).
 	 */
@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	 * recognized.
 	 */
 	build_ndistinct = false;
+	build_dependencies = false;
 	foreach(l, stmt->options)
 	{
 		DefElem    *opt = (DefElem *) lfirst(l);
@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 			build_ndistinct = defGetBoolean(opt);
 			requested_type = true;
 		}
+		else if (strcmp(opt->defname, "dependencies") == 0)
+		{
+			build_dependencies = defGetBoolean(opt);
+			requested_type = true;
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)
 	}
 	/* If no statistic type was specified, build them all. */
 	if (!requested_type)
+	{
 		build_ndistinct = true;
+		build_dependencies = true;
+	}
 
 	/* construct the char array of enabled statistic types */
 	ntypes = 0;
 	if (build_ndistinct)
 		types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
+	if (build_dependencies)
+		types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
 	Assert(ntypes > 0);
 	staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');
 
@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 
 	/* no statistics build yet */
 	nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
+	nulls[Anum_pg_statistic_ext_stadependencies - 1] = true;
 
 	/* insert it into pg_statistic_ext */
 	statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);