@@ -224,6 +224,10 @@ typedef struct
224224 * of aliases to columns of the right input. Thus, positions in the printable
225225 * column alias list are not necessarily one-for-one with varattnos of the
226226 * JOIN, so we need a separate new_colnames[] array for printing purposes.
227+ *
228+ * Finally, when dealing with wide tables we risk O(N^2) costs in assigning
229+ * non-duplicate column names. We ameliorate that by using a hash table that
230+ * holds all the strings appearing in colnames, new_colnames, and parentUsing.
227231 */
228232typedef struct
229233{
@@ -291,6 +295,15 @@ typedef struct
291295 int * leftattnos ; /* left-child varattnos of join cols, or 0 */
292296 int * rightattnos ; /* right-child varattnos of join cols, or 0 */
293297 List * usingNames ; /* names assigned to merged columns */
298+
299+ /*
300+ * Hash table holding copies of all the strings appearing in this struct's
301+ * colnames, new_colnames, and parentUsing. We use a hash table only for
302+ * sufficiently wide relations, and only during the colname-assignment
303+ * functions set_relation_column_names and set_join_column_names;
304+ * otherwise, names_hash is NULL.
305+ */
306+ HTAB * names_hash ; /* entries are just strings */
294307} deparse_columns ;
295308
296309/* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@@ -376,6 +389,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
376389static char * make_colname_unique (char * colname , deparse_namespace * dpns ,
377390 deparse_columns * colinfo );
378391static void expand_colnames_array_to (deparse_columns * colinfo , int n );
392+ static void build_colinfo_names_hash (deparse_columns * colinfo );
393+ static void add_to_names_hash (deparse_columns * colinfo , const char * name );
394+ static void destroy_colinfo_names_hash (deparse_columns * colinfo );
379395static void identify_join_columns (JoinExpr * j , RangeTblEntry * jrte ,
380396 deparse_columns * colinfo );
381397static char * get_rtable_name (int rtindex , deparse_context * context );
@@ -4133,6 +4149,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
41334149 *
41344150 * parentUsing is a list of all USING aliases assigned in parent joins of
41354151 * the current jointree node. (The passed-in list must not be modified.)
4152+ *
4153+ * Note that we do not use per-deparse_columns hash tables in this function.
4154+ * The number of names that need to be assigned should be small enough that
4155+ * we don't need to trouble with that.
41364156 */
41374157static void
41384158set_using_names (deparse_namespace * dpns , Node * jtnode , List * parentUsing )
@@ -4408,6 +4428,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44084428 colinfo -> new_colnames = (char * * ) palloc (ncolumns * sizeof (char * ));
44094429 colinfo -> is_new_col = (bool * ) palloc (ncolumns * sizeof (bool ));
44104430
4431+ /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4432+ build_colinfo_names_hash (colinfo );
4433+
44114434 /*
44124435 * Scan the columns, select a unique alias for each one, and store it in
44134436 * colinfo->colnames and colinfo->new_colnames. The former array has NULL
@@ -4443,6 +4466,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44434466 colname = make_colname_unique (colname , dpns , colinfo );
44444467
44454468 colinfo -> colnames [i ] = colname ;
4469+ add_to_names_hash (colinfo , colname );
44464470 }
44474471
44484472 /* Put names of non-dropped columns in new_colnames[] too */
@@ -4456,6 +4480,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44564480 changed_any = true;
44574481 }
44584482
4483+ /* We're now done needing the colinfo's names_hash */
4484+ destroy_colinfo_names_hash (colinfo );
4485+
44594486 /*
44604487 * Set correct length for new_colnames[] array. (Note: if columns have
44614488 * been added, colinfo->num_cols includes them, which is not really quite
@@ -4526,6 +4553,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45264553 expand_colnames_array_to (colinfo , noldcolumns );
45274554 Assert (colinfo -> num_cols == noldcolumns );
45284555
4556+ /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4557+ build_colinfo_names_hash (colinfo );
4558+
45294559 /*
45304560 * Scan the join output columns, select an alias for each one, and store
45314561 * it in colinfo->colnames. If there are USING columns, set_using_names()
@@ -4563,6 +4593,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45634593 if (rte -> alias == NULL )
45644594 {
45654595 colinfo -> colnames [i ] = real_colname ;
4596+ add_to_names_hash (colinfo , real_colname );
45664597 continue ;
45674598 }
45684599
@@ -4579,6 +4610,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45794610 colname = make_colname_unique (colname , dpns , colinfo );
45804611
45814612 colinfo -> colnames [i ] = colname ;
4613+ add_to_names_hash (colinfo , colname );
45824614 }
45834615
45844616 /* Remember if any assigned aliases differ from "real" name */
@@ -4677,6 +4709,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
46774709 }
46784710 else
46794711 colinfo -> new_colnames [j ] = child_colname ;
4712+ add_to_names_hash (colinfo , colinfo -> new_colnames [j ]);
46804713 }
46814714
46824715 colinfo -> is_new_col [j ] = leftcolinfo -> is_new_col [jc ];
@@ -4726,6 +4759,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
47264759 }
47274760 else
47284761 colinfo -> new_colnames [j ] = child_colname ;
4762+ add_to_names_hash (colinfo , colinfo -> new_colnames [j ]);
47294763 }
47304764
47314765 colinfo -> is_new_col [j ] = rightcolinfo -> is_new_col [jc ];
@@ -4740,6 +4774,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
47404774 Assert (j == nnewcolumns );
47414775#endif
47424776
4777+ /* We're now done needing the colinfo's names_hash */
4778+ destroy_colinfo_names_hash (colinfo );
4779+
47434780 /*
47444781 * For a named join, print column aliases if we changed any from the child
47454782 * names. Unnamed joins cannot print aliases.
@@ -4762,38 +4799,59 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
47624799 int i ;
47634800 ListCell * lc ;
47644801
4765- /* Check against already-assigned column aliases within RTE */
4766- for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4767- {
4768- char * oldname = colinfo -> colnames [i ];
4769-
4770- if (oldname && strcmp (oldname , colname ) == 0 )
4771- return false;
4772- }
4773-
47744802 /*
4775- * If we're building a new_colnames array, check that too (this will be
4776- * partially but not completely redundant with the previous checks)
4803+ * If we have a hash table, consult that instead of linearly scanning the
4804+ * colinfo's strings.
47774805 */
4778- for ( i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4806+ if ( colinfo -> names_hash )
47794807 {
4780- char * oldname = colinfo -> new_colnames [i ];
4781-
4782- if (oldname && strcmp (oldname , colname ) == 0 )
4808+ if (hash_search (colinfo -> names_hash ,
4809+ colname ,
4810+ HASH_FIND ,
4811+ NULL ) != NULL )
47834812 return false;
47844813 }
4785-
4786- /* Also check against USING-column names that must be globally unique */
4787- foreach (lc , dpns -> using_names )
4814+ else
47884815 {
4789- char * oldname = (char * ) lfirst (lc );
4816+ /* Check against already-assigned column aliases within RTE */
4817+ for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4818+ {
4819+ char * oldname = colinfo -> colnames [i ];
47904820
4791- if (strcmp (oldname , colname ) == 0 )
4792- return false;
4821+ if (oldname && strcmp (oldname , colname ) == 0 )
4822+ return false;
4823+ }
4824+
4825+ /*
4826+ * If we're building a new_colnames array, check that too (this will
4827+ * be partially but not completely redundant with the previous checks)
4828+ */
4829+ for (i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4830+ {
4831+ char * oldname = colinfo -> new_colnames [i ];
4832+
4833+ if (oldname && strcmp (oldname , colname ) == 0 )
4834+ return false;
4835+ }
4836+
4837+ /*
4838+ * Also check against names already assigned for parent-join USING
4839+ * cols
4840+ */
4841+ foreach (lc , colinfo -> parentUsing )
4842+ {
4843+ char * oldname = (char * ) lfirst (lc );
4844+
4845+ if (strcmp (oldname , colname ) == 0 )
4846+ return false;
4847+ }
47934848 }
47944849
4795- /* Also check against names already assigned for parent-join USING cols */
4796- foreach (lc , colinfo -> parentUsing )
4850+ /*
4851+ * Also check against USING-column names that must be globally unique.
4852+ * These are not hashed, but there should be few of them.
4853+ */
4854+ foreach (lc , dpns -> using_names )
47974855 {
47984856 char * oldname = (char * ) lfirst (lc );
47994857
@@ -4861,6 +4919,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
48614919 }
48624920}
48634921
4922+ /*
4923+ * build_colinfo_names_hash: optionally construct a hash table for colinfo
4924+ */
4925+ static void
4926+ build_colinfo_names_hash (deparse_columns * colinfo )
4927+ {
4928+ HASHCTL hash_ctl ;
4929+ int i ;
4930+ ListCell * lc ;
4931+
4932+ /*
4933+ * Use a hash table only for RTEs with at least 32 columns. (The cutoff
4934+ * is somewhat arbitrary, but let's choose it so that this code does get
4935+ * exercised in the regression tests.)
4936+ */
4937+ if (colinfo -> num_cols < 32 )
4938+ return ;
4939+
4940+ /*
4941+ * Set up the hash table. The entries are just strings with no other
4942+ * payload.
4943+ */
4944+ hash_ctl .keysize = NAMEDATALEN ;
4945+ hash_ctl .entrysize = NAMEDATALEN ;
4946+ hash_ctl .hcxt = CurrentMemoryContext ;
4947+ colinfo -> names_hash = hash_create ("deparse_columns names" ,
4948+ colinfo -> num_cols + colinfo -> num_new_cols ,
4949+ & hash_ctl ,
4950+ HASH_ELEM | HASH_STRINGS | HASH_CONTEXT );
4951+
4952+ /*
4953+ * Preload the hash table with any names already present (these would have
4954+ * come from set_using_names).
4955+ */
4956+ for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4957+ {
4958+ char * oldname = colinfo -> colnames [i ];
4959+
4960+ if (oldname )
4961+ add_to_names_hash (colinfo , oldname );
4962+ }
4963+
4964+ for (i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4965+ {
4966+ char * oldname = colinfo -> new_colnames [i ];
4967+
4968+ if (oldname )
4969+ add_to_names_hash (colinfo , oldname );
4970+ }
4971+
4972+ foreach (lc , colinfo -> parentUsing )
4973+ {
4974+ char * oldname = (char * ) lfirst (lc );
4975+
4976+ add_to_names_hash (colinfo , oldname );
4977+ }
4978+ }
4979+
4980+ /*
4981+ * add_to_names_hash: add a string to the names_hash, if we're using one
4982+ */
4983+ static void
4984+ add_to_names_hash (deparse_columns * colinfo , const char * name )
4985+ {
4986+ if (colinfo -> names_hash )
4987+ (void ) hash_search (colinfo -> names_hash ,
4988+ name ,
4989+ HASH_ENTER ,
4990+ NULL );
4991+ }
4992+
4993+ /*
4994+ * destroy_colinfo_names_hash: destroy hash table when done with it
4995+ */
4996+ static void
4997+ destroy_colinfo_names_hash (deparse_columns * colinfo )
4998+ {
4999+ if (colinfo -> names_hash )
5000+ {
5001+ hash_destroy (colinfo -> names_hash );
5002+ colinfo -> names_hash = NULL ;
5003+ }
5004+ }
5005+
48645006/*
48655007 * identify_join_columns: figure out where columns of a join come from
48665008 *
0 commit comments