@@ -207,124 +207,160 @@ clean_NOT(QueryItem *ptr, int *len)
207207}
208208
209209
210- #ifdef V_UNKNOWN /* exists in Windows headers */
211- #undef V_UNKNOWN
212- #endif
213- #ifdef V_FALSE /* exists in Solaris headers */
214- #undef V_FALSE
215- #endif
216-
217- /*
218- * output values for result output parameter of clean_fakeval_intree
219- */
220- #define V_UNKNOWN 0 /* the expression can't be evaluated
221- * statically */
222- #define V_TRUE 1 /* the expression is always true (not
223- * implemented) */
224- #define V_FALSE 2 /* the expression is always false (not
225- * implemented) */
226- #define V_STOP 3 /* the expression is a stop word */
227-
228210/*
229- * Remove QI_VALSTOP (stopword nodes) from query tree.
211+ * Remove QI_VALSTOP (stopword) nodes from query tree.
212+ *
213+ * Returns NULL if the query degenerates to nothing. Input must not be NULL.
214+ *
215+ * When we remove a phrase operator due to removing one or both of its
216+ * arguments, we might need to adjust the distance of a parent phrase
217+ * operator. For example, 'a' is a stopword, so:
218+ * (b <-> a) <-> c should become b <2> c
219+ * b <-> (a <-> c) should become b <2> c
220+ * (b <-> (a <-> a)) <-> c should become b <3> c
221+ * b <-> ((a <-> a) <-> c) should become b <3> c
222+ * To handle that, we define two output parameters:
223+ * ladd: amount to add to a phrase distance to the left of this node
224+ * radd: amount to add to a phrase distance to the right of this node
225+ * We need two outputs because we could need to bubble up adjustments to two
226+ * different parent phrase operators. Consider
227+ * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
228+ * After we've removed the two a's and are considering the <2> node (which is
229+ * now just x <2> y), we have an ladd distance of 1 that needs to propagate
230+ * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
231+ * propagate to the rightmost <->, so that we'll end up with
232+ * w <2> ((x <2> y) <4> z)
233+ * Near the bottom of the tree, we may have subtrees consisting only of
234+ * stopwords. The distances of any phrase operators within such a subtree are
235+ * summed and propagated to both ladd and radd, since we don't know which side
236+ * of the lowest surviving phrase operator we are on. The rule is that any
237+ * subtree that degenerates to NULL must return equal values of ladd and radd,
238+ * and the parent node dealing with it should incorporate only one of those.
239+ *
240+ * Currently, we only implement this adjustment for adjacent phrase operators.
241+ * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
242+ * isn't ideal, but there is no way to represent the really desired semantics
243+ * without some redesign of the tsquery structure. Certainly it would not be
244+ * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
245+ * corner case, let it go for now. But we can fix it in cases where the
246+ * intervening non-phrase operator also gets removed, for example
247+ * '((x <-> a) | a) <-> y' will become 'x <2> y'.
230248 */
231249static NODE *
232- clean_fakeval_intree (NODE * node , char * result , int * adddistance )
250+ clean_stopword_intree (NODE * node , int * ladd , int * radd )
233251{
234- char lresult = V_UNKNOWN ,
235- rresult = V_UNKNOWN ;
236-
237252 /* since this function recurses, it could be driven to stack overflow. */
238253 check_stack_depth ();
239254
240- if ( adddistance )
241- * adddistance = 0 ;
255+ /* default output parameters indicate no change in parent distance */
256+ * ladd = * radd = 0 ;
242257
243258 if (node -> valnode -> type == QI_VAL )
244259 return node ;
245260 else if (node -> valnode -> type == QI_VALSTOP )
246261 {
247262 pfree (node );
248- * result = V_STOP ;
249263 return NULL ;
250264 }
251265
252266 Assert (node -> valnode -> type == QI_OPR );
253267
254268 if (node -> valnode -> qoperator .oper == OP_NOT )
255269 {
256- node -> right = clean_fakeval_intree (node -> right , & rresult , NULL );
270+ /* NOT doesn't change pattern width, so just report child distances */
271+ node -> right = clean_stopword_intree (node -> right , ladd , radd );
257272 if (!node -> right )
258273 {
259- * result = V_STOP ;
260274 freetree (node );
261275 return NULL ;
262276 }
263277 }
264278 else
265279 {
266280 NODE * res = node ;
281+ bool isphrase ;
267282 int ndistance ,
268- ldistance = 0 ,
269- rdistance = 0 ;
283+ lladd ,
284+ lradd ,
285+ rladd ,
286+ rradd ;
270287
271- ndistance = ( node -> valnode -> qoperator . oper == OP_PHRASE ) ?
272- node -> valnode -> qoperator . distance :
273- 0 ;
288+ /* First, recurse */
289+ node -> left = clean_stopword_intree ( node -> left , & lladd , & lradd );
290+ node -> right = clean_stopword_intree ( node -> right , & rladd , & rradd ) ;
274291
275- node -> left = clean_fakeval_intree ( node -> left ,
276- & lresult ,
277- ndistance ? & ldistance : NULL ) ;
292+ /* Check if current node is OP_PHRASE, get its distance */
293+ isphrase = ( node -> valnode -> qoperator . oper == OP_PHRASE );
294+ ndistance = isphrase ? node -> valnode -> qoperator . distance : 0 ;
278295
279- node -> right = clean_fakeval_intree (node -> right ,
280- & rresult ,
281- ndistance ? & rdistance : NULL );
282-
283- /*
284- * ndistance, ldistance and rdistance are greater than zero if their
285- * corresponding nodes are OP_PHRASE
286- */
287-
288- if (lresult == V_STOP && rresult == V_STOP )
296+ if (node -> left == NULL && node -> right == NULL )
289297 {
290- if (adddistance && ndistance )
291- * adddistance = ldistance + ndistance + rdistance ;
298+ /*
299+ * When we collapse out a phrase node entirely, propagate its own
300+ * distance into both *ladd and *radd; it is the responsibility of
301+ * the parent node to count it only once. Also, for a phrase
302+ * node, distances coming from children are summed and propagated
303+ * up to parent (we assume lladd == lradd and rladd == rradd, else
304+ * rule was broken at a lower level). But if this isn't a phrase
305+ * node, take the larger of the two child distances; that
306+ * corresponds to what TS_execute will do in non-stopword cases.
307+ */
308+ if (isphrase )
309+ * ladd = * radd = lladd + ndistance + rladd ;
310+ else
311+ * ladd = * radd = Max (lladd , rladd );
292312 freetree (node );
293- * result = V_STOP ;
294313 return NULL ;
295314 }
296- else if (lresult == V_STOP )
315+ else if (node -> left == NULL )
297316 {
317+ /* Removing this operator and left subnode */
318+ /* lladd and lradd are equal/redundant, don't count both */
319+ if (isphrase )
320+ {
321+ /* operator's own distance must propagate to left */
322+ * ladd = lladd + ndistance + rladd ;
323+ * radd = rradd ;
324+ }
325+ else
326+ {
327+ /* at non-phrase op, just forget the left subnode entirely */
328+ * ladd = rladd ;
329+ * radd = rradd ;
330+ }
298331 res = node -> right ;
299-
300- /*
301- * propagate distance from current node to the right upper
302- * subtree.
303- */
304- if (adddistance && ndistance )
305- * adddistance = rdistance ;
306332 pfree (node );
307333 }
308- else if (rresult == V_STOP )
334+ else if (node -> right == NULL )
309335 {
336+ /* Removing this operator and right subnode */
337+ /* rladd and rradd are equal/redundant, don't count both */
338+ if (isphrase )
339+ {
340+ /* operator's own distance must propagate to right */
341+ * ladd = lladd ;
342+ * radd = lradd + ndistance + rradd ;
343+ }
344+ else
345+ {
346+ /* at non-phrase op, just forget the right subnode entirely */
347+ * ladd = lladd ;
348+ * radd = lradd ;
349+ }
310350 res = node -> left ;
311-
312- /*
313- * propagate distance from current node to the upper tree.
314- */
315- if (adddistance && ndistance )
316- * adddistance = ndistance + ldistance ;
317351 pfree (node );
318352 }
319- else if (ndistance )
353+ else if (isphrase )
320354 {
321- node -> valnode -> qoperator .distance += ldistance ;
322- if (adddistance )
323- * adddistance = 0 ;
355+ /* Absorb appropriate corrections at this level */
356+ node -> valnode -> qoperator .distance += lradd + rladd ;
357+ /* Propagate up any unaccounted-for corrections */
358+ * ladd = lladd ;
359+ * radd = rradd ;
324360 }
325- else if ( adddistance )
361+ else
326362 {
327- * adddistance = 0 ;
363+ /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
328364 }
329365
330366 return res ;
@@ -585,7 +621,8 @@ cleanup_fakeval_and_phrase(TSQuery in)
585621 commonlen ,
586622 i ;
587623 NODE * root ;
588- char result = V_UNKNOWN ;
624+ int ladd ,
625+ radd ;
589626 TSQuery out ;
590627 QueryItem * items ;
591628 char * operands ;
@@ -594,8 +631,8 @@ cleanup_fakeval_and_phrase(TSQuery in)
594631 return in ;
595632
596633 /* eliminate stop words */
597- root = clean_fakeval_intree (maketree (GETQUERY (in )), & result , NULL );
598- if (result != V_UNKNOWN )
634+ root = clean_stopword_intree (maketree (GETQUERY (in )), & ladd , & radd );
635+ if (root == NULL )
599636 {
600637 ereport (NOTICE ,
601638 (errmsg ("text-search query contains only stop words or doesn't contain lexemes, ignored" )));
0 commit comments