|
7 | 7 | * |
8 | 8 | * |
9 | 9 | * IDENTIFICATION |
10 | | - * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ |
| 10 | + * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $ |
11 | 11 | * |
12 | 12 | *------------------------------------------------------------------------- |
13 | 13 | */ |
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, |
257 | 257 | * |
258 | 258 | * 1 - select(oper) in NOT nodes |
259 | 259 | * |
260 | | - * freq[val] in VAL nodes, if the value is in MCELEM |
| 260 | + * histogram-style estimation over the MCELEM list in prefix VAL nodes |
| 261 | + * |
| 262 | + * freq[val] in exact VAL nodes, if the value is in MCELEM |
261 | 263 | * min(freq[MCELEM]) / 2 in VAL nodes, if it is not |
262 | 264 | * |
263 | 265 | * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use |
264 | 266 | * binary search for determining freq[MCELEM]. |
265 | 267 | * |
266 | 268 | * If we don't have stats for the tsvector, we still use this logic, |
267 | | - * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case |
268 | | - * is signaled by lookup == NULL. |
| 269 | + * except we use default estimates for VAL nodes. This case is signaled |
| 270 | + * by lookup == NULL. |
269 | 271 | */ |
270 | 272 | static Selectivity |
271 | 273 | tsquery_opr_selec(QueryItem *item, char *operand, |
272 | 274 | TextFreq *lookup, int length, float4 minfreq) |
273 | 275 | { |
274 | | - LexemeKey key; |
275 | | - TextFreq *searchres; |
276 | | - Selectivity selec, |
277 | | - s1, |
278 | | - s2; |
| 276 | + Selectivity selec; |
279 | 277 |
|
280 | 278 | /* since this function recurses, it could be driven to stack overflow */ |
281 | 279 | check_stack_depth(); |
282 | 280 |
|
283 | 281 | if (item->type == QI_VAL) |
284 | 282 | { |
285 | 283 | QueryOperand *oper = (QueryOperand *) item; |
286 | | - |
287 | | - /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
288 | | - if (lookup == NULL) |
289 | | - return (Selectivity) DEFAULT_TS_MATCH_SEL; |
| 284 | + LexemeKey key; |
290 | 285 |
|
291 | 286 | /* |
292 | 287 | * Prepare the key for bsearch(). |
293 | 288 | */ |
294 | 289 | key.lexeme = operand + oper->distance; |
295 | 290 | key.length = oper->length; |
296 | 291 |
|
297 | | - searchres = (TextFreq *) bsearch(&key, lookup, length, |
298 | | - sizeof(TextFreq), |
299 | | - compare_lexeme_textfreq); |
300 | | - |
301 | | - if (searchres) |
| 292 | + if (oper->prefix) |
302 | 293 | { |
| 294 | + /* Prefix match, ie the query item is lexeme:* */ |
| 295 | + Selectivity matched, |
| 296 | + allmcvs; |
| 297 | + int i; |
| 298 | + |
| 299 | + /* |
| 300 | + * Our strategy is to scan through the MCV list and add up the |
| 301 | + * frequencies of the ones that match the prefix, thereby |
| 302 | + * assuming that the MCVs are representative of the whole lexeme |
| 303 | + * population in this respect. Compare histogram_selectivity(). |
| 304 | + * |
| 305 | + * This is only a good plan if we have a pretty fair number of |
| 306 | + * MCVs available; we set the threshold at 100. If no stats or |
| 307 | + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. |
| 308 | + */ |
| 309 | + if (lookup == NULL || length < 100) |
| 310 | + return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); |
| 311 | + |
| 312 | + matched = allmcvs = 0; |
| 313 | + for (i = 0; i < length; i++) |
| 314 | + { |
| 315 | + TextFreq *t = lookup + i; |
| 316 | + int tlen = VARSIZE_ANY_EXHDR(t->element); |
| 317 | + |
| 318 | + if (tlen >= key.length && |
| 319 | + strncmp(key.lexeme, VARDATA_ANY(t->element), |
| 320 | + key.length) == 0) |
| 321 | + matched += t->frequency; |
| 322 | + allmcvs += t->frequency; |
| 323 | + } |
| 324 | + |
| 325 | + if (allmcvs > 0) /* paranoia about zero divide */ |
| 326 | + selec = matched / allmcvs; |
| 327 | + else |
| 328 | + selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); |
| 329 | + |
303 | 330 | /* |
304 | | - * The element is in MCELEM. Return precise selectivity (or at |
305 | | - * least as precise as ANALYZE could find out). |
| 331 | + * In any case, never believe that a prefix match has selectivity |
| 332 | + * less than DEFAULT_TS_MATCH_SEL. |
306 | 333 | */ |
307 | | - return (Selectivity) searchres->frequency; |
| 334 | + selec = Max(DEFAULT_TS_MATCH_SEL, selec); |
308 | 335 | } |
309 | 336 | else |
310 | 337 | { |
311 | | - /* |
312 | | - * The element is not in MCELEM. Punt, but assume that the |
313 | | - * selectivity cannot be more than minfreq / 2. |
314 | | - */ |
315 | | - return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); |
| 338 | + /* Regular exact lexeme match */ |
| 339 | + TextFreq *searchres; |
| 340 | + |
| 341 | + /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
| 342 | + if (lookup == NULL) |
| 343 | + return (Selectivity) DEFAULT_TS_MATCH_SEL; |
| 344 | + |
| 345 | + searchres = (TextFreq *) bsearch(&key, lookup, length, |
| 346 | + sizeof(TextFreq), |
| 347 | + compare_lexeme_textfreq); |
| 348 | + |
| 349 | + if (searchres) |
| 350 | + { |
| 351 | + /* |
| 352 | + * The element is in MCELEM. Return precise selectivity (or |
| 353 | + * at least as precise as ANALYZE could find out). |
| 354 | + */ |
| 355 | + selec = searchres->frequency; |
| 356 | + } |
| 357 | + else |
| 358 | + { |
| 359 | + /* |
| 360 | + * The element is not in MCELEM. Punt, but assume that the |
| 361 | + * selectivity cannot be more than minfreq / 2. |
| 362 | + */ |
| 363 | + selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); |
| 364 | + } |
316 | 365 | } |
317 | 366 | } |
318 | | - |
319 | | - /* Current TSQuery node is an operator */ |
320 | | - switch (item->qoperator.oper) |
| 367 | + else |
321 | 368 | { |
322 | | - case OP_NOT: |
323 | | - selec = 1.0 - tsquery_opr_selec(item + 1, operand, |
324 | | - lookup, length, minfreq); |
325 | | - break; |
326 | | - |
327 | | - case OP_AND: |
328 | | - s1 = tsquery_opr_selec(item + 1, operand, |
329 | | - lookup, length, minfreq); |
330 | | - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
331 | | - lookup, length, minfreq); |
332 | | - selec = s1 * s2; |
333 | | - break; |
334 | | - |
335 | | - case OP_OR: |
336 | | - s1 = tsquery_opr_selec(item + 1, operand, |
337 | | - lookup, length, minfreq); |
338 | | - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
339 | | - lookup, length, minfreq); |
340 | | - selec = s1 + s2 - s1 * s2; |
341 | | - break; |
342 | | - |
343 | | - default: |
344 | | - elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); |
345 | | - selec = 0; /* keep compiler quiet */ |
346 | | - break; |
| 369 | + /* Current TSQuery node is an operator */ |
| 370 | + Selectivity s1, |
| 371 | + s2; |
| 372 | + |
| 373 | + switch (item->qoperator.oper) |
| 374 | + { |
| 375 | + case OP_NOT: |
| 376 | + selec = 1.0 - tsquery_opr_selec(item + 1, operand, |
| 377 | + lookup, length, minfreq); |
| 378 | + break; |
| 379 | + |
| 380 | + case OP_AND: |
| 381 | + s1 = tsquery_opr_selec(item + 1, operand, |
| 382 | + lookup, length, minfreq); |
| 383 | + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
| 384 | + lookup, length, minfreq); |
| 385 | + selec = s1 * s2; |
| 386 | + break; |
| 387 | + |
| 388 | + case OP_OR: |
| 389 | + s1 = tsquery_opr_selec(item + 1, operand, |
| 390 | + lookup, length, minfreq); |
| 391 | + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
| 392 | + lookup, length, minfreq); |
| 393 | + selec = s1 + s2 - s1 * s2; |
| 394 | + break; |
| 395 | + |
| 396 | + default: |
| 397 | + elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); |
| 398 | + selec = 0; /* keep compiler quiet */ |
| 399 | + break; |
| 400 | + } |
347 | 401 | } |
348 | 402 |
|
349 | 403 | /* Clamp intermediate results to stay sane despite roundoff error */ |
|
0 commit comments