@@ -54,7 +54,9 @@ typedef struct varlena VarString;
5454 */
5555typedef struct
5656{
57+ pg_locale_t locale ; /* collation used for substring matching */
5758 bool is_multibyte_char_in_char ; /* need to check char boundaries? */
59+ bool greedy ; /* find longest possible substring? */
5860
5961 char * str1 ; /* haystack string */
6062 char * str2 ; /* needle string */
@@ -65,7 +67,13 @@ typedef struct
6567 int skiptablemask ; /* mask for ANDing with skiptable subscripts */
6668 int skiptable [256 ]; /* skip distance for given mismatched char */
6769
70+ /*
71+ * Note that with nondeterministic collations, the length of the last
72+ * match is not necessarily equal to the length of the "needle" passed in.
73+ */
6874 char * last_match ; /* pointer to last match in 'str1' */
75+ int last_match_len ; /* length of last match */
76+ int last_match_len_tmp ; /* same but for internal use */
6977
7078 /*
7179 * Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
11781186 TextPositionState state ;
11791187 int result ;
11801188
1189+ check_collation_set (collid );
1190+
11811191 /* Empty needle always matches at position 1 */
11821192 if (VARSIZE_ANY_EXHDR (t2 ) < 1 )
11831193 return 1 ;
11841194
11851195 /* Otherwise, can't match if haystack is shorter than needle */
1186- if (VARSIZE_ANY_EXHDR (t1 ) < VARSIZE_ANY_EXHDR (t2 ))
1196+ if (VARSIZE_ANY_EXHDR (t1 ) < VARSIZE_ANY_EXHDR (t2 ) &&
1197+ pg_newlocale_from_collation (collid )-> deterministic )
11871198 return 0 ;
11881199
11891200 text_position_setup (t1 , t2 , collid , & state );
1201+ /* don't need greedy mode here */
1202+ state .greedy = false;
1203+
11901204 if (!text_position_next (& state ))
11911205 result = 0 ;
11921206 else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12171231{
12181232 int len1 = VARSIZE_ANY_EXHDR (t1 );
12191233 int len2 = VARSIZE_ANY_EXHDR (t2 );
1220- pg_locale_t mylocale ;
12211234
12221235 check_collation_set (collid );
12231236
1224- mylocale = pg_newlocale_from_collation (collid );
1237+ state -> locale = pg_newlocale_from_collation (collid );
12251238
1226- if (!mylocale -> deterministic )
1227- ereport (ERROR ,
1228- (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1229- errmsg ("nondeterministic collations are not supported for substring searches" )));
1239+ /*
1240+ * Most callers need greedy mode, but some might want to unset this to
1241+ * optimize.
1242+ */
1243+ state -> greedy = true;
12301244
1231- Assert (len1 > 0 );
12321245 Assert (len2 > 0 );
12331246
12341247 /*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12641277 * point in wasting cycles initializing the table. We also choose not to
12651278 * use B-M-H for needles of length 1, since the skip table can't possibly
12661279 * save anything in that case.
1280+ *
1281+ * (With nondeterministic collations, the search does not use B-M-H at
1282+ * all, so the skip table is not needed.)
12671283 */
1268- if (len1 >= len2 && len2 > 1 )
1284+ if (len1 >= len2 && len2 > 1 && state -> locale -> deterministic )
12691285 {
12701286 int searchlength = len1 - len2 ;
12711287 int skiptablemask ;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
13431359
13441360 /* Start from the point right after the previous match. */
13451361 if (state -> last_match )
1346- start_ptr = state -> last_match + needle_len ;
1362+ start_ptr = state -> last_match + state -> last_match_len ;
13471363 else
13481364 start_ptr = state -> str1 ;
13491365
@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
13591375 * multi-byte character, we need to verify that the match was at a
13601376 * character boundary, not in the middle of a multi-byte character.
13611377 */
1362- if (state -> is_multibyte_char_in_char )
1378+ if (state -> is_multibyte_char_in_char && state -> locale -> deterministic )
13631379 {
13641380 /* Walk one character at a time, until we reach the match. */
13651381
@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
13871403 }
13881404
13891405 state -> last_match = matchptr ;
1406+ state -> last_match_len = state -> last_match_len_tmp ;
13901407 return true;
13911408}
13921409
@@ -1408,7 +1425,62 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14081425
14091426 Assert (start_ptr >= haystack && start_ptr <= haystack_end );
14101427
1411- if (needle_len == 1 )
1428+ state -> last_match_len_tmp = needle_len ;
1429+
1430+ if (!state -> locale -> deterministic )
1431+ {
1432+ /*
1433+ * With a nondeterministic collation, we have to use an unoptimized
1434+ * route. We walk through the haystack and see if at each position
1435+ * there is a substring of the remaining string that is equal to the
1436+ * needle under the given collation.
1437+ *
1438+ * Note, the found substring could have a different length than the
1439+ * needle, including being empty. Callers that want to skip over the
1440+ * found string need to read the length of the found substring from
1441+ * last_match_len rather than just using the length of their needle.
1442+ *
1443+ * Most callers will require "greedy" semantics, meaning that we need
1444+ * to find the longest such substring, not the shortest. For callers
1445+ * that don't need greedy semantics, we can finish on the first match.
1446+ */
1447+ const char * result_hptr = NULL ;
1448+
1449+ hptr = start_ptr ;
1450+ while (hptr < haystack_end )
1451+ {
1452+ /*
1453+ * In non-greedy mode, first try the common case of a match in the
1454+ * haystack of exactly the length of the needle.
1455+ */
1456+ if (!state -> greedy &&
1457+ haystack_end - hptr >= needle_len &&
1458+ pg_strncoll (hptr , needle_len , needle , needle_len , state -> locale ) == 0 )
1459+ return (char * ) hptr ;
1460+
1461+ /*
1462+ * Else check if any of the possible substrings starting at hptr
1463+ * are equal to the needle.
1464+ */
1465+ for (const char * test_end = hptr ; test_end < haystack_end ; test_end += pg_mblen (test_end ))
1466+ {
1467+ if (pg_strncoll (hptr , (test_end - hptr ), needle , needle_len , state -> locale ) == 0 )
1468+ {
1469+ state -> last_match_len_tmp = (test_end - hptr );
1470+ result_hptr = hptr ;
1471+ if (!state -> greedy )
1472+ break ;
1473+ }
1474+ }
1475+ if (result_hptr )
1476+ break ;
1477+
1478+ hptr += pg_mblen (hptr );
1479+ }
1480+
1481+ return (char * ) result_hptr ;
1482+ }
1483+ else if (needle_len == 1 )
14121484 {
14131485 /* No point in using B-M-H for a one-character needle */
14141486 char nchar = * needle ;
@@ -4055,7 +4127,7 @@ replace_text(PG_FUNCTION_ARGS)
40554127
40564128 appendStringInfoText (& str , to_sub_text );
40574129
4058- start_ptr = curr_ptr + from_sub_text_len ;
4130+ start_ptr = curr_ptr + state . last_match_len ;
40594131
40604132 found = text_position_next (& state );
40614133 if (found )
@@ -4445,7 +4517,7 @@ split_part(PG_FUNCTION_ARGS)
44454517 /* special case of last field does not require an extra pass */
44464518 if (fldnum == -1 )
44474519 {
4448- start_ptr = text_position_get_match_ptr (& state ) + fldsep_len ;
4520+ start_ptr = text_position_get_match_ptr (& state ) + state . last_match_len ;
44494521 end_ptr = VARDATA_ANY (inputstring ) + inputstring_len ;
44504522 text_position_cleanup (& state );
44514523 PG_RETURN_TEXT_P (cstring_to_text_with_len (start_ptr ,
@@ -4475,7 +4547,7 @@ split_part(PG_FUNCTION_ARGS)
44754547 while (found && -- fldnum > 0 )
44764548 {
44774549 /* identify bounds of next field */
4478- start_ptr = end_ptr + fldsep_len ;
4550+ start_ptr = end_ptr + state . last_match_len ;
44794551 found = text_position_next (& state );
44804552 if (found )
44814553 end_ptr = text_position_get_match_ptr (& state );
@@ -4691,7 +4763,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
46914763 if (!found )
46924764 break ;
46934765
4694- start_ptr = end_ptr + fldsep_len ;
4766+ start_ptr = end_ptr + state . last_match_len ;
46954767 }
46964768
46974769 text_position_cleanup (& state );
0 commit comments