3535#include "regex/regex.h"
3636#include "utils/array.h"
3737#include "utils/builtins.h"
38+ #include "utils/memutils.h"
3839#include "utils/varlena.h"
3940
4041#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
6162 /* workspace for build_regexp_match_result() */
6263 Datum * elems ; /* has npatterns elements */
6364 bool * nulls ; /* has npatterns elements */
65+ pg_wchar * wide_str ; /* wide-char version of original string */
66+ char * conv_buf ; /* conversion buffer */
67+ int conv_bufsiz ; /* size thereof */
6468} regexp_matches_ctx ;
6569
6670/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115 pg_re_flags * flags ,
112116 Oid collation ,
113117 bool use_subpatterns ,
114- bool ignore_degenerate );
115- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118+ bool ignore_degenerate ,
119+ bool fetching_unmatched );
116120static ArrayType * build_regexp_match_result (regexp_matches_ctx * matchctx );
117121static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118122
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
863867 errhint ("Use the regexp_matches function instead." )));
864868
865869 matchctx = setup_regexp_matches (orig_str , pattern , & re_flags ,
866- PG_GET_COLLATION (), true, false);
870+ PG_GET_COLLATION (), true, false, false );
867871
868872 if (matchctx -> nmatches == 0 )
869873 PG_RETURN_NULL ();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
911915 matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
912916 & re_flags ,
913917 PG_GET_COLLATION (),
914- true, false);
918+ true, false, false );
915919
916920 /* Pre-create workspace that build_regexp_match_result needs */
917921 matchctx -> elems = (Datum * ) palloc (sizeof (Datum ) * matchctx -> npatterns );
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
933937 SRF_RETURN_NEXT (funcctx , PointerGetDatum (result_ary ));
934938 }
935939
936- /* release space in multi-call ctx to avoid intraquery memory leak */
937- cleanup_regexp_matches (matchctx );
938-
939940 SRF_RETURN_DONE (funcctx );
940941}
941942
@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
954955 * all the matching in one swoop. The returned regexp_matches_ctx contains
955956 * the locations of all the substrings matching the pattern.
956957 *
957- * The two bool parameters have only two patterns (one for matching, one for
958+ * The three bool parameters have only two patterns (one for matching, one for
958959 * splitting) but it seems clearer to distinguish the functionality this way
959- * than to key it all off one "is_split" flag.
960+ * than to key it all off one "is_split" flag. We don't currently assume that
961+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
962+ * set, the conversion buffer is large enough to fetch any single matched or
963+ * unmatched string, but not any larger substring. (In practice, when splitting
964+ * the matches are usually small anyway, and it didn't seem worth complicating
965+ * the code further.)
960966 */
961967static regexp_matches_ctx *
962968setup_regexp_matches (text * orig_str , text * pattern , pg_re_flags * re_flags ,
963969 Oid collation ,
964970 bool use_subpatterns ,
965- bool ignore_degenerate )
971+ bool ignore_degenerate ,
972+ bool fetching_unmatched )
966973{
967974 regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
975+ int eml = pg_database_encoding_max_length ();
968976 int orig_len ;
969977 pg_wchar * wide_str ;
970978 int wide_len ;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
975983 int array_idx ;
976984 int prev_match_end ;
977985 int start_search ;
986+ int maxlen = 0 ; /* largest fetch length in characters */
978987
979988 /* save original string --- we'll extract result substrings from it */
980989 matchctx -> orig_str = orig_str ;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10031012 /* temporary output space for RE package */
10041013 pmatch = palloc (sizeof (regmatch_t ) * pmatch_len );
10051014
1006- /* the real output space (grown dynamically if needed) */
1007- array_len = re_flags -> glob ? 256 : 32 ;
1015+ /*
1016+ * the real output space (grown dynamically if needed)
1017+ *
1018+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1019+ * than at 2^27
1020+ */
1021+ array_len = re_flags -> glob ? 255 : 31 ;
10081022 matchctx -> match_locs = (int * ) palloc (sizeof (int ) * array_len );
10091023 array_idx = 0 ;
10101024
@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10241038 pmatch [0 ].rm_eo > prev_match_end ))
10251039 {
10261040 /* enlarge output space if needed */
1027- while (array_idx + matchctx -> npatterns * 2 > array_len )
1041+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
10281042 {
1029- array_len *= 2 ;
1043+ array_len += array_len + 1 ; /* 2^n-1 => 2^(n+1)-1 */
1044+ if (array_len > MaxAllocSize /sizeof (int ))
1045+ ereport (ERROR ,
1046+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
1047+ errmsg ("too many regular expression matches" )));
10301048 matchctx -> match_locs = (int * ) repalloc (matchctx -> match_locs ,
10311049 sizeof (int ) * array_len );
10321050 }
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10381056
10391057 for (i = 1 ; i <= matchctx -> npatterns ; i ++ )
10401058 {
1041- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_so ;
1042- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_eo ;
1059+ int so = pmatch [i ].rm_so ;
1060+ int eo = pmatch [i ].rm_eo ;
1061+ matchctx -> match_locs [array_idx ++ ] = so ;
1062+ matchctx -> match_locs [array_idx ++ ] = eo ;
1063+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1064+ maxlen = (eo - so );
10431065 }
10441066 }
10451067 else
10461068 {
1047- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_so ;
1048- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_eo ;
1069+ int so = pmatch [0 ].rm_so ;
1070+ int eo = pmatch [0 ].rm_eo ;
1071+ matchctx -> match_locs [array_idx ++ ] = so ;
1072+ matchctx -> match_locs [array_idx ++ ] = eo ;
1073+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1074+ maxlen = (eo - so );
10491075 }
10501076 matchctx -> nmatches ++ ;
1077+
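1078+ /*
1079+ * check length of unmatched portion between end of previous match
1080+ * and start of current one. NOTE(review): prev_match_end is reassigned after every search iteration, apparently including matches skipped by the enclosing test, so this gap may be shorter than the text later fetched between two recorded matches, leaving conv_buf undersized -- TODO confirm and, if so, track the end of the previous recorded match separately.
1081+ */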
1082+ if (fetching_unmatched &&
1083+ pmatch [0 ].rm_so >= 0 &&
1084+ (pmatch [0 ].rm_so - prev_match_end ) > maxlen )
1085+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
10511086 }
10521087 prev_match_end = pmatch [0 ].rm_eo ;
10531088
@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10681103 break ;
10691104 }
10701105
1106+ /*
1107+ * check length of unmatched portion between end of last match and end of
1108+ * input string
1109+ */
1110+ if (fetching_unmatched &&
1111+ (wide_len - prev_match_end ) > maxlen )
1112+ maxlen = (wide_len - prev_match_end );
1113+
1114+ /*
1115+ * Keep a note of the end position of the string for the benefit of
1116+ * splitting code.
1117+ */
1118+ matchctx -> match_locs [array_idx ] = wide_len ;
1119+
1120+ if (eml > 1 )
1121+ {
1122+ int64 maxsiz = eml * (int64 ) maxlen ;
1123+ int conv_bufsiz ;
1124+
1125+ /*
1126+ * Make the conversion buffer large enough for any substring of
1127+ * interest.
1128+ *
1129+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1130+ * advantage of the fact that the original string length in bytes is an
1131+ * upper bound on the byte length of any fetched substring (and we know
1132+ * that len+1 is safe to allocate because the varlena header is longer
1133+ * than 1 byte).
1134+ */
1135+ if (maxsiz > orig_len )
1136+ conv_bufsiz = orig_len + 1 ;
1137+ else
1138+ conv_bufsiz = maxsiz + 1 ; /* safe since maxsiz < 2^30 */
1139+
1140+ matchctx -> conv_buf = palloc (conv_bufsiz );
1141+ matchctx -> conv_bufsiz = conv_bufsiz ;
1142+ matchctx -> wide_str = wide_str ;
1143+ }
1144+ else
1145+ {
1146+ /* No need to keep the wide string if we're in a single-byte charset. */
1147+ pfree (wide_str );
1148+ matchctx -> wide_str = NULL ;
1149+ matchctx -> conv_buf = NULL ;
1150+ matchctx -> conv_bufsiz = 0 ;
1151+ }
1152+
10711153 /* Clean up temp storage */
1072- pfree (wide_str );
10731154 pfree (pmatch );
10741155
10751156 return matchctx ;
10761157}
10771158
1078- /*
1079- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080- */
1081- static void
1082- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
1083- {
1084- pfree (matchctx -> orig_str );
1085- pfree (matchctx -> match_locs );
1086- if (matchctx -> elems )
1087- pfree (matchctx -> elems );
1088- if (matchctx -> nulls )
1089- pfree (matchctx -> nulls );
1090- pfree (matchctx );
1091- }
1092-
10931159/*
10941160 * build_regexp_match_result - build output array for current match
10951161 */
10961162static ArrayType *
10971163build_regexp_match_result (regexp_matches_ctx * matchctx )
10981164{
1165+ char * buf = matchctx -> conv_buf ;
1166+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
10991167 Datum * elems = matchctx -> elems ;
11001168 bool * nulls = matchctx -> nulls ;
11011169 int dims [1 ];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
11151183 elems [i ] = (Datum ) 0 ;
11161184 nulls [i ] = true;
11171185 }
1186+ else if (buf )
1187+ {
1188+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1189+ buf ,
1190+ eo - so );
1191+ Assert (len < bufsiz );
1192+ elems [i ] = PointerGetDatum (cstring_to_text_with_len (buf , len ));
1193+ nulls [i ] = false;
1194+ }
11181195 else
11191196 {
11201197 elems [i ] = DirectFunctionCall3 (text_substr ,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11681245 splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
11691246 & re_flags ,
11701247 PG_GET_COLLATION (),
1171- false, true);
1248+ false, true, true );
11721249
11731250 MemoryContextSwitchTo (oldcontext );
11741251 funcctx -> user_fctx = (void * ) splitctx ;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11851262 SRF_RETURN_NEXT (funcctx , result );
11861263 }
11871264
1188- /* release space in multi-call ctx to avoid intraquery memory leak */
1189- cleanup_regexp_matches (splitctx );
1190-
11911265 SRF_RETURN_DONE (funcctx );
11921266}
11931267
@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12241298 PG_GETARG_TEXT_PP (1 ),
12251299 & re_flags ,
12261300 PG_GET_COLLATION (),
1227- false, true);
1301+ false, true, true );
12281302
12291303 while (splitctx -> next_match <= splitctx -> nmatches )
12301304 {
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12361310 splitctx -> next_match ++ ;
12371311 }
12381312
1239- /*
1240- * We don't call cleanup_regexp_matches here; it would try to pfree the
1241- * input string, which we didn't copy. The space is not in a long-lived
1242- * memory context anyway.
1243- */
1244-
12451313 PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate , CurrentMemoryContext ));
12461314}
12471315
@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12611329static Datum
12621330build_regexp_split_result (regexp_matches_ctx * splitctx )
12631331{
1332+ char * buf = splitctx -> conv_buf ;
12641333 int startpos ;
12651334 int endpos ;
12661335
@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12711340 if (startpos < 0 )
12721341 elog (ERROR , "invalid match ending position" );
12731342
1274- if (splitctx -> next_match < splitctx -> nmatches )
1343+ if (buf )
12751344 {
1345+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1346+ int len ;
1347+
12761348 endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
12771349 if (endpos < startpos )
12781350 elog (ERROR , "invalid match starting position" );
1279- return DirectFunctionCall3 (text_substr ,
1280- PointerGetDatum (splitctx -> orig_str ),
1281- Int32GetDatum (startpos + 1 ),
1282- Int32GetDatum (endpos - startpos ));
1351+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1352+ buf ,
1353+ endpos - startpos );
1354+ Assert (len < bufsiz );
1355+ return PointerGetDatum (cstring_to_text_with_len (buf , len ));
12831356 }
12841357 else
12851358 {
1286- /* no more matches, return rest of string */
1287- return DirectFunctionCall2 (text_substr_no_len ,
1359+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1360+ if (endpos < startpos )
1361+ elog (ERROR , "invalid match starting position" );
1362+ return DirectFunctionCall3 (text_substr ,
12881363 PointerGetDatum (splitctx -> orig_str ),
1289- Int32GetDatum (startpos + 1 ));
1364+ Int32GetDatum (startpos + 1 ),
1365+ Int32GetDatum (endpos - startpos ));
12901366 }
12911367}
12921368
0 commit comments