postgrespro
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
Lines changed: 3 additions & 3 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
Lines changed: 2 additions & 2 deletions
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
Lines changed: 191 additions & 11 deletions
@@ -78,7 +78,7 @@ size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen);
+	return unicode_strlower(dest, destsize, src, srclen, false);
 }
 
 size_t
@@ -93,15 +93,15 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.prev_alnum = false,
 	};
 
-	return unicode_strtitle(dest, destsize, src, srclen,
+	return unicode_strtitle(dest, destsize, src, srclen, false,
 							initcap_wbnext, &wbstate);
 }
 
 size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen);
+	return unicode_strupper(dest, destsize, src, srclen, false);	/* NOTE(review): new 5th argument is false here while case_test.c passes true; presumably false keeps the previous mapping -- confirm */
 }
 
 pg_locale_t
 
@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 unicode_version.h: generate-unicode_version.pl
@@ -91,4 +91,4 @@ clean:
 	rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
 
 distclean: clean
-	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
+	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
@@ -18,12 +18,61 @@
 #include <wctype.h>
 
 #ifdef USE_ICU
+#include <unicode/ucasemap.h>
 #include <unicode/uchar.h>
 #endif
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
 
+/* enough to hold largest source or result string, including NUL */
+#define BUFSZ 256
+
+#ifdef USE_ICU
+static UCaseMap * casemap = NULL;
+#endif
+
+typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
+							ssize_t srclen);
+
+/* simple boundary iterator copied from pg_locale_builtin.c */
+struct WordBoundaryState
+{
+	const char *str;			/* UTF-8 text being scanned */
+	size_t		len;			/* scanning stops once offset reaches len */
+	size_t		offset;			/* byte offset of the next character to examine */
+	bool		init;			/* false until the first boundary has been returned */
+	bool		prev_alnum;		/* alnum class of the character at the last boundary */
+};
+
+/*
+ * Word-boundary callback handed to unicode_strtitle().
+ *
+ * Scans forward from the saved offset and returns the byte offset of the
+ * next character whose alphanumeric class differs from that of the last
+ * boundary; the very first character examined always counts as a boundary.
+ * Returns the state's len once offset reaches len or a NUL byte is seen.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;
+
+	while (wb->offset < wb->len && wb->str[wb->offset] != '\0')
+	{
+		size_t		start = wb->offset;
+		pg_wchar	cp = utf8_to_unicode((unsigned char *) wb->str + start);
+		bool		is_alnum = pg_u_isalnum(cp, true);
+		bool		is_boundary = !wb->init || is_alnum != wb->prev_alnum;
+
+		/* always step past this character, boundary or not */
+		wb->offset = start + unicode_utf8len(cp);
+
+		if (is_boundary)
+		{
+			wb->init = true;
+			wb->prev_alnum = is_alnum;
+			return start;
+		}
+	}
+
+	return wb->len;
+}
+
 #ifdef USE_ICU
 
 static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
 	}
 }
 
+/*
+ * Abort the test if an ICU case-mapping call failed, or filled the output
+ * buffer without leaving room for the terminating NUL.
+ */
+static void
+icu_check_status(const char *funcname, const char *str, UErrorCode status)
+{
+	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+	{
+		printf("case_test: str='%s' %s failed: %s\n", str, funcname,
+			   u_errorName(status));
+		exit(1);
+	}
+}
+
+/*
+ * Abort the test if a unicode_str*() result cannot fit in BUFSZ bytes
+ * together with its terminating NUL.  The return value is treated as the
+ * result length without the NUL, which is how test_convert() compares it
+ * against strlen(expected).
+ */
+static void
+builtin_check_size(const char *funcname, const char *str, size_t needed)
+{
+	if (needed >= BUFSZ)
+	{
+		printf("case_test: str='%s' %s needs %zu bytes, buffer is %d\n",
+			   str, funcname, needed, BUFSZ);
+		exit(1);
+	}
+}
+
+/*
+ * Convert str to lower, title and upper case with both the built-in
+ * unicode_str*() functions (called with true for their flag argument) and
+ * ICU's UCaseMap API, then exit with a diagnostic if any pair of results
+ * differs.
+ *
+ * Conversion failures and oversized results are reported up front instead
+ * of being compared, because the buffers would not hold valid C strings.
+ */
+static void
+icu_test_full(char *str)
+{
+	char		lower[BUFSZ];
+	char		title[BUFSZ];
+	char		upper[BUFSZ];
+	char		icu_lower[BUFSZ];
+	char		icu_title[BUFSZ];
+	char		icu_upper[BUFSZ];
+	UErrorCode	status;
+	struct WordBoundaryState wbstate = {
+		.str = str,
+		.len = strlen(str),
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+
+	builtin_check_size("unicode_strlower", str,
+					   unicode_strlower(lower, BUFSZ, str, -1, true));
+	builtin_check_size("unicode_strtitle", str,
+					   unicode_strtitle(title, BUFSZ, str, -1, true,
+										initcap_wbnext, &wbstate));
+	builtin_check_size("unicode_strupper", str,
+					   unicode_strupper(upper, BUFSZ, str, -1, true));
+
+	/* ICU only writes status on error, so reset it before every call */
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToLower", str, status);
+
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToTitle", str, status);
+
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+	icu_check_status("ucasemap_utf8ToUpper", str, status);
+
+	if (strcmp(lower, icu_lower) != 0)
+	{
+		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
+			   icu_lower);
+		exit(1);
+	}
+	if (strcmp(title, icu_title) != 0)
+	{
+		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
+			   icu_title);
+		exit(1);
+	}
+	if (strcmp(upper, icu_upper) != 0)
+	{
+		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
+			   icu_upper);
+		exit(1);
+	}
+}
+
 /*
  * Exhaustively compare case mappings with the results from ICU.
  */
@@ -64,6 +161,7 @@ test_icu(void)
 		if (category != PG_U_UNASSIGNED)
 		{
 			uint8_t		icu_category = u_charType(code);
+			char		code_str[5] = {0};
 
 			if (icu_category == PG_U_UNASSIGNED)
 			{
@@ -72,6 +170,9 @@ test_icu(void)
 			}
 
 			icu_test_simple(code);
+			unicode_to_utf8(code, (unsigned char *) code_str);
+			icu_test_full(code_str);
+
 			successful++;
 		}
 	}
@@ -86,7 +187,7 @@ test_icu(void)
 #endif
 
 static void
-test_strlower(const char *test_string, const char *expected)
+test_convert(TestFunc tfunc, const char *test_string, const char *expected)
 {
 	size_t		src1len = strlen(test_string);
 	size_t		src2len = -1;	/* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* neither source nor destination are NUL-terminated */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src1, src1len);
+	needed = tfunc(dst1, dst1len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* destination is NUL-terminated and source is not */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src1, src1len);
+	needed = tfunc(dst2, dst2len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* source is NUL-terminated and destination is not */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src2, src2len);
+	needed = tfunc(dst1, dst1len, src2, src2len);
 	if (needed != strlen(expected))
 	{
+		printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
 		exit(1);
 	}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
 
 	/* both source and destination are NUL-terminated */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src2, src2len);
+	needed = tfunc(dst2, dst2len, src2, src2len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
 	free(dst2);
 }
 
+/*
+ * TestFunc adapter: lowercase src into dst through unicode_strlower(),
+ * passing true as its final argument, and hand back its return value.
+ */
+static size_t
+tfunc_lower(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	size_t		needed;
+
+	needed = unicode_strlower(dst, dstsize, src, srclen, true);
+	return needed;
+}
+
+/*
+ * TestFunc adapter: titlecase src into dst through unicode_strtitle(),
+ * passing true as its flag argument and using initcap_wbnext() with a fresh
+ * iterator state to locate word boundaries.
+ *
+ * When the caller passes a negative srclen for a NUL-terminated source, the
+ * state's size_t len becomes a very large value; initcap_wbnext() still
+ * stops because it also checks for the NUL byte.
+ */
+static size_t
+tfunc_title(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	size_t		needed;
+	struct WordBoundaryState boundaries = {
+		.str = src,
+		.len = srclen,
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+
+	needed = unicode_strtitle(dst, dstsize, src, srclen, true,
+							  initcap_wbnext, &boundaries);
+	return needed;
+}
+
+/*
+ * TestFunc adapter: uppercase src into dst through unicode_strupper(),
+ * passing true as its final argument, and hand back its return value.
+ */
+static size_t
+tfunc_upper(char *dst, size_t dstsize, const char *src,
+			ssize_t srclen)
+{
+	size_t		needed;
+
+	needed = unicode_strupper(dst, dstsize, src, srclen, true);
+	return needed;
+}
+
+
 static void
 test_convert_case()
 {
 	/* test string with no case changes */
-	test_strlower("√∞", "√∞");
+	test_convert(tfunc_lower, "√∞", "√∞");
+	/* test adjust-to-cased behavior: the digit-led word "123xyz" must come back unchanged */
+	test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
 	/* test string with case changes */
-	test_strlower("ABC", "abc");
+	test_convert(tfunc_upper, "abc", "ABC");
 	/* test string with case changes and byte length changes */
-	test_strlower("ȺȺȺ", "ⱥⱥⱥ");
+	test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
+	/* test special case conversions: one-to-many mappings and dotted/dotless i */
+	test_convert(tfunc_upper, "ß", "SS");
+	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
+	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+	/* test final sigma: a word-final sigma is expected to lowercase to ς, others to σ */
+	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
+	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
+	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+
+#ifdef USE_ICU
+	icu_test_full("");			/* empty input */
+	icu_test_full("ȺȺȺ");
+	icu_test_full("ßßß");
+	icu_test_full("√∞");
+	icu_test_full("a b");
+	icu_test_full("abc 123xyz");
+	icu_test_full("σςΣ ΣΣΣ");
+	icu_test_full("ıiIİ");
+	/* test <alpha><iota_subscript><acute> */
+	icu_test_full("\u0391\u0345\u0301");
+#endif
 
 	printf("case_test: convert_case: success\n");
 }
 
 int
 main(int argc, char **argv)
 {
+#ifdef USE_ICU
+	UErrorCode	status = U_ZERO_ERROR;
+
+	/*
+	 * Disable ICU's word break adjustment for titlecase to match the expected
+	 * behavior of unicode_strtitle().
+	 */
+	casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: failure opening UCaseMap: %s\n",
+			   u_errorName(status));
+		exit(1);
+	}
+#endif
+
 	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
 #ifdef USE_ICU
 	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
 #endif
 
 	test_convert_case();
+
+#ifdef USE_ICU
+	ucasemap_close(casemap);
+#endif
 	exit(0);
 }
Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ size_t`
`78`	`78`	`strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
`79`	`79`	`pg_locale_t locale)`
`80`	`80`	`{`
`81`		`- return unicode_strlower(dest, destsize, src, srclen);`
	`81`	`+ return unicode_strlower(dest, destsize, src, srclen, false);`
`82`	`82`	`}`
`83`	`83`
`84`	`84`	`size_t`
`@@ -93,15 +93,15 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
`93`	`93`	`.prev_alnum = false,`
`94`	`94`	`};`
`95`	`95`
`96`		`- return unicode_strtitle(dest, destsize, src, srclen,`
	`96`	`+ return unicode_strtitle(dest, destsize, src, srclen, false,`
`97`	`97`	`initcap_wbnext, &wbstate);`
`98`	`98`	`}`
`99`	`99`
`100`	`100`	`size_t`
`101`	`101`	`strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
`102`	`102`	`pg_locale_t locale)`
`103`	`103`	`{`
`104`		`- return unicode_strupper(dest, destsize, src, srclen);`
	`104`	`+ return unicode_strupper(dest, destsize, src, srclen, false);`
`105`	`105`	`}`
`106`	`106`
`107`	`107`	`pg_locale_t`