1818#include <wctype.h>
1919
2020#ifdef USE_ICU
21+ #include <unicode/ucasemap.h>
2122#include <unicode/uchar.h>
2223#endif
2324#include "common/unicode_case.h"
2425#include "common/unicode_category.h"
2526#include "common/unicode_version.h"
2627
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256

#ifdef USE_ICU
/* ICU case map used by the ICU comparison tests; opened in main() */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature for the conversion wrappers handed to test_convert():
 * convert src (srclen bytes, or NUL-terminated when srclen is negative) into
 * dst (capacity dstsize) and return a size_t that test_convert() compares
 * against strlen() of the expected result.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
37+
/*
 * State for the simple word boundary iterator (copied from
 * pg_locale_builtin.c) that initcap_wbnext() advances.
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* scan limit in bytes; also returned when
								 * no further boundary exists */
	size_t		offset;			/* byte offset of the next character to
								 * examine */
	bool		init;			/* false until the first boundary has been
								 * reported */
	bool		prev_alnum;		/* alphanumeric class recorded at the most
								 * recently reported boundary */
};
47+
48+ static size_t
49+ initcap_wbnext (void * state )
50+ {
51+ struct WordBoundaryState * wbstate = (struct WordBoundaryState * ) state ;
52+
53+ while (wbstate -> offset < wbstate -> len &&
54+ wbstate -> str [wbstate -> offset ] != '\0' )
55+ {
56+ pg_wchar u = utf8_to_unicode ((unsigned char * ) wbstate -> str +
57+ wbstate -> offset );
58+ bool curr_alnum = pg_u_isalnum (u , true);
59+
60+ if (!wbstate -> init || curr_alnum != wbstate -> prev_alnum )
61+ {
62+ size_t prev_offset = wbstate -> offset ;
63+
64+ wbstate -> init = true;
65+ wbstate -> offset += unicode_utf8len (u );
66+ wbstate -> prev_alnum = curr_alnum ;
67+ return prev_offset ;
68+ }
69+
70+ wbstate -> offset += unicode_utf8len (u );
71+ }
72+
73+ return wbstate -> len ;
74+ }
75+
2776#ifdef USE_ICU
2877
2978static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
4897 }
4998}
5099
100+ static void
101+ icu_test_full (char * str )
102+ {
103+ char lower [BUFSZ ];
104+ char title [BUFSZ ];
105+ char upper [BUFSZ ];
106+ char icu_lower [BUFSZ ];
107+ char icu_title [BUFSZ ];
108+ char icu_upper [BUFSZ ];
109+ UErrorCode status ;
110+ struct WordBoundaryState wbstate = {
111+ .str = str ,
112+ .len = strlen (str ),
113+ .offset = 0 ,
114+ .init = false,
115+ .prev_alnum = false,
116+ };
117+
118+ unicode_strlower (lower , BUFSZ , str , -1 , true);
119+ unicode_strtitle (title , BUFSZ , str , -1 , true, initcap_wbnext , & wbstate );
120+ unicode_strupper (upper , BUFSZ , str , -1 , true);
121+ status = U_ZERO_ERROR ;
122+ ucasemap_utf8ToLower (casemap , icu_lower , BUFSZ , str , -1 , & status );
123+ status = U_ZERO_ERROR ;
124+ ucasemap_utf8ToTitle (casemap , icu_title , BUFSZ , str , -1 , & status );
125+ status = U_ZERO_ERROR ;
126+ ucasemap_utf8ToUpper (casemap , icu_upper , BUFSZ , str , -1 , & status );
127+
128+ if (strcmp (lower , icu_lower ) != 0 )
129+ {
130+ printf ("case_test: str='%s' lower='%s' icu_lower='%s'\n" , str , lower ,
131+ icu_lower );
132+ exit (1 );
133+ }
134+ if (strcmp (title , icu_title ) != 0 )
135+ {
136+ printf ("case_test: str='%s' title='%s' icu_title='%s'\n" , str , title ,
137+ icu_title );
138+ exit (1 );
139+ }
140+ if (strcmp (upper , icu_upper ) != 0 )
141+ {
142+ printf ("case_test: str='%s' upper='%s' icu_upper='%s'\n" , str , upper ,
143+ icu_upper );
144+ exit (1 );
145+ }
146+ }
147+
51148/*
52149 * Exhaustively compare case mappings with the results from ICU.
53150 */
@@ -64,6 +161,7 @@ test_icu(void)
64161 if (category != PG_U_UNASSIGNED )
65162 {
66163 uint8_t icu_category = u_charType (code );
164+ char code_str [5 ] = {0 };
67165
68166 if (icu_category == PG_U_UNASSIGNED )
69167 {
@@ -72,6 +170,9 @@ test_icu(void)
72170 }
73171
74172 icu_test_simple (code );
173+ unicode_to_utf8 (code , (unsigned char * ) code_str );
174+ icu_test_full (code_str );
175+
75176 successful ++ ;
76177 }
77178 }
@@ -86,7 +187,7 @@ test_icu(void)
86187#endif
87188
88189static void
89- test_strlower ( const char * test_string , const char * expected )
190+ test_convert ( TestFunc tfunc , const char * test_string , const char * expected )
90191{
91192 size_t src1len = strlen (test_string );
92193 size_t src2len = -1 ; /* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102203
103204 /* neither source nor destination are NUL-terminated */
104205 memset (dst1 , 0x7F , dst1len );
105- needed = unicode_strlower (dst1 , dst1len , src1 , src1len );
206+ needed = tfunc (dst1 , dst1len , src1 , src1len );
106207 if (needed != strlen (expected ))
107208 {
108- printf ("case_test: convert_case test1 FAILURE: needed %zu\n" , needed );
209+ printf ("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n" ,
210+ test_string , needed , strlen (expected ));
109211 exit (1 );
110212 }
111213 if (memcmp (dst1 , expected , dst1len ) != 0 )
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117219
118220 /* destination is NUL-terminated and source is not */
119221 memset (dst2 , 0x7F , dst2len );
120- needed = unicode_strlower (dst2 , dst2len , src1 , src1len );
222+ needed = tfunc (dst2 , dst2len , src1 , src1len );
121223 if (needed != strlen (expected ))
122224 {
123- printf ("case_test: convert_case test2 FAILURE: needed %zu\n" , needed );
225+ printf ("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n" ,
226+ test_string , needed , strlen (expected ));
124227 exit (1 );
125228 }
126229 if (strcmp (dst2 , expected ) != 0 )
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132235
133236 /* source is NUL-terminated and destination is not */
134237 memset (dst1 , 0x7F , dst1len );
135- needed = unicode_strlower (dst1 , dst1len , src2 , src2len );
238+ needed = tfunc (dst1 , dst1len , src2 , src2len );
136239 if (needed != strlen (expected ))
137240 {
241+ printf ("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n" ,
242+ test_string , needed , strlen (expected ));
138243 printf ("case_test: convert_case test3 FAILURE: needed %zu\n" , needed );
139244 exit (1 );
140245 }
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147252
148253 /* both source and destination are NUL-terminated */
149254 memset (dst2 , 0x7F , dst2len );
150- needed = unicode_strlower (dst2 , dst2len , src2 , src2len );
255+ needed = tfunc (dst2 , dst2len , src2 , src2len );
151256 if (needed != strlen (expected ))
152257 {
153- printf ("case_test: convert_case test4 FAILURE: needed %zu\n" , needed );
258+ printf ("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n" ,
259+ test_string , needed , strlen (expected ));
154260 exit (1 );
155261 }
156262 if (strcmp (dst2 , expected ) != 0 )
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166272 free (dst2 );
167273}
168274
/*
 * TestFunc adapter for unicode_strlower().
 *
 * Forwards its arguments unchanged and passes "true" as the final argument
 * (presumably selecting full case mapping -- confirm against
 * unicode_case.h).
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed = unicode_strlower(dst, dstsize, src, srclen, true);

	return needed;
}
281+
282+ static size_t
283+ tfunc_title (char * dst , size_t dstsize , const char * src ,
284+ ssize_t srclen )
285+ {
286+ struct WordBoundaryState wbstate = {
287+ .str = src ,
288+ .len = srclen ,
289+ .offset = 0 ,
290+ .init = false,
291+ .prev_alnum = false,
292+ };
293+
294+ return unicode_strtitle (dst , dstsize , src , srclen , true, initcap_wbnext ,
295+ & wbstate );
296+ }
297+
/*
 * TestFunc adapter for unicode_strupper().
 *
 * Forwards its arguments unchanged and passes "true" as the final argument
 * (presumably selecting full case mapping -- confirm against
 * unicode_case.h).
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed = unicode_strupper(dst, dstsize, src, srclen, true);

	return needed;
}
304+
305+
169306static void
170307test_convert_case ()
171308{
172309 /* test string with no case changes */
173- test_strlower ("√∞" , "√∞" );
310+ test_convert (tfunc_lower , "√∞" , "√∞" );
311+ /* test adjust-to-cased behavior */
312+ test_convert (tfunc_title , "abc 123xyz" , "Abc 123xyz" );
174313 /* test string with case changes */
175- test_strlower ( "ABC " , "abc " );
314+ test_convert ( tfunc_upper , "abc " , "ABC " );
176315 /* test string with case changes and byte length changes */
177- test_strlower ("ȺȺȺ" , "ⱥⱥⱥ" );
316+ test_convert (tfunc_lower , "ȺȺȺ" , "ⱥⱥⱥ" );
317+ /* test special case conversions */
318+ test_convert (tfunc_upper , "ß" , "SS" );
319+ test_convert (tfunc_lower , "ıiIİ" , "ıiii\u0307" );
320+ test_convert (tfunc_upper , "ıiIİ" , "IIIİ" );
321+ /* test final sigma */
322+ test_convert (tfunc_lower , "σςΣ ΣΣΣ" , "σςς σσς" );
323+ test_convert (tfunc_lower , "σς'Σ' ΣΣ'Σ'" , "σς'ς' σσ'ς'" );
324+ test_convert (tfunc_title , "σςΣ ΣΣΣ" , "Σςς Σσς" );
325+
326+ #ifdef USE_ICU
327+ icu_test_full ("" );
328+ icu_test_full ("ȺȺȺ" );
329+ icu_test_full ("ßßß" );
330+ icu_test_full ("√∞" );
331+ icu_test_full ("a b" );
332+ icu_test_full ("abc 123xyz" );
333+ icu_test_full ("σςΣ ΣΣΣ" );
334+ icu_test_full ("ıiIİ" );
335+ /* test <alpha><iota_subscript><acute> */
336+ icu_test_full ("\u0391\u0345\u0301" );
337+ #endif
178338
179339 printf ("case_test: convert_case: success\n" );
180340}
181341
182342int
183343main (int argc , char * * argv )
184344{
345+ #ifdef USE_ICU
346+ UErrorCode status = U_ZERO_ERROR ;
347+
348+ /*
349+ * Disable ICU's word break adjustment for titlecase to match the expected
350+ * behavior of unicode_strtitle().
351+ */
352+ casemap = ucasemap_open ("und" , U_TITLECASE_NO_BREAK_ADJUSTMENT , & status );
353+ if (U_FAILURE (status ))
354+ {
355+ printf ("case_test: failure opening UCaseMap: %s\n" ,
356+ u_errorName (status ));
357+ exit (1 );
358+ }
359+ #endif
360+
185361 printf ("case_test: Postgres Unicode version:\t%s\n" , PG_UNICODE_VERSION );
186362#ifdef USE_ICU
187363 printf ("case_test: ICU Unicode version:\t\t%s\n" , U_UNICODE_VERSION );
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191367#endif
192368
193369 test_convert_case ();
370+
371+ #ifdef USE_ICU
372+ ucasemap_close (casemap );
373+ #endif
194374 exit (0 );
195375}
0 commit comments