2424 * Portions Copyright (c) 1994, Regents of the University of California
2525 *
2626 * IDENTIFICATION
27- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
27+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
2828 *
2929 *-------------------------------------------------------------------------
3030 */
@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
8080static char *litbufdup (base_yyscan_t yyscanner);
8181static char *litbuf_udeescape (unsigned char escape, base_yyscan_t yyscanner);
8282static unsigned char unescape_single_char (unsigned char c, base_yyscan_t yyscanner);
83+ static bool is_utf16_surrogate_first (pg_wchar c);
84+ static bool is_utf16_surrogate_second (pg_wchar c);
85+ static pg_wchar surrogate_pair_to_codepoint (pg_wchar first, pg_wchar second);
8386
/*
 * Shorthand for reporting a scanner error.  The expansion references a
 * variable named "yyscanner", so it can only be used where one is in scope.
 * There must be no whitespace between the macro name and its parameter
 * list, or the preprocessor treats this as an object-like macro.
 */
#define yyerror(msg)  scanner_yyerror(msg, yyscanner)
8588
@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
97100extern int base_yyget_column (yyscan_t yyscanner);
98101extern void base_yyset_column (int column_no, yyscan_t yyscanner);
99102
103+ static void addunicode (pg_wchar c, yyscan_t yyscanner);
104+
100105%}
101106
102107%option reentrant
@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
134139 * <xdolq> $foo$ quoted strings
135140 * <xui> quoted identifier with Unicode escapes
136141 * <xus> quoted string with Unicode escapes
142+ * <xeu> Unicode surrogate pair in extended quoted string
137143 */
138144
139145%x xb
@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
145151%x xdolq
146152%x xui
147153%x xus
154+ %x xeu
148155
149156/*
150157 * In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +230,8 @@ xeinside [^\\']+
/*
 * Backslash escapes recognized inside extended quoted strings (<xe> state).
 *
 * xeunicode matches a complete \uXXXX (4 hex digits) or \UXXXXXXXX
 * (8 hex digits) escape.  xeunicodebad matches a bare \u or \U; because
 * flex prefers the longest match, it only wins when the required number of
 * hex digits does not follow.
 */
xeescape		[\\][^0-7]
xeoctesc		[\\][0-7]{1,3}
xehexesc		[\\]x[0-9A-Fa-f]{1,2}
xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodebad	[\\]([uU])
226235
227236/* Extended quote
228237 * xqdouble implements embedded quote, ''''
@@ -535,6 +544,45 @@ other .
<xe>{xeinside}  {
					/* run of characters needing no escape processing: copy as-is */
					addlit(yytext, yyleng, yyscanner);
				}
<xe>{xeunicode} {
					/*
					 * Complete \uXXXX or \UXXXXXXXX escape.  yytext+2 skips
					 * the backslash and the 'u'/'U', leaving only hex digits.
					 */
					pg_wchar c = strtoul(yytext+2, NULL, 16);

					check_escape_warning(yyscanner);

					if (is_utf16_surrogate_first(c))
					{
						/*
						 * First half of a UTF-16 surrogate pair: remember it
						 * and switch to <xeu>, which accepts only another
						 * Unicode escape supplying the second half.
						 */
						yyextra->utf16_first_part = c;
						BEGIN(xeu);
					}
					else if (is_utf16_surrogate_second(c))
					{
						/* second half with no preceding first half */
						yyerror("invalid Unicode surrogate pair");
					}
					else
						addunicode(c, yyscanner);
				}
<xeu>{xeunicode} {
					/*
					 * We are immediately after the first half of a surrogate
					 * pair (saved in yyextra->utf16_first_part); this escape
					 * must supply the second half.
					 */
					pg_wchar c = strtoul(yytext+2, NULL, 16);

					/*
					 * NOTE(review): the code below relies on yyerror() not
					 * returning -- confirm against scanner_yyerror().
					 */
					if (!is_utf16_surrogate_second(c))
						yyerror("invalid Unicode surrogate pair");

					/* combine both halves into a single code point */
					c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);

					addunicode(c, yyscanner);

					/* pair complete: resume normal extended-string scanning */
					BEGIN(xe);
				}
<xeu>.			|
<xeu>\n			|
<xeu><<EOF>>	{
					/*
					 * Any other character, a newline, or end of input while
					 * waiting for the second surrogate half is an error.
					 */
					yyerror("invalid Unicode surrogate pair");
				}
577+
<xe>{xeunicodebad} {
					/*
					 * \u or \U not followed by the required number of hex
					 * digits.  This rule must stay ahead of the {xeescape}
					 * rule: both match the same two characters, and flex
					 * breaks ties in favor of the earlier rule.
					 */
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
							 errmsg("invalid Unicode escape"),
							 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
							 lexer_errposition()));
				}
585+
538586<xe >{xeescape } {
539587 if (yytext[1 ] == ' \' ' )
540588 {
@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
13301378 if (ptr)
13311379 pfree (ptr);
13321380}
1381+
1382+ static void
1383+ addunicode (pg_wchar c, base_yyscan_t yyscanner)
1384+ {
1385+ char buf[8 ];
1386+
1387+ if (c == 0 || c > 0x10FFFF )
1388+ yyerror (" invalid Unicode escape value" );
1389+ if (c > 0x7F )
1390+ {
1391+ if (GetDatabaseEncoding () != PG_UTF8)
1392+ yyerror (" Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8" );
1393+ yyextra->saw_non_ascii = true ;
1394+ }
1395+ unicode_to_utf8 (c, (unsigned char *)buf);
1396+ addlit (buf, pg_mblen (buf), yyscanner);
1397+ }
1398+
0 commit comments