2424 * Portions Copyright (c) 1994, Regents of the University of California
2525 *
2626 * IDENTIFICATION
27- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
27+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
2828 *
2929 *-------------------------------------------------------------------------
3030 */
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
10971097 }
10981098}
10991099
1100+ static bool
1101+ is_utf16_surrogate_first (pg_wchar c)
1102+ {
1103+ return (c >= 0xD800 && c <= 0xDBFF );
1104+ }
1105+
1106+ static bool
1107+ is_utf16_surrogate_second (pg_wchar c)
1108+ {
1109+ return (c >= 0xDC00 && c <= 0xDFFF );
1110+ }
1111+
1112+ static pg_wchar
1113+ surrogate_pair_to_codepoint (pg_wchar first, pg_wchar second)
1114+ {
1115+ return ((first & 0x3FF ) << 10 ) + 0x10000 + (second & 0x3FF );
1116+ }
1117+
11001118static char *
11011119litbuf_udeescape (unsigned char escape, base_yyscan_t yyscanner)
11021120{
11031121 char *new ;
11041122 char *litbuf, *in, *out;
1123+ pg_wchar pair_first = 0 ;
11051124
11061125 if (isxdigit (escape)
11071126 || escape == ' +'
@@ -1131,16 +1150,39 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11311150 {
11321151 if (in[1 ] == escape)
11331152 {
1153+ if (pair_first)
1154+ {
1155+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1156+ yyerror (" invalid Unicode surrogate pair" );
1157+ }
11341158 *out++ = escape;
11351159 in += 2 ;
11361160 }
11371161 else if (isxdigit (in[1 ]) && isxdigit (in[2 ]) && isxdigit (in[3 ]) && isxdigit (in[4 ]))
11381162 {
11391163 pg_wchar unicode = hexval (in[1 ]) * 16 *16 *16 + hexval (in[2 ]) * 16 *16 + hexval (in[3 ]) * 16 + hexval (in[4 ]);
11401164 check_unicode_value (unicode, in, yyscanner);
1141- unicode_to_utf8 (unicode, (unsigned char *) out);
1165+ if (pair_first)
1166+ {
1167+ if (is_utf16_surrogate_second (unicode))
1168+ {
1169+ unicode = surrogate_pair_to_codepoint (pair_first, unicode);
1170+ pair_first = 0 ;
1171+ }
1172+ else
1173+ {
1174+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1175+ yyerror (" invalid Unicode surrogate pair" );
1176+ }
1177+ }
1178+ if (is_utf16_surrogate_first (unicode))
1179+ pair_first = unicode;
1180+ else
1181+ {
1182+ unicode_to_utf8 (unicode, (unsigned char *) out);
1183+ out += pg_mblen (out);
1184+ }
11421185 in += 5 ;
1143- out += pg_mblen (out);
11441186 }
11451187 else if (in[1 ] == ' +'
11461188 && isxdigit (in[2 ]) && isxdigit (in[3 ])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11501192 pg_wchar unicode = hexval (in[2 ]) * 16 *16 *16 *16 *16 + hexval (in[3 ]) * 16 *16 *16 *16 + hexval (in[4 ]) * 16 *16 *16
11511193 + hexval (in[5 ]) * 16 *16 + hexval (in[6 ]) * 16 + hexval (in[7 ]);
11521194 check_unicode_value (unicode, in, yyscanner);
1153- unicode_to_utf8 (unicode, (unsigned char *) out);
1195+ if (pair_first)
1196+ {
1197+ if (is_utf16_surrogate_second (unicode))
1198+ {
1199+ unicode = surrogate_pair_to_codepoint (pair_first, unicode);
1200+ pair_first = 0 ;
1201+ }
1202+ else
1203+ {
1204+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1205+ yyerror (" invalid Unicode surrogate pair" );
1206+ }
1207+ }
1208+ if (is_utf16_surrogate_first (unicode))
1209+ pair_first = unicode;
1210+ else
1211+ {
1212+ unicode_to_utf8 (unicode, (unsigned char *) out);
1213+ out += pg_mblen (out);
1214+ }
11541215 in += 8 ;
1155- out += pg_mblen (out);
11561216 }
11571217 else
11581218 {
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11611221 }
11621222 }
11631223 else
1224+ {
1225+ if (pair_first)
1226+ {
1227+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1228+ yyerror (" invalid Unicode surrogate pair" );
1229+ }
11641230 *out++ = *in++;
1231+ }
11651232 }
11661233
11671234 *out = ' \0 ' ;
0 commit comments