3333 * Portions Copyright (c) 1994, Regents of the University of California
3434 *
3535 * IDENTIFICATION
36- * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.28 2009/01/01 17:23:55 momjian Exp $
36+ * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.29 2009/09/27 03:27:24 tgl Exp $
3737 *
3838 *-------------------------------------------------------------------------
3939 */
@@ -117,6 +117,7 @@ static void push_new_buffer(const char *newstr);
117117static YY_BUFFER_STATE prepare_buffer (const char *txt, int len,
118118 char **txtcopy);
119119static void emit (const char *txt, int len);
120+ static bool is_utf16_surrogate_first (uint32 c);
120121
121122#define ECHO emit (yytext, yyleng)
122123
@@ -158,6 +159,7 @@ static void emit(const char *txt, int len);
158159 * <xdolq> $foo$ quoted strings
159160 * <xui> quoted identifier with Unicode escapes
160161 * <xus> quoted string with Unicode escapes
162+ * <xeu> Unicode surrogate pair in extended quoted string
161163 */
162164
163165%x xb
@@ -169,6 +171,7 @@ static void emit(const char *txt, int len);
169171%x xdolq
170172%x xui
171173%x xus
174+ %x xeu
172175/* Additional exclusive states for psql only: lex backslash commands */
173176%x xslashcmd
174177%x xslasharg
@@ -192,6 +195,9 @@ static void emit(const char *txt, int len);
192195 * did not end with a newline.
193196 *
194197 * XXX perhaps \f (formfeed) should be treated as a newline as well?
198+ *
199+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
200+ * to agree, and see also the plpgsql lexer.
195201 */
196202
197203space [ \t\n\r\f]
@@ -253,6 +259,8 @@ xeinside [^\\']+
253259xeescape [\\ ][^0-7]
254260xeoctesc [\\ ][0-7]{1,3}
255261xehexesc [\\ ]x[0-9A-Fa-f]{1,2}
/*
 * Unicode escapes inside extended quoted strings.
 *
 * xeunicode matches a complete escape: the introducer character class,
 * then either 'u' followed by exactly 4 hex digits or 'U' followed by
 * exactly 8 hex digits.
 *
 * xeunicodefail matches a truncated escape: the same introducer, then 'u'
 * with 0-3 hex digits or 'U' with 0-7 hex digits.
 *
 * NOTE(review): the introducer class is written here as backslash plus
 * space; the space may be an extraction artifact -- confirm against the
 * upstream file.
 */
262+ xeunicode [\\ ](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
263+ xeunicodefail [\\ ](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
256264
257265/* Extended quote
258266 * xqdouble implements embedded quote, ' ' ' '
@@ -334,6 +342,10 @@ identifier {ident_start}{ident_cont}*
334342
335343typecast " ::"
336344
345+ /* these two token types are used by PL/pgsql, though not in core SQL */
346+ dot_dot \.\.
347+ colon_equals " :="
348+
337349/*
338350 * "self" is the set of chars that should be returned as single-character
339351 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
@@ -511,6 +523,22 @@ other .
511523<xe>{xeinside} {
512524 ECHO;
513525 }
526+ <xe>{xeunicode} { /* complete Unicode escape inside an extended quoted string */
527+ uint32 c = strtoul (yytext+2 , NULL , 16 ); /* skip the introducer and the u/U letter, then parse the hex digits */
528+
529+ if (is_utf16_surrogate_first (c))
530+ BEGIN (xeu); /* only this BEGIN is guarded by the if: a first surrogate switches to <xeu> */
531+ ECHO; /* unconditional: the matched text is always emitted unchanged */
532+ }
533+ <xeu>{xeunicode} { /* another complete escape while in <xeu> */
534+ BEGIN (xe); /* return to the ordinary extended-string state */
535+ ECHO;
536+ }
	/*
	 * Any other single character or a newline seen in <xeu> is echoed.
	 * NOTE(review): among the rules visible here, these two and the
	 * xeunicodefail rule below do not leave <xeu>; only a full
	 * {xeunicode} match switches back to <xe>.  Confirm that is intended.
	 */
537+ <xeu>. { ECHO; }
538+ <xeu>\n { ECHO; }
539+ <xe,xeu>{xeunicodefail} { /* truncated Unicode escape, in either state */
540+ ECHO; /* echoed as-is; no state change */
541+ }
514542<xe>{xeescape} {
515543 ECHO;
516544 }
@@ -605,6 +633,14 @@ other .
605633 ECHO;
606634 }
607635
636+ {dot_dot} { /* the two-character ".." token */
637+ ECHO; /* no special handling: emit the matched text unchanged */
638+ }
639+
640+ {colon_equals} { /* the ":=" token */
641+ ECHO; /* no special handling: emit the matched text unchanged */
642+ }
643+
608644 /*
609645 * These rules are specific to psql --- they implement parenthesis
610646 * counting and detection of command-ending semicolon. These must
@@ -1690,3 +1726,9 @@ emit(const char *txt, int len)
16901726 }
16911727 }
16921728}
1729+
/*
 * is_utf16_surrogate_first
 *
 * Returns true when c lies in the inclusive range 0xD800..0xDBFF, the code
 * points that form the first half of a UTF-16 surrogate pair; returns false
 * for every other value.
 */
1730+ static bool
1731+ is_utf16_surrogate_first (uint32 c)
1732+ {
1733+ return (c >= 0xD800 && c <= 0xDBFF );
1734+ }
0 commit comments