44 * scan.l
55 * lexical scanner for PostgreSQL
66 *
7- * XXX The rules in this file must be kept in sync with psql's lexer!!!
7+ * NOTE NOTE NOTE:
8+ *
9+ * The rules in this file must be kept in sync with psql's lexer!!!
10+ *
11+ * The rules are designed so that the scanner never has to backtrack,
12+ * in the sense that there is always a rule that can match the input
13+ * consumed so far (the rule action may internally throw back some input
14+ * with yyless(), however). As explained in the flex manual, this makes
15+ * for a useful speed increase --- about a third faster than a plain -CF
16+ * lexer, in simple testing. The extra complexity is mostly in the rules
17+ * for handling float numbers and continued string literals. If you change
18+ * the lexical rules, verify that you haven't broken the no-backtrack
19+ * property by running flex with the "-b" option and checking that the
20+ * resulting "lex.backup" file says that no backing up is needed.
21+ *
822 *
923 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
1024 * Portions Copyright (c) 1994, Regents of the University of California
1125 *
1226 * IDENTIFICATION
13- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $
27+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $
1428 *
1529 *-------------------------------------------------------------------------
1630 */
@@ -138,6 +152,20 @@ special_whitespace ({space}+|{comment}{newline})
138152horiz_whitespace ({horiz_space }| {comment })
139153whitespace_with_newline ({horiz_whitespace }* {newline }{special_whitespace }* )
140154
155+ /*
156+ * To ensure that {quotecontinue} can be scanned without having to back up
157+ * if the full pattern isn't matched, we include trailing whitespace in
158+ * {quotestop}. This matches all cases where {quotecontinue} fails to match,
159+ * except for {quote} followed by whitespace and just one "-" (not two,
160+ * which would start a {comment}). To cover that we have {quotefail}.
161+ * The actions for {quotestop} and {quotefail} must throw back characters
162+ * beyond the quote proper.
163+ */
164+ quote '
165+ quotestop {quote }{whitespace }*
166+ quotecontinue {quote }{whitespace_with_newline }{quote }
167+ quotefail {quote }{whitespace }* " -"
168+
141169/* Bit string
142170 * It is tempting to scan the string for only those characters
143171 * which are allowed. However, this leads to silently swallowed
@@ -148,43 +176,39 @@ whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
148176 * validate the contents.
149177 */
150178xbstart [bB ]{quote }
151- xbstop {quote }
152179xbinside [^ ' ]*
153- xbcat {quote }{whitespace_with_newline }{quote }
154180
155181/* Hexadecimal number
156182 */
157183xhstart [xX ]{quote }
158- xhstop {quote }
159184xhinside [^ ' ]*
160- xhcat {quote }{whitespace_with_newline }{quote }
161185
162186/* National character
163187 */
164188xnstart [nN ]{quote }
165189
166190/* Extended quote
167191 * xqdouble implements embedded quote
168- * xqcat allows strings to cross input lines
169192 */
170- quote '
171193xqstart {quote }
172- xqstop {quote }
173194xqdouble {quote }{quote }
174195xqinside [^ \\ ' ]+
175196xqescape [\\ ][^ 0 -7 ]
176197xqoctesc [\\ ][0 -7 ]{1,3 }
177- xqcat {quote }{whitespace_with_newline }{quote }
178198
179199/* $foo$ style quotes ("dollar quoting")
180200 * The quoted string starts with $foo$ where "foo" is an optional string
181201 * in the form of an identifier, except that it may not contain "$",
182202 * and extends to the first occurrence of an identical string.
183203 * There is *no* processing of the quoted text.
204+ *
205+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
206+ * fails to match its trailing "$".
184207 */
185208dolq_start [A -Za -z \200 -\377 _ ]
186209dolq_cont [A -Za -z \200 -\377 _0 -9 ]
187210dolqdelim \$ ({dolq_start }{dolq_cont }* )? \$
211+ dolqfailed \$ {dolq_start }{dolq_cont }*
188212dolqinside [^ $ ]+
189213
190214/* Double quote
@@ -242,12 +266,17 @@ operator {op_chars}+
242266
243267/* We no longer allow unary minus in numbers.
244268 * Instead we pass it separately to the parser, where it gets
245- * coerced via doNegate() -- Leon aug 20 1999
269+ * coerced via doNegate() -- Leon aug 20 1999
270+ *
271+ * {realfail1} and {realfail2} are added to prevent the need for scanner
272+ * backup when the {real} rule fails to match completely.
246273 */
247274
248275integer {digit }+
249276decimal (({digit }* \. {digit }+ )| ({digit }+ \. {digit }* ))
250- real ((({digit }* \. {digit }+ )| ({digit }+ \. {digit }* )| ({digit }+ ))([Ee ][-+ ]? {digit }+ ))
277+ real ({integer }| {decimal })[Ee ][-+ ]? {digit }+
278+ realfail1 ({integer }| {decimal })[Ee ]
279+ realfail2 ({integer }| {decimal })[Ee ][-+ ]
251280
252281param \$ {integer }
253282
@@ -310,6 +339,10 @@ other .
310339 /* ignore */
311340 }
312341
342+ <xc >\* + {
343+ /* ignore: a run of asterisks inside a block comment needs no action.
      * NOTE(review): presumably this rule exists so that asterisks which do
      * not end the comment always match some rule without scanner backup --
      * confirm against the comment-terminator rule, which is outside this view. */
344+ }
345+
313346<xc ><<EOF>> { yyerror (" unterminated /* comment" ); }
314347
315348{xbstart } {
@@ -324,7 +357,9 @@ other .
324357 startlit ();
325358 addlitchar (' b' );
326359 }
327- <xb >{xbstop } {
360+ <xb >{quotestop } |
361+ <xb >{quotefail } {
362+ yyless (1 );
328363 BEGIN (INITIAL);
329364 yylval.str = litbufdup ();
330365 return BCONST;
@@ -333,8 +368,8 @@ other .
333368<xb >{xbinside } {
334369 addlit (yytext, yyleng); /* {xbinside} is any run of non-quote characters; pass it to addlit() unfiltered (addlit is defined outside this view) */
335370 }
336- <xh >{xhcat } |
337- <xb >{xbcat } {
371+ <xh >{quotecontinue } |
372+ <xb >{quotecontinue } {
338373 /* ignore: {quotecontinue} is quote, whitespace containing a newline, quote -- the literal simply continues on a later line, so nothing is added */
339374 }
340375<xb ><<EOF>> { yyerror (" unterminated bit string literal" ); } /* input ended while the scanner was still in the xb state */
@@ -351,7 +386,9 @@ other .
351386 startlit ();
352387 addlitchar (' x' );
353388 }
354- <xh >{xhstop } {
389+ <xh >{quotestop } |
390+ <xh >{quotefail } {
391+ yyless (1 );
355392 BEGIN (INITIAL);
356393 yylval.str = litbufdup ();
357394 return XCONST;
@@ -365,13 +402,11 @@ other .
365402 */
366403 const ScanKeyword *keyword;
367404
368- /* This had better be a keyword! */
405+ yyless (1 ); /* eat only 'n' this time */
406+ /* nchar had better be a keyword! */
369407 keyword = ScanKeywordLookup (" nchar" );
370408 Assert (keyword != NULL );
371409 yylval.keyword = keyword->name ;
372- token_start = yytext;
373- BEGIN (xq);
374- startlit ();
375410 return keyword->value ;
376411 }
377412
@@ -380,7 +415,9 @@ other .
380415 BEGIN (xq);
381416 startlit ();
382417 }
383- <xq >{xqstop } {
418+ <xq >{quotestop } |
419+ <xq >{quotefail } {
420+ yyless (1 );
384421 BEGIN (INITIAL);
385422 yylval.str = litbufdup ();
386423 return SCONST;
@@ -398,7 +435,7 @@ other .
398435 unsigned char c = strtoul (yytext+1 , NULL , 8 );
399436 addlitchar (c);
400437 }
401- <xq >{xqcat } {
438+ <xq >{quotecontinue } {
402439 /* ignore: {quotecontinue} is quote, whitespace containing a newline, quote -- the string literal continues on a later line, so nothing is added */
403440 }
404441<xq >. {
@@ -413,6 +450,12 @@ other .
413450 BEGIN (xdolq);
414451 startlit ();
415452 }
453+ {dolqfailed } {
      /*
       * A "$" followed by identifier characters that never reaches a closing
       * "$" is not a dollar-quote delimiter. This rule matches that prefix so
       * the scanner does not have to back up out of {dolqdelim}.
       */
454+ /* throw back all but the initial "$" */
455+ yyless (1 );
456+ /* and treat it as {other}: the "$" character itself is the token returned */
457+ return yytext[0 ];
458+ }
416459<xdolq >{dolqdelim } {
417460 if (strcmp (yytext, dolqstart) == 0 )
418461 {
@@ -435,6 +478,9 @@ other .
435478<xdolq >{dolqinside } {
436479 addlit (yytext, yyleng); /* {dolqinside} is a run of characters containing no "$"; pass it to addlit() as-is */
437480 }
481+ <xdolq >{dolqfailed } {
482+ addlit (yytext, yyleng); /* inside dollar-quoted text, a would-be delimiter lacking its closing "$" is ordinary content; pass it to addlit() like any other text */
483+ }
438484<xdolq >. {
439485 /* This is only needed for $ inside the quoted text */
440486 addlitchar (yytext[0 ]);
@@ -576,6 +622,23 @@ other .
576622 yylval.str = pstrdup (yytext);
577623 return FCONST;
578624 }
625+ {realfail1 } {
626+ /*
627+ * The input looked like the start of {real} but stopped after the
      * exponent marker. Throw back the [Ee], and treat as {decimal}. Note
628+ * that it is possible the input is actually {integer},
629+ * but since this case will almost certainly lead to a
630+ * syntax error anyway, we don't bother to distinguish.
631+ */
632+ yyless (yyleng-1 ); /* keep everything except the trailing [Ee] */
633+ yylval.str = pstrdup (yytext); /* token value is a copy of the remaining text */
634+ return FCONST;
635+ }
636+ {realfail2 } {
637+ /* The input stopped after the exponent marker and its sign: throw back the [Ee][+-] and return the leading number text as FCONST, exactly as the {realfail1} action does */
638+ yyless (yyleng-2 ); /* keep everything except the trailing two characters */
639+ yylval.str = pstrdup (yytext); /* token value is a copy of the remaining text */
640+ return FCONST;
641+ }
579642
580643
581644{identifier } {
0 commit comments