postgrespro
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
Lines changed: 85 additions & 22 deletions
@@ -4,13 +4,27 @@
  * scan.l
  *	  lexical scanner for PostgreSQL
  *
- * XXX The rules in this file must be kept in sync with psql's lexer!!!
+ * NOTE NOTE NOTE:
+ *
+ * The rules in this file must be kept in sync with psql's lexer!!!
+ *
+ * The rules are designed so that the scanner never has to backtrack,
+ * in the sense that there is always a rule that can match the input
+ * consumed so far (the rule action may internally throw back some input
+ * with yyless(), however).  As explained in the flex manual, this makes
+ * for a useful speed increase --- about a third faster than a plain -CF
+ * lexer, in simple testing.  The extra complexity is mostly in the rules
+ * for handling float numbers and continued string literals.  If you change
+ * the lexical rules, verify that you haven't broken the no-backtrack
+ * property by running flex with the "-b" option and checking that the
+ * resulting "lex.backup" file says that no backing up is needed.
+ *
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -138,6 +152,20 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 
+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}).  To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back characters
+ * beyond the quote proper.
+ */
+quote			'
+quotestop		{quote}{whitespace}*
+quotecontinue	{quote}{whitespace_with_newline}{quote}
+quotefail		{quote}{whitespace}*"-"
+
 /* Bit string
  * It is tempting to scan the string for only those characters
  * which are allowed. However, this leads to silently swallowed
@@ -148,43 +176,39 @@ whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
  * validate the contents.
  */
 xbstart			[bB]{quote}
-xbstop			{quote}
 xbinside		[^']*
-xbcat			{quote}{whitespace_with_newline}{quote}
 
 /* Hexadecimal number
  */
 xhstart			[xX]{quote}
-xhstop			{quote}
 xhinside		[^']*
-xhcat			{quote}{whitespace_with_newline}{quote}
 
 /* National character
  */
 xnstart			[nN]{quote}
 
 /* Extended quote
  * xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
  */
-quote			'
 xqstart			{quote}
-xqstop			{quote}
 xqdouble		{quote}{quote}
 xqinside		[^\\']+
 xqescape		[\\][^0-7]
 xqoctesc		[\\][0-7]{1,3}
-xqcat			{quote}{whitespace_with_newline}{quote}
 
 /* $foo$ style quotes ("dollar quoting")
  * The quoted string starts with $foo$ where "foo" is an optional string
  * in the form of an identifier, except that it may not contain "$", 
  * and extends to the first occurrence of an identical string.  
  * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
  */
 dolq_start		[A-Za-z\200-\377_]
 dolq_cont		[A-Za-z\200-\377_0-9]
 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
+dolqfailed		\${dolq_start}{dolq_cont}*
 dolqinside		[^$]+
 
 /* Double quote
@@ -242,12 +266,17 @@ operator		{op_chars}+
 
 /* we no longer allow unary minus in numbers. 
  * instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999 
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
  */
 
 integer			{digit}+
 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real			((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real			({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1		({integer}|{decimal})[Ee]
+realfail2		({integer}|{decimal})[Ee][-+]
 
 param			\${integer}
 
@@ -310,6 +339,10 @@ other			.
 					/* ignore */
 				}
 
+<xc>\*+			{
+					/* ignore a run of asterisks inside the comment */
+				}
+
 <xc><<EOF>>		{ yyerror("unterminated /* comment"); }
 
 {xbstart}		{
@@ -324,7 +357,9 @@ other			.
 					startlit();
 					addlitchar('b');
 				}
-<xb>{xbstop}	{
+<xb>{quotestop}	|
+<xb>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return BCONST;
@@ -333,8 +368,8 @@ other			.
 <xb>{xbinside}	{
 					addlit(yytext, yyleng);
 				}
-<xh>{xhcat}		|
-<xb>{xbcat}		{
+<xh>{quotecontinue}	|
+<xb>{quotecontinue}	{
 					/* ignore */
 				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
@@ -351,7 +386,9 @@ other			.
 					startlit();
 					addlitchar('x');
 				}
-<xh>{xhstop}	{
+<xh>{quotestop}	|
+<xh>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return XCONST;
@@ -365,13 +402,11 @@ other			.
 					 */
 					const ScanKeyword *keyword;
 
-					/* This had better be a keyword! */
+					yyless(1);				/* eat only 'n' this time */
+					/* nchar had better be a keyword! */
 					keyword = ScanKeywordLookup("nchar");
 					Assert(keyword != NULL);
 					yylval.keyword = keyword->name;
-					token_start = yytext;
-					BEGIN(xq);
-					startlit();
 					return keyword->value;
 				}
 
@@ -380,7 +415,9 @@ other			.
 					BEGIN(xq);
 					startlit();
 				}
-<xq>{xqstop}	{
+<xq>{quotestop}	|
+<xq>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return SCONST;
@@ -398,7 +435,7 @@ other			.
 					unsigned char c = strtoul(yytext+1, NULL, 8);
 					addlitchar(c);
 				}
-<xq>{xqcat}		{
+<xq>{quotecontinue} {
 					/* ignore */
 				}
 <xq>.			{
@@ -413,6 +450,12 @@ other			.
 					BEGIN(xdolq);
 					startlit();
 				}
+{dolqfailed}	{
+					/* throw back all but the initial "$" */
+					yyless(1);
+					/* and treat it as {other} */
+					return yytext[0];
+				}
 <xdolq>{dolqdelim} {
 					if (strcmp(yytext, dolqstart) == 0)
 					{
@@ -435,6 +478,9 @@ other			.
 <xdolq>{dolqinside} {
 					addlit(yytext, yyleng);
 				}
+<xdolq>{dolqfailed} {
+					addlit(yytext, yyleng);	/* not a delimiter (no closing "$"): add the text to the literal */
+				}
 <xdolq>.		{
 					/* This is only needed for $ inside the quoted text */
 					addlitchar(yytext[0]);
@@ -576,6 +622,23 @@ other			.
 					yylval.str = pstrdup(yytext);
 					return FCONST;
 				}
+{realfail1}		{
+					/*
+					 * throw back the [Ee], and treat as {decimal}.  Note
+					 * that it is possible the input is actually {integer},
+					 * but since this case will almost certainly lead to a
+					 * syntax error anyway, we don't bother to distinguish.
+					 */
+					yyless(yyleng-1);
+					yylval.str = pstrdup(yytext);
+					return FCONST;
+				}
+{realfail2}		{
+					/* throw back the [Ee][+-], and return the rest as FCONST, as for {realfail1} */
+					yyless(yyleng-2);
+					yylval.str = pstrdup(yytext);
+					return FCONST;
+				}
 
 
 {identifier}	{