3434
3535/*
3636 * longest - longest-preferred matching engine
37+ *
38+ * On success, returns match endpoint address. Returns NULL on no match.
39+ * Internal errors also return NULL, with v->err set.
3740 */
38- static chr * /* endpoint, or NULL */
39- longest (struct vars * v , /* used only for debug and exec flags */
41+ static chr *
42+ longest (struct vars * v ,
4043 struct dfa * d ,
4144 chr * start , /* where the match should start */
4245 chr * stop , /* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v, /* used only for debug and exec flags */
5154 int i ;
5255 struct colormap * cm = d -> cm ;
5356
57+ /* prevent "uninitialized variable" warnings */
58+ if (hitstopp != NULL )
59+ * hitstopp = 0 ;
60+
5461 /* initialize */
5562 css = initialize (v , d , start );
63+ if (css == NULL )
64+ return NULL ;
5665 cp = start ;
57- if (hitstopp != NULL )
58- * hitstopp = 0 ;
5966
6067 /* startup */
6168 FDEBUG (("+++ startup +++\n" ));
@@ -74,8 +81,14 @@ longest(struct vars * v, /* used only for debug and exec flags */
7481 return NULL ;
7582 css -> lastseen = cp ;
7683
77- /* main loop */
84+ /*
85+ * This is the main text-scanning loop. It seems worth having two copies
86+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+ * builds, when you're not actively tracing.
88+ */
89+ #ifdef REG_DEBUG
7890 if (v -> eflags & REG_FTRACE )
91+ {
7992 while (cp < realstop )
8093 {
8194 FDEBUG (("+++ at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -92,7 +105,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
92105 ss -> lastseen = cp ;
93106 css = ss ;
94107 }
108+ }
95109 else
110+ #endif
111+ {
96112 while (cp < realstop )
97113 {
98114 co = GETCOLOR (cm , * cp );
@@ -107,6 +123,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
107123 ss -> lastseen = cp ;
108124 css = ss ;
109125 }
126+ }
127+
128+ if (ISERR ())
129+ return NULL ;
110130
111131 /* shutdown */
112132 FDEBUG (("+++ shutdown at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -117,6 +137,8 @@ longest(struct vars * v, /* used only for debug and exec flags */
117137 co = d -> cnfa -> eos [(v -> eflags & REG_NOTEOL ) ? 0 : 1 ];
118138 FDEBUG (("color %ld\n" , (long ) co ));
119139 ss = miss (v , d , css , co , cp , start );
140+ if (ISERR ())
141+ return NULL ;
120142 /* special case: match ended at eol? */
121143 if (ss != NULL && (ss -> flags & POSTSTATE ))
122144 return cp ;
@@ -138,14 +160,17 @@ longest(struct vars * v, /* used only for debug and exec flags */
138160
139161/*
140162 * shortest - shortest-preferred matching engine
163+ *
164+ * On success, returns match endpoint address. Returns NULL on no match.
165+ * Internal errors also return NULL, with v->err set.
141166 */
142- static chr * /* endpoint, or NULL */
167+ static chr *
143168shortest (struct vars * v ,
144169 struct dfa * d ,
145170 chr * start , /* where the match should start */
146171 chr * min , /* match must end at or after here */
147172 chr * max , /* match must end at or before here */
148- chr * * coldp , /* store coldstart pointer here, if nonNULL */
173+ chr * * coldp , /* store coldstart pointer here, if non-NULL */
149174 int * hitstopp ) /* record whether hit v->stop, if non-NULL */
150175{
151176 chr * cp ;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181 struct sset * ss ;
157182 struct colormap * cm = d -> cm ;
158183
184+ /* prevent "uninitialized variable" warnings */
185+ if (coldp != NULL )
186+ * coldp = NULL ;
187+ if (hitstopp != NULL )
188+ * hitstopp = 0 ;
189+
159190 /* initialize */
160191 css = initialize (v , d , start );
192+ if (css == NULL )
193+ return NULL ;
161194 cp = start ;
162- if (hitstopp != NULL )
163- * hitstopp = 0 ;
164195
165196 /* startup */
166197 FDEBUG (("--- startup ---\n" ));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211 css -> lastseen = cp ;
181212 ss = css ;
182213
183- /* main loop */
214+ /*
215+ * This is the main text-scanning loop. It seems worth having two copies
216+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+ * builds, when you're not actively tracing.
218+ */
219+ #ifdef REG_DEBUG
184220 if (v -> eflags & REG_FTRACE )
221+ {
185222 while (cp < realmax )
186223 {
187224 FDEBUG (("--- at c%d ---\n" , (int ) (css - d -> ssets )));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237 if ((ss -> flags & POSTSTATE ) && cp >= realmin )
201238 break ; /* NOTE BREAK OUT */
202239 }
240+ }
203241 else
242+ #endif
243+ {
204244 while (cp < realmax )
205245 {
206246 co = GETCOLOR (cm , * cp );
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257 if ((ss -> flags & POSTSTATE ) && cp >= realmin )
218258 break ; /* NOTE BREAK OUT */
219259 }
260+ }
220261
221262 if (ss == NULL )
222263 return NULL ;
@@ -389,7 +430,7 @@ hash(unsigned *uv,
389430 * initialize - hand-craft a cache entry for startup, otherwise get ready
390431 */
391432static struct sset *
392- initialize (struct vars * v , /* used only for debug flags */
433+ initialize (struct vars * v ,
393434 struct dfa * d ,
394435 chr * start )
395436{
@@ -402,6 +443,8 @@ initialize(struct vars * v, /* used only for debug flags */
402443 else
403444 { /* no, must (re)build it */
404445 ss = getvacant (v , d , start , start );
446+ if (ss == NULL )
447+ return NULL ;
405448 for (i = 0 ; i < d -> wordsper ; i ++ )
406449 ss -> states [i ] = 0 ;
407450 BSET (ss -> states , d -> cnfa -> pre );
@@ -420,10 +463,20 @@ initialize(struct vars * v, /* used only for debug flags */
420463}
421464
422465/*
423- * miss - handle a cache miss
466+ * miss - handle a stateset cache miss
467+ *
468+ * css is the current stateset, co is the color of the current input character,
469+ * cp points to the character after that (which is where we may need to test
470+ * LACONs). start does not affect matching behavior but is needed for pickss'
471+ * heuristics about which stateset cache entry to replace.
472+ *
473+ * Ordinarily, returns the address of the next stateset (the one that is
474+ * valid after consuming the input character). Returns NULL if no valid
475+ * NFA states remain, ie we have a certain match failure.
476+ * Internal errors also return NULL, with v->err set.
424477 */
425- static struct sset * /* NULL if goes to empty set */
426- miss (struct vars * v , /* used only for debug flags */
478+ static struct sset *
479+ miss (struct vars * v ,
427480 struct dfa * d ,
428481 struct sset * css ,
429482 pcolor co ,
@@ -449,9 +502,23 @@ miss(struct vars * v, /* used only for debug flags */
449502 }
450503 FDEBUG (("miss\n" ));
451504
452- /* first, what set of states would we end up in? */
505+ /*
506+ * Checking for operation cancel in the inner text search loop seems
507+ * unduly expensive. As a compromise, check during cache misses.
508+ */
509+ if (CANCEL_REQUESTED (v -> re ))
510+ {
511+ ERR (REG_CANCEL );
512+ return NULL ;
513+ }
514+
515+ /*
516+ * What set of states would we end up in after consuming the co character?
517+ * We first consider PLAIN arcs that consume the character, and then look
518+ * to see what LACON arcs could be traversed after consuming it.
519+ */
453520 for (i = 0 ; i < d -> wordsper ; i ++ )
454- d -> work [i ] = 0 ;
521+ d -> work [i ] = 0 ; /* build new stateset bitmap in d->work */
455522 ispost = 0 ;
456523 noprogress = 1 ;
457524 gotstate = 0 ;
@@ -468,22 +535,31 @@ miss(struct vars * v, /* used only for debug flags */
468535 noprogress = 0 ;
469536 FDEBUG (("%d -> %d\n" , i , ca -> to ));
470537 }
471- dolacons = (gotstate ) ? (cnfa -> flags & HASLACONS ) : 0 ;
538+ if (!gotstate )
539+ return NULL ; /* character cannot reach any new state */
540+ dolacons = (cnfa -> flags & HASLACONS );
472541 sawlacons = 0 ;
542+ /* outer loop handles transitive closure of reachable-by-LACON states */
473543 while (dolacons )
474- { /* transitive closure */
544+ {
475545 dolacons = 0 ;
476546 for (i = 0 ; i < d -> nstates ; i ++ )
477547 if (ISBSET (d -> work , i ))
478548 for (ca = cnfa -> states [i ]; ca -> co != COLORLESS ; ca ++ )
479549 {
480550 if (ca -> co < cnfa -> ncolors )
481- continue ; /* NOTE CONTINUE */
482- sawlacons = 1 ;
551+ continue ; /* not a LACON arc */
483552 if (ISBSET (d -> work , ca -> to ))
484- continue ; /* NOTE CONTINUE */
553+ continue ; /* arc would be a no-op anyway */
554+ sawlacons = 1 ; /* this LACON affects our result */
485555 if (!lacon (v , cnfa , cp , ca -> co ))
486- continue ; /* NOTE CONTINUE */
556+ {
557+ if (ISERR ())
558+ return NULL ;
559+ continue ; /* LACON arc cannot be traversed */
560+ }
561+ if (ISERR ())
562+ return NULL ;
487563 BSET (d -> work , ca -> to );
488564 dolacons = 1 ;
489565 if (ca -> to == cnfa -> post )
@@ -493,11 +569,9 @@ miss(struct vars * v, /* used only for debug flags */
493569 FDEBUG (("%d :> %d\n" , i , ca -> to ));
494570 }
495571 }
496- if (!gotstate )
497- return NULL ;
498572 h = HASH (d -> work , d -> wordsper );
499573
500- /* next, is that in the cache? */
574+ /* Is this stateset already in the cache? */
501575 for (p = d -> ssets , i = d -> nssused ; i > 0 ; p ++ , i -- )
502576 if (HIT (h , d -> work , p , d -> wordsper ))
503577 {
@@ -507,6 +581,8 @@ miss(struct vars * v, /* used only for debug flags */
507581 if (i == 0 )
508582 { /* nope, need a new cache entry */
509583 p = getvacant (v , d , cp , start );
584+ if (p == NULL )
585+ return NULL ;
510586 assert (p != css );
511587 for (i = 0 ; i < d -> wordsper ; i ++ )
512588 p -> states [i ] = d -> work [i ];
@@ -517,8 +593,15 @@ miss(struct vars * v, /* used only for debug flags */
517593 /* lastseen to be dealt with by caller */
518594 }
519595
596+ /*
597+ * Link new stateset to old, unless a LACON affected the result, in which
598+ * case we don't create the link. That forces future transitions across
599+ * this same arc (same prior stateset and character color) to come through
600+ * miss() again, so that we can recheck the LACON(s), which might or might
601+ * not pass since context will be different.
602+ */
520603 if (!sawlacons )
521- { /* lookahead conds. always cache miss */
604+ {
522605 FDEBUG (("c%d[%d]->c%d\n" ,
523606 (int ) (css - d -> ssets ), co , (int ) (p - d -> ssets )));
524607 css -> outs [co ] = p ;
@@ -562,11 +645,12 @@ lacon(struct vars * v,
562645
563646/*
564647 * getvacant - get a vacant state set
648+ *
565649 * This routine clears out the inarcs and outarcs, but does not otherwise
566650 * clear the innards of the state set -- that's up to the caller.
567651 */
568652static struct sset *
569- getvacant (struct vars * v , /* used only for debug flags */
653+ getvacant (struct vars * v ,
570654 struct dfa * d ,
571655 chr * cp ,
572656 chr * start )
@@ -578,6 +662,8 @@ getvacant(struct vars * v, /* used only for debug flags */
578662 color co ;
579663
580664 ss = pickss (v , d , cp , start );
665+ if (ss == NULL )
666+ return NULL ;
581667 assert (!(ss -> flags & LOCKED ));
582668
583669 /* clear out its inarcs, including self-referential ones */
@@ -635,7 +721,7 @@ getvacant(struct vars * v, /* used only for debug flags */
635721 * pickss - pick the next stateset to be used
636722 */
637723static struct sset *
638- pickss (struct vars * v , /* used only for debug flags */
724+ pickss (struct vars * v ,
639725 struct dfa * d ,
640726 chr * cp ,
641727 chr * start )
@@ -691,7 +777,6 @@ pickss(struct vars * v, /* used only for debug flags */
691777
692778 /* nobody's old enough?!? -- something's really wrong */
693779 FDEBUG (("cannot find victim to replace!\n" ));
694- assert (NOTREACHED );
695780 ERR (REG_ASSERT );
696- return d -> ssets ;
781+ return NULL ;
697782}
0 commit comments