@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
233233static int casecmp (const chr * , const chr * , size_t );
234234
235235
236- /* info we need during compilation about a known capturing subexpression */
237- struct subinfo
238- {
239- struct state * left ; /* left end of its sub-NFA */
240- struct state * right ; /* right end of its sub-NFA */
241- };
242-
243236/* internal variables, bundled for easy passing around */
244237struct vars
245238{
@@ -252,10 +245,10 @@ struct vars
252245 int nexttype ; /* type of next token */
253246 chr nextvalue ; /* value (if any) of next token */
254247 int lexcon ; /* lexical context type (see regc_lex.c) */
255- int nsubexp ; /* number of known capturing subexpressions */
256- struct subinfo * subs ; /* info about known capturing subexpressions */
257- size_t nsubs ; /* allocated length of subs[] vector */
258- struct subinfo sub10 [10 ]; /* initial vector, enough for most */
248+ int nsubexp ; /* subexpression count */
249+ struct subre * * subs ; /* subRE pointer vector */
250+ size_t nsubs ; /* length of vector */
251+ struct subre * sub10 [10 ]; /* initial vector, enough for most */
259252 struct nfa * nfa ; /* the NFA */
260253 struct colormap * cm ; /* character color map */
261254 color nlcolor ; /* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
375368 v -> subs = v -> sub10 ;
376369 v -> nsubs = 10 ;
377370 for (j = 0 ; j < v -> nsubs ; j ++ )
378- v -> subs [j ]. left = v -> subs [ j ]. right = NULL ;
371+ v -> subs [j ] = NULL ;
379372 v -> nfa = NULL ;
380373 v -> cm = NULL ;
381374 v -> nlcolor = COLORLESS ;
@@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
511504}
512505
513506/*
514- * moresubs - enlarge capturing-subexpressions vector
507+ * moresubs - enlarge subRE vector
515508 */
516509static void
517510moresubs (struct vars * v ,
518511 int wanted ) /* want enough room for this one */
519512{
520- struct subinfo * p ;
513+ struct subre * * p ;
521514 size_t n ;
522515
523516 assert (wanted > 0 && (size_t ) wanted >= v -> nsubs );
524517 n = (size_t ) wanted * 3 / 2 + 1 ;
525518
526519 if (v -> subs == v -> sub10 )
527520 {
528- p = (struct subinfo * ) MALLOC (n * sizeof (struct subinfo ));
521+ p = (struct subre * * ) MALLOC (n * sizeof (struct subre * ));
529522 if (p != NULL )
530523 memcpy (VS (p ), VS (v -> subs ),
531- v -> nsubs * sizeof (struct subinfo ));
524+ v -> nsubs * sizeof (struct subre * ));
532525 }
533526 else
534- p = (struct subinfo * ) REALLOC (v -> subs , n * sizeof (struct subinfo ));
527+ p = (struct subre * * ) REALLOC (v -> subs , n * sizeof (struct subre * ));
535528 if (p == NULL )
536529 {
537530 ERR (REG_ESPACE );
538531 return ;
539532 }
540533 v -> subs = p ;
541534 for (p = & v -> subs [v -> nsubs ]; v -> nsubs < n ; p ++ , v -> nsubs ++ )
542- p -> left = p -> right = NULL ;
535+ * p = NULL ;
543536 assert (v -> nsubs == n );
544537 assert ((size_t ) wanted < v -> nsubs );
545538}
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
988981 s = newstate (v -> nfa );
989982 s2 = newstate (v -> nfa );
990983 NOERRN ();
984+ /* We may not need these arcs, but keep things connected for now */
991985 EMPTYARC (lp , s );
992986 EMPTYARC (s2 , rp );
993987 NOERRN ();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
997991 NOERRN ();
998992 if (cap )
999993 {
1000- /* save the sub-NFA's endpoints for future backrefs to use */
1001- assert (v -> subs [subno ].left == NULL );
1002- v -> subs [subno ].left = s ;
1003- v -> subs [subno ].right = s2 ;
1004994 if (atom -> capno == 0 )
1005995 {
1006996 /* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
10161006 t -> child = atom ;
10171007 atom = t ;
10181008 }
1009+ assert (v -> subs [subno ] == NULL );
1010+ v -> subs [subno ] = atom ;
10191011 }
10201012 /* postpone everything else pending possible {0} */
10211013 break ;
10221014 case BACKREF : /* the Feature From The Black Lagoon */
10231015 INSIST (type != LACON , REG_ESUBREG );
10241016 INSIST (v -> nextvalue < v -> nsubs , REG_ESUBREG );
1025- INSIST (v -> subs [v -> nextvalue ]. left != NULL , REG_ESUBREG );
1017+ INSIST (v -> subs [v -> nextvalue ] != NULL , REG_ESUBREG );
10261018 NOERRN ();
10271019 assert (v -> nextvalue > 0 );
10281020 atom = subre (v , 'b' , BACKR , lp , rp );
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
10971089 if (atom != NULL )
10981090 freesubre (v , atom );
10991091 if (atomtype == '(' )
1100- v -> subs [subno ]. left = v -> subs [ subno ]. right = NULL ;
1092+ v -> subs [subno ] = NULL ;
11011093 delsub (v -> nfa , lp , rp );
11021094 EMPTYARC (lp , rp );
11031095 return top ;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
11301122 NOERRN ();
11311123 }
11321124
1125+ /*
1126+ * For what follows, we need the atom to have its own begin/end states
1127+ * that are distinct from lp/rp, so that we can wrap iteration structure
1128+ * around it. The parenthesized-atom case above already made suitable
1129+ * states (and we don't want to modify a capturing subre, since it's
1130+ * already recorded in v->subs[]). Otherwise, we need more states.
1131+ */
1132+ if (atom -> begin == lp || atom -> end == rp )
1133+ {
1134+ s = newstate (v -> nfa );
1135+ s2 = newstate (v -> nfa );
1136+ NOERRN ();
1137+ moveouts (v -> nfa , lp , s );
1138+ moveins (v -> nfa , rp , s2 );
1139+ atom -> begin = s ;
1140+ atom -> end = s2 ;
1141+ }
1142+ else
1143+ {
1144+ /* The atom's OK, but we must temporarily disconnect it from lp/rp */
1145+ /* (this removes the EMPTY arcs we made above) */
1146+ delsub (v -> nfa , lp , atom -> begin );
1147+ delsub (v -> nfa , atom -> end , rp );
1148+ }
1149+
11331150 /*----------
11341151 * Prepare a general-purpose state skeleton.
11351152 *
11361153 * In the no-backrefs case, we want this:
11371154 *
1138- * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1155+ * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
11391156 *
1140- * where prefix is some repetitions of atom. In the general case we need
1157+ * where prefix is some repetitions of atom, and "rest" is the remainder
1158+ * of the branch. In the general case we need:
11411159 *
11421160 * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
11431161 *
1144- * where the iterator wraps around [begin] --- atom---> [end]
1162+ * where the iterator wraps around the atom.
11451163 *
11461164 * We make the s state here for both cases; s2 is made below if needed
11471165 *----------
11481166 */
1149- s = newstate (v -> nfa ); /* first, new endpoints for the atom */
1150- s2 = newstate (v -> nfa );
1151- NOERRN ();
1152- moveouts (v -> nfa , lp , s );
1153- moveins (v -> nfa , rp , s2 );
1154- NOERRN ();
1155- atom -> begin = s ;
1156- atom -> end = s2 ;
11571167 s = newstate (v -> nfa ); /* set up starting state */
11581168 NOERRN ();
11591169 EMPTYARC (lp , s );
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
11901200 {
11911201 assert (atom -> begin -> nouts == 1 ); /* just the EMPTY */
11921202 delsub (v -> nfa , atom -> begin , atom -> end );
1193- assert (v -> subs [subno ]. left != NULL );
1203+ assert (v -> subs [subno ] != NULL );
11941204
11951205 /*
11961206 * And here's why the recursion got postponed: it must wait until the
11971207 * skeleton is filled in, because it may hit a backref that wants to
11981208 * copy the filled-in skeleton.
11991209 */
1200- dupnfa (v -> nfa , v -> subs [subno ]. left , v -> subs [subno ]. right ,
1210+ dupnfa (v -> nfa , v -> subs [subno ]-> begin , v -> subs [subno ]-> end ,
12011211 atom -> begin , atom -> end );
12021212 NOERRN ();
12031213
0 commit comments