@@ -65,6 +65,8 @@ newnfa(struct vars *v,
6565 nfa -> v = v ;
6666 nfa -> bos [0 ] = nfa -> bos [1 ] = COLORLESS ;
6767 nfa -> eos [0 ] = nfa -> eos [1 ] = COLORLESS ;
68+ nfa -> flags = 0 ;
69+ nfa -> minmatchall = nfa -> maxmatchall = -1 ;
6870 nfa -> parent = parent ; /* Precedes newfstate so parent is valid. */
6971 nfa -> post = newfstate (nfa , '@' ); /* number 0 */
7072 nfa -> pre = newfstate (nfa , '>' ); /* number 1 */
@@ -2875,15 +2877,297 @@ analyze(struct nfa *nfa)
28752877 if (NISERR ())
28762878 return 0 ;
28772879
2880+ /* Detect whether NFA can't match anything */
28782881 if (nfa -> pre -> outs == NULL )
28792882 return REG_UIMPOSSIBLE ;
2883+
2884+ /* Detect whether NFA matches all strings (possibly with length bounds) */
2885+ checkmatchall (nfa );
2886+
2887+ /* Detect whether NFA can possibly match a zero-length string */
28802888 for (a = nfa -> pre -> outs ; a != NULL ; a = a -> outchain )
28812889 for (aa = a -> to -> outs ; aa != NULL ; aa = aa -> outchain )
28822890 if (aa -> to == nfa -> post )
28832891 return REG_UEMPTYMATCH ;
28842892 return 0 ;
28852893}
28862894
2895+ /*
2896+ * checkmatchall - does the NFA represent no more than a string length test?
2897+ *
2898+ * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
2899+ * to begin with) and set the MATCHALL bit in nfa->flags.
2900+ *
2901+ * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
2902+ * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
2903+ * post state via RAINBOW arcs, and if there are any loops in the graph, they
2904+ * must be loop-to-self arcs, ensuring that each loop iteration consumes
2905+ * exactly one character. (Longer loops are problematic because they create
2906+ * non-consecutive possible match lengths; we have no good way to represent
2907+ * that situation for lengths beyond the DUPINF limit.)
2908+ *
2909+ * Pseudocolor arcs complicate things a little. We know that they can only
2910+ * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
2911+ * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
2912+ * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
2913+ * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
2914+ * arcs can occur, meaning that the NFA involves some constraint on the
2915+ * adjacent characters, which makes it not a matchall NFA.
2916+ */
2917+ static void
2918+ checkmatchall (struct nfa * nfa )
2919+ {
2920+ bool hasmatch [DUPINF + 1 ];
2921+ int minmatch ,
2922+ maxmatch ,
2923+ morematch ;
2924+
2925+ /*
2926+ * hasmatch[i] will be set true if a match of length i is feasible, for i
2927+ * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match
2928+ * length of DUPINF or more is feasible.
2929+ */
2930+ memset (hasmatch , 0 , sizeof (hasmatch ));
2931+
2932+ /*
2933+ * Recursively search the graph for all-RAINBOW paths to the "post" state,
2934+ * starting at the "pre" state. The -1 initial depth accounts for the
2935+ * fact that transitions out of the "pre" state are not part of the
2936+ * matched string. We likewise don't count the final transition to the
2937+ * "post" state as part of the match length. (But we still insist that
2938+ * those transitions have RAINBOW arcs, otherwise there are lookbehind or
2939+ * lookahead constraints at the start/end of the pattern.)
2940+ */
2941+ if (!checkmatchall_recurse (nfa , nfa -> pre , false, -1 , hasmatch ))
2942+ return ;
2943+
2944+ /*
2945+ * We found some all-RAINBOW paths, and not anything that we couldn't
2946+ * handle. Now verify that pseudocolor arcs adjacent to the pre and post
2947+ * states match the RAINBOW arcs there. (We could do this while
2948+ * recursing, but it's expensive and unlikely to fail, so do it last.)
2949+ */
2950+ if (!check_out_colors_match (nfa -> pre , RAINBOW , nfa -> bos [0 ]) ||
2951+ !check_out_colors_match (nfa -> pre , nfa -> bos [0 ], RAINBOW ) ||
2952+ !check_out_colors_match (nfa -> pre , RAINBOW , nfa -> bos [1 ]) ||
2953+ !check_out_colors_match (nfa -> pre , nfa -> bos [1 ], RAINBOW ))
2954+ return ;
2955+ if (!check_in_colors_match (nfa -> post , RAINBOW , nfa -> eos [0 ]) ||
2956+ !check_in_colors_match (nfa -> post , nfa -> eos [0 ], RAINBOW ) ||
2957+ !check_in_colors_match (nfa -> post , RAINBOW , nfa -> eos [1 ]) ||
2958+ !check_in_colors_match (nfa -> post , nfa -> eos [1 ], RAINBOW ))
2959+ return ;
2960+
2961+ /*
2962+ * hasmatch[] now represents the set of possible match lengths; but we
2963+ * want to reduce that to a min and max value, because it doesn't seem
2964+ * worth complicating regexec.c to deal with nonconsecutive possible match
2965+ * lengths. Find min and max of first run of lengths, then verify there
2966+ * are no nonconsecutive lengths.
2967+ */
2968+ for (minmatch = 0 ; minmatch <= DUPINF ; minmatch ++ )
2969+ {
2970+ if (hasmatch [minmatch ])
2971+ break ;
2972+ }
2973+ assert (minmatch <= DUPINF ); /* else checkmatchall_recurse lied */
2974+ for (maxmatch = minmatch ; maxmatch < DUPINF ; maxmatch ++ )
2975+ {
2976+ if (!hasmatch [maxmatch + 1 ])
2977+ break ;
2978+ }
2979+ for (morematch = maxmatch + 1 ; morematch <= DUPINF ; morematch ++ )
2980+ {
2981+ if (hasmatch [morematch ])
2982+ return ; /* fail, there are nonconsecutive lengths */
2983+ }
2984+
2985+ /* Success, so record the info */
2986+ nfa -> minmatchall = minmatch ;
2987+ nfa -> maxmatchall = maxmatch ;
2988+ nfa -> flags |= MATCHALL ;
2989+ }
2990+
2991+ /*
2992+ * checkmatchall_recurse - recursive search for checkmatchall
2993+ *
2994+ * s is the current state
2995+ * foundloop is true if any predecessor state has a loop-to-self
2996+ * depth is the current recursion depth (starting at -1)
2997+ * hasmatch[] is the output area for recording feasible match lengths
2998+ *
2999+ * We return true if there is at least one all-RAINBOW path to the "post"
3000+ * state and no non-matchall paths; otherwise false. Note we assume that
3001+ * any dead-end paths have already been removed, else we might return
3002+ * false unnecessarily.
3003+ */
3004+ static bool
3005+ checkmatchall_recurse (struct nfa * nfa , struct state * s ,
3006+ bool foundloop , int depth ,
3007+ bool * hasmatch )
3008+ {
3009+ bool result = false;
3010+ struct arc * a ;
3011+
3012+ /*
3013+ * Since this is recursive, it could be driven to stack overflow. But we
3014+ * need not treat that as a hard failure; just deem the NFA non-matchall.
3015+ */
3016+ if (STACK_TOO_DEEP (nfa -> v -> re ))
3017+ return false;
3018+
3019+ /*
3020+ * Likewise, if we get to a depth too large to represent correctly in
3021+ * maxmatchall, fail quietly.
3022+ */
3023+ if (depth >= DUPINF )
3024+ return false;
3025+
3026+ /*
3027+ * Scan the outarcs to detect cases we can't handle, and to see if there
3028+ * is a loop-to-self here. We need to know about any such loop before we
3029+ * recurse, so it's hard to avoid making two passes over the outarcs. In
3030+ * any case, checking for showstoppers before we recurse is probably best.
3031+ */
3032+ for (a = s -> outs ; a != NULL ; a = a -> outchain )
3033+ {
3034+ if (a -> type != PLAIN )
3035+ return false; /* any LACONs make it non-matchall */
3036+ if (a -> co != RAINBOW )
3037+ {
3038+ if (nfa -> cm -> cd [a -> co ].flags & PSEUDO )
3039+ {
3040+ /*
3041+ * Pseudocolor arc: verify it's in a valid place (this seems
3042+ * quite unlikely to fail, but let's be sure).
3043+ */
3044+ if (s == nfa -> pre &&
3045+ (a -> co == nfa -> bos [0 ] || a -> co == nfa -> bos [1 ]))
3046+ /* okay BOS/BOL arc */ ;
3047+ else if (a -> to == nfa -> post &&
3048+ (a -> co == nfa -> eos [0 ] || a -> co == nfa -> eos [1 ]))
3049+ /* okay EOS/EOL arc */ ;
3050+ else
3051+ return false; /* unexpected pseudocolor arc */
3052+ /* We'll finish checking these arcs after the recursion */
3053+ continue ;
3054+ }
3055+ return false; /* any other color makes it non-matchall */
3056+ }
3057+ if (a -> to == s )
3058+ {
3059+ /*
3060+ * We found a cycle of length 1, so remember that to pass down to
3061+ * successor states. (It doesn't matter if there was also such a
3062+ * loop at a predecessor state.)
3063+ */
3064+ foundloop = true;
3065+ }
3066+ else if (a -> to -> tmp )
3067+ {
3068+ /* We found a cycle of length > 1, so fail. */
3069+ return false;
3070+ }
3071+ }
3072+
3073+ /* We need to recurse, so mark state as under consideration */
3074+ assert (s -> tmp == NULL );
3075+ s -> tmp = s ;
3076+
3077+ for (a = s -> outs ; a != NULL ; a = a -> outchain )
3078+ {
3079+ if (a -> co != RAINBOW )
3080+ continue ; /* ignore pseudocolor arcs */
3081+ if (a -> to == nfa -> post )
3082+ {
3083+ /* We found an all-RAINBOW path to the post state */
3084+ result = true;
3085+ /* Record potential match lengths */
3086+ assert (depth >= 0 );
3087+ hasmatch [depth ] = true;
3088+ if (foundloop )
3089+ {
3090+ /* A predecessor loop makes all larger lengths match, too */
3091+ int i ;
3092+
3093+ for (i = depth + 1 ; i <= DUPINF ; i ++ )
3094+ hasmatch [i ] = true;
3095+ }
3096+ }
3097+ else if (a -> to != s )
3098+ {
3099+ /* This is a new path forward; recurse to investigate */
3100+ result = checkmatchall_recurse (nfa , a -> to ,
3101+ foundloop , depth + 1 ,
3102+ hasmatch );
3103+ /* Fail if any recursive path fails */
3104+ if (!result )
3105+ break ;
3106+ }
3107+ }
3108+
3109+ s -> tmp = NULL ;
3110+ return result ;
3111+ }
3112+
3113+ /*
3114+ * check_out_colors_match - subroutine for checkmatchall
3115+ *
3116+ * Check if every s outarc of color co1 has a matching outarc of color co2.
3117+ * (checkmatchall_recurse already verified that all of the outarcs are PLAIN,
3118+ * so we need not examine arc types here. Also, since it verified that there
3119+ * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for
3120+ * this brute-force O(N^2) implementation to cause problems.)
3121+ */
3122+ static bool
3123+ check_out_colors_match (struct state * s , color co1 , color co2 )
3124+ {
3125+ struct arc * a1 ;
3126+ struct arc * a2 ;
3127+
3128+ for (a1 = s -> outs ; a1 != NULL ; a1 = a1 -> outchain )
3129+ {
3130+ if (a1 -> co != co1 )
3131+ continue ;
3132+ for (a2 = s -> outs ; a2 != NULL ; a2 = a2 -> outchain )
3133+ {
3134+ if (a2 -> co == co2 && a2 -> to == a1 -> to )
3135+ break ;
3136+ }
3137+ if (a2 == NULL )
3138+ return false;
3139+ }
3140+ return true;
3141+ }
3142+
3143+ /*
3144+ * check_in_colors_match - subroutine for checkmatchall
3145+ *
3146+ * Check if every s inarc of color co1 has a matching inarc of color co2.
3147+ * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still
3148+ * not expecting very many arcs.)
3149+ */
3150+ static bool
3151+ check_in_colors_match (struct state * s , color co1 , color co2 )
3152+ {
3153+ struct arc * a1 ;
3154+ struct arc * a2 ;
3155+
3156+ for (a1 = s -> ins ; a1 != NULL ; a1 = a1 -> inchain )
3157+ {
3158+ if (a1 -> type != PLAIN || a1 -> co != co1 )
3159+ continue ;
3160+ for (a2 = s -> ins ; a2 != NULL ; a2 = a2 -> inchain )
3161+ {
3162+ if (a2 -> type == PLAIN && a2 -> co == co2 && a2 -> from == a1 -> from )
3163+ break ;
3164+ }
3165+ if (a2 == NULL )
3166+ return false;
3167+ }
3168+ return true;
3169+ }
3170+
28873171/*
28883172 * compact - construct the compact representation of an NFA
28893173 */
@@ -2930,7 +3214,9 @@ compact(struct nfa *nfa,
29303214 cnfa -> eos [0 ] = nfa -> eos [0 ];
29313215 cnfa -> eos [1 ] = nfa -> eos [1 ];
29323216 cnfa -> ncolors = maxcolor (nfa -> cm ) + 1 ;
2933- cnfa -> flags = 0 ;
3217+ cnfa -> flags = nfa -> flags ;
3218+ cnfa -> minmatchall = nfa -> minmatchall ;
3219+ cnfa -> maxmatchall = nfa -> maxmatchall ;
29343220
29353221 ca = cnfa -> arcs ;
29363222 for (s = nfa -> states ; s != NULL ; s = s -> next )
@@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa,
30343320 fprintf (f , ", eos [%ld]" , (long ) nfa -> eos [0 ]);
30353321 if (nfa -> eos [1 ] != COLORLESS )
30363322 fprintf (f , ", eol [%ld]" , (long ) nfa -> eos [1 ]);
3323+ if (nfa -> flags & HASLACONS )
3324+ fprintf (f , ", haslacons" );
3325+ if (nfa -> flags & MATCHALL )
3326+ fprintf (f , ", minmatchall %d, maxmatchall %d" ,
3327+ nfa -> minmatchall , nfa -> maxmatchall );
30373328 fprintf (f , "\n" );
30383329 for (s = nfa -> states ; s != NULL ; s = s -> next )
30393330 {
@@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa,
32013492 fprintf (f , ", eol [%ld]" , (long ) cnfa -> eos [1 ]);
32023493 if (cnfa -> flags & HASLACONS )
32033494 fprintf (f , ", haslacons" );
3495+ if (cnfa -> flags & MATCHALL )
3496+ fprintf (f , ", minmatchall %d, maxmatchall %d" ,
3497+ cnfa -> minmatchall , cnfa -> maxmatchall );
32043498 fprintf (f , "\n" );
32053499 for (st = 0 ; st < cnfa -> nstates ; st ++ )
32063500 dumpcstate (st , cnfa , f );
0 commit comments