@@ -593,40 +593,6 @@ hasnonemptyout(struct state * s)
593593 return 0 ;
594594}
595595
596- /*
597- * nonemptyouts - count non-EMPTY out arcs of a state
598- */
599- static int
600- nonemptyouts (struct state * s )
601- {
602- int n = 0 ;
603- struct arc * a ;
604-
605- for (a = s -> outs ; a != NULL ; a = a -> outchain )
606- {
607- if (a -> type != EMPTY )
608- n ++ ;
609- }
610- return n ;
611- }
612-
613- /*
614- * nonemptyins - count non-EMPTY in arcs of a state
615- */
616- static int
617- nonemptyins (struct state * s )
618- {
619- int n = 0 ;
620- struct arc * a ;
621-
622- for (a = s -> ins ; a != NULL ; a = a -> inchain )
623- {
624- if (a -> type != EMPTY )
625- n ++ ;
626- }
627- return n ;
628- }
629-
630596/*
631597 * findarc - find arc, if any, from given source with given type and color
632598 * If there is more than one such arc, the result is random.
@@ -1856,6 +1822,12 @@ fixempties(struct nfa * nfa,
18561822 struct state * nexts ;
18571823 struct arc * a ;
18581824 struct arc * nexta ;
1825+ int totalinarcs ;
1826+ struct arc * * inarcsorig ;
1827+ struct arc * * arcarray ;
1828+ int arccount ;
1829+ int prevnins ;
1830+ int nskip ;
18591831
18601832 /*
18611833 * First, get rid of any states whose sole out-arc is an EMPTY, since
@@ -1896,41 +1868,131 @@ fixempties(struct nfa * nfa,
18961868 dropstate (nfa , s );
18971869 }
18981870
1871+ if (NISERR ())
1872+ return ;
1873+
18991874 /*
1900- * For each remaining NFA state, find all other states that are reachable
1901- * from it by a chain of one or more EMPTY arcs. Then generate new arcs
1875+ * For each remaining NFA state, find all other states from which it is
1876+ * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
19021877 * that eliminate the need for each such chain.
19031878 *
1904- * If we just do this straightforwardly, the algorithm gets slow in
1905- * complex graphs, because the same arcs get copied to all intermediate
1906- * states of an EMPTY chain, and then uselessly pushed repeatedly to the
1907- * chain's final state; we waste a lot of time in newarc's duplicate
1908- * checking. To improve matters, we decree that any state with only EMPTY
1909- * out-arcs is "doomed" and will not be part of the final NFA. That can be
1910- * ensured by not adding any new out-arcs to such a state. Having ensured
1911- * that, we need not update the state's in-arcs list either; all arcs that
1912- * might have gotten pushed forward to it will just get pushed directly to
1913- * successor states. This eliminates most of the useless duplicate arcs.
1879+ * We could replace a chain of EMPTY arcs that leads from a "from" state
1880+ * to a "to" state either by pushing non-EMPTY arcs forward (linking
1881+ * directly from "from"'s predecessors to "to") or by pulling them back
1882+ * (linking directly from "from" to "to"'s successors). We choose to
1883+ * always do the former; this choice is somewhat arbitrary, but the
1884+ * approach below requires that we uniformly do one or the other.
1885+ *
1886+ * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1887+ * approach the size of the NFA). All of the intermediate states must
1888+ * have additional inarcs and outarcs, else they'd have been removed by
1889+ * the steps above. Assuming their inarcs are mostly not empties, we will
1890+ * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1891+ * state in the chain must be duplicated to lead to all its successor
1892+ * states as well. So there is no hope of doing less than O(N^2) work;
1893+ * however, we should endeavor to keep the big-O cost from being even
1894+ * worse than that, which it can easily become without care. In
1895+ * particular, suppose we were to copy all S1's inarcs forward to S2, and
1896+ * then also to S3, and then later we consider pushing S2's inarcs forward
1897+ * to S3. If we include the arcs already copied from S1 in that, we'd be
1898+ * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
1899+ * and its cohorts would get rid of the extra arcs, but not without cost.)
1900+ *
1901+ * We can avoid this cost by treating only arcs that existed at the start
1902+ * of this phase as candidates to be pushed forward. To identify those,
1903+ * we remember the first inarc each state had to start with. We rely on
1904+ * the fact that newarc() and friends put new arcs on the front of their
1905+ * to-states' inchains, and that this phase never deletes arcs, so that
1906+ * the original arcs must be the last arcs in their to-states' inchains.
1907+ *
1908+ * So the process here is that, for each state in the NFA, we gather up
1909+ * all non-EMPTY inarcs of states that can reach the target state via
1910+ * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
1911+ * target state's inchain. (We can safely use sort-merge for this as long
1912+ * as we update each state's original-arcs pointer after we add arcs to
1913+ * it; the sort step of mergeins probably changed the order of the old
1914+ * arcs.)
1915+ *
1916+ * Another refinement worth making is that, because we only add non-EMPTY
1917+ * arcs during this phase, and all added arcs have the same from-state as
1918+ * the non-EMPTY arc they were cloned from, we know ahead of time that any
1919+ * states having only EMPTY outarcs will be useless for lack of outarcs
1920+ * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
1921+ * they had none to start with.) So we need not bother to update the
1922+ * inchains of such states at all.
1923+ */
1924+
1925+ /* Remember the states' first original inarcs */
1926+ /* ... and while at it, count how many old inarcs there are altogether */
1927+ inarcsorig = (struct arc * * ) MALLOC (nfa -> nstates * sizeof (struct arc * ));
1928+ if (inarcsorig == NULL )
1929+ {
1930+ NERR (REG_ESPACE );
1931+ return ;
1932+ }
1933+ totalinarcs = 0 ;
1934+ for (s = nfa -> states ; s != NULL ; s = s -> next )
1935+ {
1936+ inarcsorig [s -> no ] = s -> ins ;
1937+ totalinarcs += s -> nins ;
1938+ }
1939+
1940+ /*
1941+ * Create a workspace for accumulating the inarcs to be added to the
1942+ * current target state. totalinarcs is probably a considerable
1943+ * overestimate of the space needed, but the NFA is unlikely to be large
1944+ * enough at this point to make it worth being smarter.
19141945 */
1946+ arcarray = (struct arc * * ) MALLOC (totalinarcs * sizeof (struct arc * ));
1947+ if (arcarray == NULL )
1948+ {
1949+ NERR (REG_ESPACE );
1950+ FREE (inarcsorig );
1951+ return ;
1952+ }
1953+
1954+ /* And iterate over the target states */
19151955 for (s = nfa -> states ; s != NULL && !NISERR (); s = s -> next )
19161956 {
1917- for (s2 = emptyreachable (nfa , s , s ); s2 != s && !NISERR (); s2 = nexts )
1957+ /* Ignore target states without non-EMPTY outarcs, per note above */
1958+ if (!s -> flag && !hasnonemptyout (s ))
1959+ continue ;
1960+
1961+ /* Find predecessor states and accumulate their original inarcs */
1962+ arccount = 0 ;
1963+ for (s2 = emptyreachable (nfa , s , s , inarcsorig ); s2 != s ; s2 = nexts )
19181964 {
1919- /*
1920- * If s2 is doomed, we decide that (1) we will always push arcs
1921- * forward to it, not pull them back to s; and (2) we can optimize
1922- * away the push-forward, per comment above. So do nothing.
1923- */
1924- if (s2 -> flag || hasnonemptyout (s2 ))
1925- replaceempty (nfa , s , s2 );
1965+ /* Add s2's original inarcs to arcarray[], but ignore empties */
1966+ for (a = inarcsorig [s2 -> no ]; a != NULL ; a = a -> inchain )
1967+ {
1968+ if (a -> type != EMPTY )
1969+ arcarray [arccount ++ ] = a ;
1970+ }
19261971
19271972 /* Reset the tmp fields as we walk back */
19281973 nexts = s2 -> tmp ;
19291974 s2 -> tmp = NULL ;
19301975 }
19311976 s -> tmp = NULL ;
1977+ assert (arccount <= totalinarcs );
1978+
1979+ /* Remember how many original inarcs this state has */
1980+ prevnins = s -> nins ;
1981+
1982+ /* Add non-duplicate inarcs to target state */
1983+ mergeins (nfa , s , arcarray , arccount );
1984+
1985+ /* Now we must update the state's inarcsorig pointer */
1986+ nskip = s -> nins - prevnins ;
1987+ a = s -> ins ;
1988+ while (nskip -- > 0 )
1989+ a = a -> inchain ;
1990+ inarcsorig [s -> no ] = a ;
19321991 }
19331992
1993+ FREE (arcarray );
1994+ FREE (inarcsorig );
1995+
19341996 if (NISERR ())
19351997 return ;
19361998
@@ -1964,20 +2026,25 @@ fixempties(struct nfa * nfa,
19642026}
19652027
19662028/*
1967- * emptyreachable - recursively find all states reachable from s by EMPTY arcs
2029+ * emptyreachable - recursively find all states that can reach s by EMPTY arcs
19682030 *
19692031 * The return value is the last such state found. Its tmp field links back
19702032 * to the next-to-last such state, and so on back to s, so that all these
19712033 * states can be located without searching the whole NFA.
19722034 *
2035+ * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2036+ * maintained by that function. This lets us skip over all new inarcs, which
2037+ * are certainly not EMPTY arcs.
2038+ *
19732039 * The maximum recursion depth here is equal to the length of the longest
19742040 * loop-free chain of EMPTY arcs, which is surely no more than the size of
19752041 * the NFA ... but that could still be enough to cause trouble.
19762042 */
19772043static struct state *
19782044emptyreachable (struct nfa * nfa ,
19792045 struct state * s ,
1980- struct state * lastfound )
2046+ struct state * lastfound ,
2047+ struct arc * * inarcsorig )
19812048{
19822049 struct arc * a ;
19832050
@@ -1990,78 +2057,14 @@ emptyreachable(struct nfa * nfa,
19902057
19912058 s -> tmp = lastfound ;
19922059 lastfound = s ;
1993- for (a = s -> outs ; a != NULL ; a = a -> outchain )
2060+ for (a = inarcsorig [ s -> no ] ; a != NULL ; a = a -> inchain )
19942061 {
1995- if (a -> type == EMPTY && a -> to -> tmp == NULL )
1996- lastfound = emptyreachable (nfa , a -> to , lastfound );
2062+ if (a -> type == EMPTY && a -> from -> tmp == NULL )
2063+ lastfound = emptyreachable (nfa , a -> from , lastfound , inarcsorig );
19972064 }
19982065 return lastfound ;
19992066}
20002067
2001- /*
2002- * replaceempty - replace an EMPTY arc chain with some non-empty arcs
2003- *
2004- * The EMPTY arc(s) should be deleted later, but we can't do it here because
2005- * they may still be needed to identify other arc chains during fixempties().
2006- */
2007- static void
2008- replaceempty (struct nfa * nfa ,
2009- struct state * from ,
2010- struct state * to )
2011- {
2012- int fromouts ;
2013- int toins ;
2014-
2015- assert (from != to );
2016-
2017- /*
2018- * Create replacement arcs that bypass the need for the EMPTY chain. We
2019- * can do this either by pushing arcs forward (linking directly from
2020- * "from"'s predecessors to "to") or by pulling them back (linking
2021- * directly from "from" to "to"'s successors). In general, we choose
2022- * whichever way creates greater fan-out or fan-in, so as to improve the
2023- * odds of reducing the other state to zero in-arcs or out-arcs and
2024- * thereby being able to delete it. However, if "from" is doomed (has no
2025- * non-EMPTY out-arcs), we must keep it so, so always push forward in that
2026- * case.
2027- *
2028- * The fan-out/fan-in comparison should count only non-EMPTY arcs. If
2029- * "from" is doomed, we can skip counting "to"'s arcs, since we want to
2030- * force taking the copyins path in that case.
2031- */
2032- fromouts = nonemptyouts (from );
2033- toins = (fromouts == 0 ) ? 1 : nonemptyins (to );
2034-
2035- if (fromouts > toins )
2036- {
2037- copyouts (nfa , to , from , 0 );
2038- return ;
2039- }
2040- if (fromouts < toins )
2041- {
2042- copyins (nfa , from , to , 0 );
2043- return ;
2044- }
2045-
2046- /*
2047- * fromouts == toins. Decide on secondary issue: copy fewest arcs.
2048- *
2049- * Doesn't seem to be worth the trouble to exclude empties from these
2050- * comparisons; that takes extra time and doesn't seem to improve the
2051- * resulting graph much.
2052- */
2053- if (from -> nins > to -> nouts )
2054- {
2055- copyouts (nfa , to , from , 0 );
2056- return ;
2057- }
2058- else
2059- {
2060- copyins (nfa , from , to , 0 );
2061- return ;
2062- }
2063- }
2064-
20652068/*
20662069 * isconstraintarc - detect whether an arc is of a constraint type
20672070 */
0 commit comments