Fix infinite-loop risk in fixempties() stage of regex compilation.

The previous coding of this function could get into situations where it would never terminate, because successive passes would re-add EMPTY arcs that had been removed by the previous pass. Rewrite the function completely using a new algorithm that is guaranteed to terminate, and also seems to be usually faster than the old one. Per Tcl bugs 3604074 and 3606683. Tom Lane and Don Porter
13 years ago · a7b61d4f5a
parent 7ccefe8610
commit a7b61d4f5a
4 changed files with 280 additions and 73 deletions
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -455,6 +455,56 @@ freearc(struct nfa * nfa,
 	from->free = victim;
 }

+/*
+ * hasnonemptyout - Does state have a non-EMPTY out arc?
+ */
+static int
+hasnonemptyout(struct state * s)
+{
+	struct arc *a;
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type != EMPTY)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * nonemptyouts - count non-EMPTY out arcs of a state
+ */
+static int
+nonemptyouts(struct state * s)
+{
+	int			n = 0;
+	struct arc *a;
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type != EMPTY)
+			n++;
+	}
+	return n;
+}
+
+/*
+ * nonemptyins - count non-EMPTY in arcs of a state
+ */
+static int
+nonemptyins(struct state * s)
+{
+	int			n = 0;
+	struct arc *a;
+
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->type != EMPTY)
+			n++;
+	}
+	return n;
+}
+
 /*
 * findarc - find arc, if any, from given source with given type and color
 * If there is more than one such arc, the result is random.
@@ -511,19 +561,25 @@ moveins(struct nfa * nfa,
 }

 /*
- * copyins - copy all in arcs of a state to another state
+ * copyins - copy in arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
 */
 static void
 copyins(struct nfa * nfa,
 		struct state * oldState,
-		struct state * newState)
+		struct state * newState,
+		int all)
 {
 	struct arc *a;

 	assert(oldState != newState);

 	for (a = oldState->ins; a != NULL; a = a->inchain)
-		cparc(nfa, a, a->from, newState);
+	{
+		if (all || a->type != EMPTY)
+			cparc(nfa, a, a->from, newState);
+	}
 }

 /*
@@ -546,19 +602,25 @@ moveouts(struct nfa * nfa,
 }

 /*
- * copyouts - copy all out arcs of a state to another state
+ * copyouts - copy out arcs of a state to another state
+ *
+ * Either all arcs, or only non-empty ones as determined by all value.
 */
 static void
 copyouts(struct nfa * nfa,
 		 struct state * oldState,
-		 struct state * newState)
+		 struct state * newState,
+		 int all)
 {
 	struct arc *a;

 	assert(oldState != newState);

 	for (a = oldState->outs; a != NULL; a = a->outchain)
-		cparc(nfa, a, newState, a->to);
+	{
+		if (all || a->type != EMPTY)
+			cparc(nfa, a, newState, a->to);
+	}
 }

 /*
@@ -881,7 +943,7 @@ pull(struct nfa * nfa,
 		if (NISERR())
 			return 0;
 		assert(to != from);		/* con is not an inarc */
-		copyins(nfa, from, s);	/* duplicate inarcs */
+		copyins(nfa, from, s, 1);		/* duplicate inarcs */
 		cparc(nfa, con, s, to); /* move constraint arc */
 		freearc(nfa, con);
 		from = s;
@@ -1027,7 +1089,7 @@ push(struct nfa * nfa,
 		s = newstate(nfa);
 		if (NISERR())
 			return 0;
-		copyouts(nfa, to, s);	/* duplicate outarcs */
+		copyouts(nfa, to, s, 1);	/* duplicate outarcs */
 		cparc(nfa, con, from, s);		/* move constraint */
 		freearc(nfa, con);
 		to = s;
@@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa,
 		   FILE *f)				/* for debug output; NULL none */
 {
 	struct state *s;
+	struct state *s2;
 	struct state *nexts;
 	struct arc *a;
 	struct arc *nexta;
-	int			progress;

-	/* find and eliminate empties until there are no more */
-	do
+	/*
+	 * First, get rid of any states whose sole out-arc is an EMPTY, since
+	 * they're basically just aliases for their successor.  The parsing
+	 * algorithm creates enough of these that it's worth special-casing this.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
 	{
-		progress = 0;
-		for (s = nfa->states; s != NULL && !NISERR() &&
-			 s->no != FREESTATE; s = nexts)
+		nexts = s->next;
+		if (s->flag || s->nouts != 1)
+			continue;
+		a = s->outs;
+		assert(a != NULL && a->outchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->to)
+			moveins(nfa, s, a->to);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+	 * it into its predecessor.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		nexts = s->next;
+		/* while we're at it, ensure tmp fields are clear for next step */
+		assert(s->tmp == NULL);
+		if (s->flag || s->nins != 1)
+			continue;
+		a = s->ins;
+		assert(a != NULL && a->inchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->from)
+			moveouts(nfa, s, a->from);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * For each remaining NFA state, find all other states that are reachable
+	 * from it by a chain of one or more EMPTY arcs.  Then generate new arcs
+	 * that eliminate the need for each such chain.
+	 *
+	 * If we just do this straightforwardly, the algorithm gets slow in
+	 * complex graphs, because the same arcs get copied to all intermediate
+	 * states of an EMPTY chain, and then uselessly pushed repeatedly to the
+	 * chain's final state; we waste a lot of time in newarc's duplicate
+	 * checking.  To improve matters, we decree that any state with only EMPTY
+	 * out-arcs is "doomed" and will not be part of the final NFA. That can be
+	 * ensured by not adding any new out-arcs to such a state. Having ensured
+	 * that, we need not update the state's in-arcs list either; all arcs that
+	 * might have gotten pushed forward to it will just get pushed directly to
+	 * successor states.  This eliminates most of the useless duplicate arcs.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+	{
+		for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts)
 		{
-			nexts = s->next;
-			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
-			{
-				nexta = a->outchain;
-				if (a->type == EMPTY && unempty(nfa, a))
-					progress = 1;
-				assert(nexta == NULL || s->no != FREESTATE);
-			}
+			/*
+			 * If s2 is doomed, we decide that (1) we will always push arcs
+			 * forward to it, not pull them back to s; and (2) we can optimize
+			 * away the push-forward, per comment above.  So do nothing.
+			 */
+			if (s2->flag || hasnonemptyout(s2))
+				replaceempty(nfa, s, s2);
+
+			/* Reset the tmp fields as we walk back */
+			nexts = s2->tmp;
+			s2->tmp = NULL;
 		}
-		if (progress && f != NULL)
-			dumpnfa(nfa, f);
-	} while (progress && !NISERR());
+		s->tmp = NULL;
+	}
+
+	if (NISERR())
+		return;
+
+	/*
+	 * Now remove all the EMPTY arcs, since we don't need them anymore.
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		for (a = s->outs; a != NULL; a = nexta)
+		{
+			nexta = a->outchain;
+			if (a->type == EMPTY)
+				freearc(nfa, a);
+		}
+	}
+
+	/*
+	 * And remove any states that have become useless.	(This cleanup is not
+	 * very thorough, and would be even less so if we tried to combine it with
+	 * the previous step; but cleanup() will take care of anything we miss.)
+	 */
+	for (s = nfa->states; s != NULL; s = nexts)
+	{
+		nexts = s->next;
+		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+			dropstate(nfa, s);
+	}
+
+	if (f != NULL)
+		dumpnfa(nfa, f);
 }

 /*
- * unempty - optimize out an EMPTY arc, if possible
+ * emptyreachable - recursively find all states reachable from s by EMPTY arcs
+ *
+ * The return value is the last such state found.  Its tmp field links back
+ * to the next-to-last such state, and so on back to s, so that all these
+ * states can be located without searching the whole NFA.
 *
- * Actually, as it stands this function always succeeds, but the return
- * value is kept with an eye on possible future changes.
+ * The maximum recursion depth here is equal to the length of the longest
+ * loop-free chain of EMPTY arcs, which is surely no more than the size of
+ * the NFA, and in practice will be a lot less than that.
 */
-static int						/* 0 couldn't, 1 could */
-unempty(struct nfa * nfa,
-		struct arc * a)
+static struct state *
+emptyreachable(struct state * s, struct state * lastfound)
 {
-	struct state *from = a->from;
-	struct state *to = a->to;
-	int			usefrom;		/* work on from, as opposed to to? */
-
-	assert(a->type == EMPTY);
-	assert(from != nfa->pre && to != nfa->post);
+	struct arc *a;

-	if (from == to)
-	{							/* vacuous loop */
-		freearc(nfa, a);
-		return 1;
+	s->tmp = lastfound;
+	lastfound = s;
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type == EMPTY && a->to->tmp == NULL)
+			lastfound = emptyreachable(a->to, lastfound);
 	}
+	return lastfound;
+}

-	/* decide which end to work on */
-	usefrom = 1;				/* default:  attack from */
-	if (from->nouts > to->nins)
-		usefrom = 0;
-	else if (from->nouts == to->nins)
+/*
+ * replaceempty - replace an EMPTY arc chain with some non-empty arcs
+ *
+ * The EMPTY arc(s) should be deleted later, but we can't do it here because
+ * they may still be needed to identify other arc chains during fixempties().
+ */
+static void
+replaceempty(struct nfa * nfa,
+			 struct state * from,
+			 struct state * to)
+{
+	int			fromouts;
+	int			toins;
+
+	assert(from != to);
+
+	/*
+	 * Create replacement arcs that bypass the need for the EMPTY chain.  We
+	 * can do this either by pushing arcs forward (linking directly from
+	 * "from"'s predecessors to "to") or by pulling them back (linking
+	 * directly from "from" to "to"'s successors).  In general, we choose
+	 * whichever way creates greater fan-out or fan-in, so as to improve the
+	 * odds of reducing the other state to zero in-arcs or out-arcs and
+	 * thereby being able to delete it.  However, if "from" is doomed (has no
+	 * non-EMPTY out-arcs), we must keep it so, so always push forward in that
+	 * case.
+	 *
+	 * The fan-out/fan-in comparison should count only non-EMPTY arcs.	If
+	 * "from" is doomed, we can skip counting "to"'s arcs, since we want to
+	 * force taking the copyins path in that case.
+	 */
+	fromouts = nonemptyouts(from);
+	toins = (fromouts == 0) ? 1 : nonemptyins(to);
+
+	if (fromouts > toins)
 	{
-		/* decide on secondary issue:  move/copy fewest arcs */
-		if (from->nins > to->nouts)
-			usefrom = 0;
+		copyouts(nfa, to, from, 0);
+		return;
+	}
+	if (fromouts < toins)
+	{
+		copyins(nfa, from, to, 0);
+		return;
 	}

-	freearc(nfa, a);
-	if (usefrom)
+	/*
+	 * fromouts == toins.  Decide on secondary issue: copy fewest arcs.
+	 *
+	 * Doesn't seem to be worth the trouble to exclude empties from these
+	 * comparisons; that takes extra time and doesn't seem to improve the
+	 * resulting graph much.
+	 */
+	if (from->nins > to->nouts)
 	{
-		if (from->nouts == 0)
-		{
-			/* was the state's only outarc */
-			moveins(nfa, from, to);
-			freestate(nfa, from);
-		}
-		else
-			copyins(nfa, from, to);
+		copyouts(nfa, to, from, 0);
+		return;
 	}
 	else
 	{
-		if (to->nins == 0)
-		{
-			/* was the state's only inarc */
-			moveouts(nfa, to, from);
-			freestate(nfa, to);
-		}
-		else
-			copyouts(nfa, to, from);
+		copyins(nfa, from, to, 0);
+		return;
 	}
-
-	return 1;
 }

 /*
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *);
 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
 static struct arc *allocarc(struct nfa *, struct state *);
 static void freearc(struct nfa *, struct arc *);
+static int	hasnonemptyout(struct state *);
+static int	nonemptyouts(struct state *);
+static int	nonemptyins(struct state *);
 static struct arc *findarc(struct state *, int, pcolor);
 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
 static void moveins(struct nfa *, struct state *, struct state *);
-static void copyins(struct nfa *, struct state *, struct state *);
+static void copyins(struct nfa *, struct state *, struct state *, int);
 static void moveouts(struct nfa *, struct state *, struct state *);
-static void copyouts(struct nfa *, struct state *, struct state *);
+static void copyouts(struct nfa *, struct state *, struct state *, int);
 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 static void delsub(struct nfa *, struct state *, struct state *);
 static void deltraverse(struct nfa *, struct state *, struct state *);
@@ -146,7 +149,8 @@ static int	push(struct nfa *, struct arc *);
 #define COMPATIBLE	3			/* compatible but not satisfied yet */
 static int	combine(struct arc *, struct arc *);
 static void fixempties(struct nfa *, FILE *);
-static int	unempty(struct nfa *, struct arc *);
+static struct state *emptyreachable(struct state *, struct state *);
+static void replaceempty(struct nfa *, struct state *, struct state *);
 static void cleanup(struct nfa *);
 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@@ -583,7 +587,7 @@ makesearch(struct vars * v,
 	for (s = slist; s != NULL; s = s2)
 	{
 		s2 = newstate(nfa);
-		copyouts(nfa, s, s2);
+		copyouts(nfa, s, s2, 1);
 		for (a = s->ins; a != NULL; a = b)
 		{
 			b = a->inchain;
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
   Filter: (proname ~ '^(abc)?d'::text)
 (2 rows)

+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
+ ?column? 
+----------
+ t
+(1 row)
+
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
 explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
+
+-- Test for infinite loop in pullback() (CVE-2007-4772)
+select 'a' ~ '($|^)*';
+
+-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
+select 'a' ~ '((((((a)*)*)*)*)*)*';
+select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';