Fix infinite-loop risk in fixempties() stage of regex compilation.

The previous coding of this function could get into situations where it
would never terminate, because successive passes would re-add EMPTY arcs
that had been removed by the previous pass.  Rewrite the function
completely using a new algorithm that is guaranteed to terminate, and
also seems to be usually faster than the old one.  Per Tcl bugs 3604074
and 3606683.

Tom Lane and Don Porter
pull/3/head
Tom Lane 13 years ago
parent 7ccefe8610
commit a7b61d4f5a
  1. 314
      src/backend/regex/regc_nfa.c
  2. 12
      src/backend/regex/regcomp.c
  3. 20
      src/test/regress/expected/regex.out
  4. 7
      src/test/regress/sql/regex.sql

@ -455,6 +455,56 @@ freearc(struct nfa * nfa,
from->free = victim;
}
/*
 * hasnonemptyout - test whether a state owns any out arc that is not EMPTY
 *
 * Walks the state's out-arc chain and returns 1 at the first arc whose
 * type differs from EMPTY; returns 0 if the chain ends without one
 * (including the case of a state with no out arcs at all).
 */
static int
hasnonemptyout(struct state * s)
{
	struct arc *cur = s->outs;

	while (cur != NULL)
	{
		if (cur->type != EMPTY)
			return 1;			/* one is enough; stop scanning */
		cur = cur->outchain;
	}
	return 0;
}
/*
 * nonemptyouts - return the number of out arcs of a state that are not EMPTY
 *
 * The whole out-arc chain is scanned; arcs of type EMPTY are skipped and
 * every other arc contributes one to the result.
 */
static int
nonemptyouts(struct state * s)
{
	struct arc *cur;
	int			count = 0;

	for (cur = s->outs; cur != NULL; cur = cur->outchain)
	{
		if (cur->type == EMPTY)
			continue;			/* EMPTY arcs are not counted */
		count++;
	}
	return count;
}
/*
 * nonemptyins - return the number of in arcs of a state that are not EMPTY
 *
 * The whole in-arc chain is scanned; arcs of type EMPTY are skipped and
 * every other arc contributes one to the result.
 */
static int
nonemptyins(struct state * s)
{
	struct arc *cur = s->ins;
	int			count = 0;

	while (cur != NULL)
	{
		if (cur->type != EMPTY)
			count++;
		cur = cur->inchain;
	}
	return count;
}
/*
* findarc - find arc, if any, from given source with given type and color
* If there is more than one such arc, the result is random.
@ -511,19 +561,25 @@ moveins(struct nfa * nfa,
}
/*
* copyins - copy all in arcs of a state to another state
* copyins - copy in arcs of a state to another state
*
* Either all arcs, or only non-empty ones as determined by all value.
*/
static void
copyins(struct nfa * nfa,
struct state * oldState,
struct state * newState)
struct state * newState,
int all)
{
struct arc *a;
assert(oldState != newState);
for (a = oldState->ins; a != NULL; a = a->inchain)
cparc(nfa, a, a->from, newState);
{
if (all || a->type != EMPTY)
cparc(nfa, a, a->from, newState);
}
}
/*
@ -546,19 +602,25 @@ moveouts(struct nfa * nfa,
}
/*
* copyouts - copy all out arcs of a state to another state
* copyouts - copy out arcs of a state to another state
*
* Either all arcs, or only non-empty ones as determined by all value.
*/
static void
copyouts(struct nfa * nfa,
struct state * oldState,
struct state * newState)
struct state * newState,
int all)
{
struct arc *a;
assert(oldState != newState);
for (a = oldState->outs; a != NULL; a = a->outchain)
cparc(nfa, a, newState, a->to);
{
if (all || a->type != EMPTY)
cparc(nfa, a, newState, a->to);
}
}
/*
@ -881,7 +943,7 @@ pull(struct nfa * nfa,
if (NISERR())
return 0;
assert(to != from); /* con is not an inarc */
copyins(nfa, from, s); /* duplicate inarcs */
copyins(nfa, from, s, 1); /* duplicate inarcs */
cparc(nfa, con, s, to); /* move constraint arc */
freearc(nfa, con);
from = s;
@ -1027,7 +1089,7 @@ push(struct nfa * nfa,
s = newstate(nfa);
if (NISERR())
return 0;
copyouts(nfa, to, s); /* duplicate outarcs */
copyouts(nfa, to, s, 1); /* duplicate outarcs */
cparc(nfa, con, from, s); /* move constraint */
freearc(nfa, con);
to = s;
@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa,
FILE *f) /* for debug output; NULL none */
{
struct state *s;
struct state *s2;
struct state *nexts;
struct arc *a;
struct arc *nexta;
int progress;
/* find and eliminate empties until there are no more */
do
/*
* First, get rid of any states whose sole out-arc is an EMPTY, since
* they're basically just aliases for their successor. The parsing
* algorithm creates enough of these that it's worth special-casing this.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
{
progress = 0;
for (s = nfa->states; s != NULL && !NISERR() &&
s->no != FREESTATE; s = nexts)
nexts = s->next;
if (s->flag || s->nouts != 1)
continue;
a = s->outs;
assert(a != NULL && a->outchain == NULL);
if (a->type != EMPTY)
continue;
if (s != a->to)
moveins(nfa, s, a->to);
dropstate(nfa, s);
}
/*
* Similarly, get rid of any state with a single EMPTY in-arc, by folding
* it into its predecessor.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
{
nexts = s->next;
/* while we're at it, ensure tmp fields are clear for next step */
assert(s->tmp == NULL);
if (s->flag || s->nins != 1)
continue;
a = s->ins;
assert(a != NULL && a->inchain == NULL);
if (a->type != EMPTY)
continue;
if (s != a->from)
moveouts(nfa, s, a->from);
dropstate(nfa, s);
}
/*
* For each remaining NFA state, find all other states that are reachable
* from it by a chain of one or more EMPTY arcs. Then generate new arcs
* that eliminate the need for each such chain.
*
* If we just do this straightforwardly, the algorithm gets slow in
* complex graphs, because the same arcs get copied to all intermediate
* states of an EMPTY chain, and then uselessly pushed repeatedly to the
* chain's final state; we waste a lot of time in newarc's duplicate
* checking. To improve matters, we decree that any state with only EMPTY
* out-arcs is "doomed" and will not be part of the final NFA. That can be
* ensured by not adding any new out-arcs to such a state. Having ensured
* that, we need not update the state's in-arcs list either; all arcs that
* might have gotten pushed forward to it will just get pushed directly to
* successor states. This eliminates most of the useless duplicate arcs.
*/
for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
{
for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts)
{
nexts = s->next;
for (a = s->outs; a != NULL && !NISERR(); a = nexta)
{
nexta = a->outchain;
if (a->type == EMPTY && unempty(nfa, a))
progress = 1;
assert(nexta == NULL || s->no != FREESTATE);
}
/*
* If s2 is doomed, we decide that (1) we will always push arcs
* forward to it, not pull them back to s; and (2) we can optimize
* away the push-forward, per comment above. So do nothing.
*/
if (s2->flag || hasnonemptyout(s2))
replaceempty(nfa, s, s2);
/* Reset the tmp fields as we walk back */
nexts = s2->tmp;
s2->tmp = NULL;
}
if (progress && f != NULL)
dumpnfa(nfa, f);
} while (progress && !NISERR());
s->tmp = NULL;
}
if (NISERR())
return;
/*
* Now remove all the EMPTY arcs, since we don't need them anymore.
*/
for (s = nfa->states; s != NULL; s = s->next)
{
for (a = s->outs; a != NULL; a = nexta)
{
nexta = a->outchain;
if (a->type == EMPTY)
freearc(nfa, a);
}
}
/*
* And remove any states that have become useless. (This cleanup is not
* very thorough, and would be even less so if we tried to combine it with
* the previous step; but cleanup() will take care of anything we miss.)
*/
for (s = nfa->states; s != NULL; s = nexts)
{
nexts = s->next;
if ((s->nins == 0 || s->nouts == 0) && !s->flag)
dropstate(nfa, s);
}
if (f != NULL)
dumpnfa(nfa, f);
}
/*
* unempty - optimize out an EMPTY arc, if possible
* emptyreachable - recursively find all states reachable from s by EMPTY arcs
*
* The return value is the last such state found. Its tmp field links back
* to the next-to-last such state, and so on back to s, so that all these
* states can be located without searching the whole NFA.
*
* Actually, as it stands this function always succeeds, but the return
* value is kept with an eye on possible future changes.
* The maximum recursion depth here is equal to the length of the longest
* loop-free chain of EMPTY arcs, which is surely no more than the size of
* the NFA, and in practice will be a lot less than that.
*/
static int /* 0 couldn't, 1 could */
unempty(struct nfa * nfa,
struct arc * a)
static struct state *
emptyreachable(struct state * s, struct state * lastfound)
{
struct state *from = a->from;
struct state *to = a->to;
int usefrom; /* work on from, as opposed to to? */
assert(a->type == EMPTY);
assert(from != nfa->pre && to != nfa->post);
struct arc *a;
if (from == to)
{ /* vacuous loop */
freearc(nfa, a);
return 1;
s->tmp = lastfound;
lastfound = s;
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->type == EMPTY && a->to->tmp == NULL)
lastfound = emptyreachable(a->to, lastfound);
}
return lastfound;
}
/* decide which end to work on */
usefrom = 1; /* default: attack from */
if (from->nouts > to->nins)
usefrom = 0;
else if (from->nouts == to->nins)
/*
* replaceempty - replace an EMPTY arc chain with some non-empty arcs
*
* The EMPTY arc(s) should be deleted later, but we can't do it here because
* they may still be needed to identify other arc chains during fixempties().
*/
static void
replaceempty(struct nfa * nfa,
struct state * from,
struct state * to)
{
int fromouts;
int toins;
assert(from != to);
/*
* Create replacement arcs that bypass the need for the EMPTY chain. We
* can do this either by pushing arcs forward (linking directly from
* "from"'s predecessors to "to") or by pulling them back (linking
* directly from "from" to "to"'s successors). In general, we choose
* whichever way creates greater fan-out or fan-in, so as to improve the
* odds of reducing the other state to zero in-arcs or out-arcs and
* thereby being able to delete it. However, if "from" is doomed (has no
* non-EMPTY out-arcs), we must keep it so, so always push forward in that
* case.
*
* The fan-out/fan-in comparison should count only non-EMPTY arcs. If
* "from" is doomed, we can skip counting "to"'s arcs, since we want to
* force taking the copyins path in that case.
*/
fromouts = nonemptyouts(from);
toins = (fromouts == 0) ? 1 : nonemptyins(to);
if (fromouts > toins)
{
/* decide on secondary issue: move/copy fewest arcs */
if (from->nins > to->nouts)
usefrom = 0;
copyouts(nfa, to, from, 0);
return;
}
if (fromouts < toins)
{
copyins(nfa, from, to, 0);
return;
}
freearc(nfa, a);
if (usefrom)
/*
* fromouts == toins. Decide on secondary issue: copy fewest arcs.
*
* Doesn't seem to be worth the trouble to exclude empties from these
* comparisons; that takes extra time and doesn't seem to improve the
* resulting graph much.
*/
if (from->nins > to->nouts)
{
if (from->nouts == 0)
{
/* was the state's only outarc */
moveins(nfa, from, to);
freestate(nfa, from);
}
else
copyins(nfa, from, to);
copyouts(nfa, to, from, 0);
return;
}
else
{
if (to->nins == 0)
{
/* was the state's only inarc */
moveouts(nfa, to, from);
freestate(nfa, to);
}
else
copyouts(nfa, to, from);
copyins(nfa, from, to, 0);
return;
}
return 1;
}
/*

@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *);
static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
static struct arc *allocarc(struct nfa *, struct state *);
static void freearc(struct nfa *, struct arc *);
static int hasnonemptyout(struct state *);
static int nonemptyouts(struct state *);
static int nonemptyins(struct state *);
static struct arc *findarc(struct state *, int, pcolor);
static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
static void moveins(struct nfa *, struct state *, struct state *);
static void copyins(struct nfa *, struct state *, struct state *);
static void copyins(struct nfa *, struct state *, struct state *, int);
static void moveouts(struct nfa *, struct state *, struct state *);
static void copyouts(struct nfa *, struct state *, struct state *);
static void copyouts(struct nfa *, struct state *, struct state *, int);
static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
static void delsub(struct nfa *, struct state *, struct state *);
static void deltraverse(struct nfa *, struct state *, struct state *);
@ -146,7 +149,8 @@ static int push(struct nfa *, struct arc *);
#define COMPATIBLE 3 /* compatible but not satisfied yet */
static int combine(struct arc *, struct arc *);
static void fixempties(struct nfa *, FILE *);
static int unempty(struct nfa *, struct arc *);
static struct state *emptyreachable(struct state *, struct state *);
static void replaceempty(struct nfa *, struct state *, struct state *);
static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@ -583,7 +587,7 @@ makesearch(struct vars * v,
for (s = slist; s != NULL; s = s2)
{
s2 = newstate(nfa);
copyouts(nfa, s, s2);
copyouts(nfa, s, s2, 1);
for (a = s->ins; a != NULL; a = b)
{
b = a->inchain;

@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
Filter: (proname ~ '^(abc)?d'::text)
(2 rows)
-- Test for infinite loop in pullback() (CVE-2007-4772)
select 'a' ~ '($|^)*';
?column?
----------
t
(1 row)
-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
select 'a' ~ '((((((a)*)*)*)*)*)*';
?column?
----------
t
(1 row)
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
?column?
----------
t
(1 row)

@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d';
explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
-- Test for infinite loop in pullback() (CVE-2007-4772)
select 'a' ~ '($|^)*';
-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
select 'a' ~ '((((((a)*)*)*)*)*)*';
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';

Loading…
Cancel
Save