mirror of https://github.com/postgres/postgres
Back-patch of commits 628cbb50ba and c6aae3042b (REL8_4_STABLE). This has been broken since 7.3, so back-patch to all supported branches.
parent
b84da909d9
commit
b9edaa784e
@ -0,0 +1,256 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* regprefix.c |
||||
* Extract a common prefix, if any, from a compiled regex. |
||||
* |
||||
* |
||||
* Portions Copyright (c) 2012, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1998, 1999 Henry Spencer |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/regex/regprefix.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
|
||||
#include "regex/regguts.h" |
||||
|
||||
|
||||
/*
|
||||
* forward declarations |
||||
*/ |
||||
static int findprefix(struct cnfa * cnfa, struct colormap * cm, |
||||
chr *string, size_t *slength); |
||||
|
||||
|
||||
/*
|
||||
* pg_regprefix - get common prefix for regular expression |
||||
* |
||||
* Returns one of: |
||||
* REG_NOMATCH: there is no common prefix of strings matching the regex |
||||
* REG_PREFIX: there is a common prefix of strings matching the regex |
||||
* REG_EXACT: all strings satisfying the regex must match the same string |
||||
* or a REG_XXX error code |
||||
* |
||||
* In the non-failure cases, *string is set to a malloc'd string containing |
||||
* the common prefix or exact value, of length *slength (measured in chrs |
||||
* not bytes!). |
||||
* |
||||
* This function does not analyze all complex cases (such as lookahead |
||||
* constraints) exactly. Therefore it is possible that some strings matching |
||||
* the reported prefix or exact-match string do not satisfy the regex. But |
||||
* it should never be the case that a string satisfying the regex does not |
||||
* match the reported prefix or exact-match string. |
||||
*/ |
||||
int |
||||
pg_regprefix(regex_t *re, |
||||
chr **string, |
||||
size_t *slength) |
||||
{ |
||||
struct guts *g; |
||||
struct cnfa *cnfa; |
||||
int st; |
||||
|
||||
/* sanity checks */ |
||||
if (string == NULL || slength == NULL) |
||||
return REG_INVARG; |
||||
*string = NULL; /* initialize for failure cases */ |
||||
*slength = 0; |
||||
if (re == NULL || re->re_magic != REMAGIC) |
||||
return REG_INVARG; |
||||
if (re->re_csize != sizeof(chr)) |
||||
return REG_MIXED; |
||||
|
||||
/* setup */ |
||||
g = (struct guts *) re->re_guts; |
||||
if (g->info & REG_UIMPOSSIBLE) |
||||
return REG_NOMATCH; |
||||
|
||||
/*
|
||||
* This implementation considers only the search NFA for the topmost regex |
||||
* tree node. Therefore, constraints such as backrefs are not fully |
||||
* applied, which is allowed per the function's API spec. |
||||
*/ |
||||
assert(g->tree != NULL); |
||||
cnfa = &g->tree->cnfa; |
||||
|
||||
/*
|
||||
* Since a correct NFA should never contain any exit-free loops, it should |
||||
* not be possible for our traversal to return to a previously visited |
||||
* NFA state. Hence we need at most nstates chrs in the output string. |
||||
*/ |
||||
*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); |
||||
if (*string == NULL) |
||||
return REG_ESPACE; |
||||
|
||||
/* do it */ |
||||
st = findprefix(cnfa, &g->cmap, *string, slength); |
||||
|
||||
assert(*slength <= cnfa->nstates); |
||||
|
||||
/* clean up */ |
||||
if (st != REG_PREFIX && st != REG_EXACT) |
||||
{ |
||||
FREE(*string); |
||||
*string = NULL; |
||||
*slength = 0; |
||||
} |
||||
|
||||
return st; |
||||
} |
||||
|
||||
/*
|
||||
* findprefix - extract common prefix from cNFA |
||||
* |
||||
* Results are returned into the preallocated chr array string[], with |
||||
* *slength (which must be preset to zero) incremented for each chr. |
||||
*/ |
||||
static int /* regprefix return code */ |
||||
findprefix(struct cnfa * cnfa, |
||||
struct colormap * cm, |
||||
chr *string, |
||||
size_t *slength) |
||||
{ |
||||
int st; |
||||
int nextst; |
||||
color thiscolor; |
||||
chr c; |
||||
struct carc *ca; |
||||
|
||||
/*
|
||||
* The "pre" state must have only BOS/BOL outarcs, else pattern isn't |
||||
* anchored left. If we have both BOS and BOL, they must go to the |
||||
* same next state. |
||||
*/ |
||||
st = cnfa->pre; |
||||
nextst = -1; |
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
||||
{ |
||||
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
||||
{ |
||||
if (nextst == -1) |
||||
nextst = ca->to; |
||||
else if (nextst != ca->to) |
||||
return REG_NOMATCH; |
||||
} |
||||
else |
||||
return REG_NOMATCH; |
||||
} |
||||
if (nextst == -1) |
||||
return REG_NOMATCH; |
||||
|
||||
/*
|
||||
* Scan through successive states, stopping as soon as we find one with |
||||
* more than one acceptable transition character (either multiple colors |
||||
* on out-arcs, or a color with more than one member chr). |
||||
* |
||||
* We could find a state with multiple out-arcs that are all labeled with |
||||
* the same singleton color; this comes from patterns like "^ab(cde|cxy)". |
||||
* In that case we add the chr "c" to the output string but then exit the |
||||
* loop with nextst == -1. This leaves a little bit on the table: if the |
||||
* pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added |
||||
* to the prefix. But chasing multiple parallel state chains doesn't seem |
||||
* worth the trouble. |
||||
*/ |
||||
do |
||||
{ |
||||
st = nextst; |
||||
nextst = -1; |
||||
thiscolor = COLORLESS; |
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
||||
{ |
||||
/* We ignore lookahead constraints */ |
||||
if (ca->co >= cnfa->ncolors) |
||||
continue; |
||||
/* We can also ignore BOS/BOL arcs */ |
||||
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) |
||||
continue; |
||||
/* ... but EOS/EOL arcs terminate the search */ |
||||
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) |
||||
{ |
||||
thiscolor = COLORLESS; |
||||
break; |
||||
} |
||||
if (thiscolor == COLORLESS) |
||||
{ |
||||
/* First plain outarc */ |
||||
thiscolor = ca->co; |
||||
nextst = ca->to; |
||||
} |
||||
else if (thiscolor == ca->co) |
||||
{ |
||||
/* Another plain outarc for same color */ |
||||
nextst = -1; |
||||
} |
||||
else |
||||
{ |
||||
/* More than one plain outarc color terminates the search */ |
||||
thiscolor = COLORLESS; |
||||
break; |
||||
} |
||||
} |
||||
/* Done if we didn't find exactly one color on plain outarcs */ |
||||
if (thiscolor == COLORLESS) |
||||
break; |
||||
/* The color must be a singleton */ |
||||
if (cm->cd[thiscolor].nchrs != 1) |
||||
break; |
||||
|
||||
/*
|
||||
* Identify the color's sole member chr and add it to the prefix |
||||
* string. In general the colormap data structure doesn't provide a |
||||
* way to find color member chrs, except by trying GETCOLOR() on each |
||||
* possible chr value, which won't do at all. However, for the cases |
||||
* we care about it should be sufficient to test the "firstchr" value, |
||||
* that is the first chr ever added to the color. There are cases |
||||
* where this might no longer be a member of the color (so we do need |
||||
* to test), but none of them are likely to arise for a character that |
||||
* is a member of a common prefix. If we do hit such a corner case, |
||||
* we just fall out without adding anything to the prefix string. |
||||
*/ |
||||
c = cm->cd[thiscolor].firstchr; |
||||
if (GETCOLOR(cm, c) != thiscolor) |
||||
break; |
||||
|
||||
string[(*slength)++] = c; |
||||
|
||||
/* Advance to next state, but only if we have a unique next state */ |
||||
} while (nextst != -1); |
||||
|
||||
/*
|
||||
* If we ended at a state that only has EOS/EOL outarcs leading to the |
||||
* "post" state, then we have an exact-match string. Note this is true |
||||
* even if the string is of zero length. |
||||
*/ |
||||
nextst = -1; |
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
||||
{ |
||||
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) |
||||
{ |
||||
if (nextst == -1) |
||||
nextst = ca->to; |
||||
else if (nextst != ca->to) |
||||
{ |
||||
nextst = -1; |
||||
break; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
nextst = -1; |
||||
break; |
||||
} |
||||
} |
||||
if (nextst == cnfa->post) |
||||
return REG_EXACT; |
||||
|
||||
/*
|
||||
* Otherwise, if we were unable to identify any prefix characters, say |
||||
* NOMATCH --- the pattern is anchored left, but doesn't specify any |
||||
* particular first character. |
||||
*/ |
||||
if (*slength > 0) |
||||
return REG_PREFIX; |
||||
|
||||
return REG_NOMATCH; |
||||
} |
||||
Loading…
Reference in new issue