#ifndef _REGEX_H_
#define _REGEX_H_				/* never again */
/*
 * regular expressions
 *
 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
 *
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results. The author
 * thanks all of them.
 *
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 *
 * I'd appreciate being given credit for this package in the documentation
 * of software which uses it, but that is not a requirement.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * src/include/regex/regex.h
 */

/*
 * Add your own defines, if needed, here.
 */
#include "mb/pg_wchar.h"

/*
 * interface types etc.
 */

/*
 * regoff_t has to be large enough to hold either off_t or ssize_t,
 * and must be signed; it's only a guess that long is suitable.
 *
 * It is the type of the rm_so/rm_eo offsets reported in regmatch_t.
 */
typedef long regoff_t;

/*
 * other interface types
 */

/* the biggie, a compiled RE (or rather, a front end to same) */
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
int re_magic; /* magic number */
|
|
|
|
size_t re_nsub; /* number of subexpressions */
|
Avoid generating extra subre tree nodes for capturing parentheses.
Previously, each pair of capturing parentheses gave rise to a separate
subre tree node, whose only function was to identify that we ought to
capture the match details for this particular sub-expression. In
most cases we don't really need that, since we can perfectly well
put a "capture this" annotation on the child node that does the real
matching work. As with the two preceding commits, the main value
of this is to avoid generating and optimizing an NFA for a tree node
that's not really pulling its weight.
The chosen data representation only allows one capture annotation
per subre node. In the legal-per-spec, but seemingly not very useful,
case where there are multiple capturing parens around the exact same
bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture
nodes that act the same as before. We could work harder at that but
I'll refrain, pending some evidence that such cases are worth troubling
over.
In passing, improve the comments in regex.h to say what all the
different re_info bits mean. Some of them were pretty obvious
but others not so much, so reverse-engineer some documentation.
This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
4 years ago
|
|
|
long re_info; /* bitmask of the following flags: */
|
|
|
|
#define REG_UBACKREF 000001 /* has back-reference (\n) */
|
|
|
|
#define REG_ULOOKAROUND 000002 /* has lookahead/lookbehind constraint */
|
|
|
|
#define REG_UBOUNDS 000004 /* has bounded quantifier ({m,n}) */
|
|
|
|
#define REG_UBRACES 000010 /* has { that doesn't begin a quantifier */
|
|
|
|
#define REG_UBSALNUM 000020 /* has backslash-alphanumeric in non-ARE */
|
|
|
|
#define REG_UPBOTCH 000040 /* has unmatched right paren in ERE (legal
|
|
|
|
* per spec, but that was a mistake) */
|
|
|
|
#define REG_UBBS 000100 /* has backslash within bracket expr */
|
|
|
|
#define REG_UNONPOSIX 000200 /* has any construct that extends POSIX */
|
|
|
|
#define REG_UUNSPEC 000400 /* has any case disallowed by POSIX, e.g.
|
|
|
|
* an empty branch */
|
|
|
|
#define REG_UUNPORT 001000 /* has numeric character code dependency */
|
|
|
|
#define REG_ULOCALE 002000 /* has locale dependency */
|
|
|
|
#define REG_UEMPTYMATCH 004000 /* can match a zero-length string */
|
|
|
|
#define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */
|
|
|
|
#define REG_USHORTEST 020000 /* has non-greedy quantifier */
|
|
|
|
int re_csize; /* sizeof(character) */
|
|
|
|
char *re_endp; /* backward compatibility kludge */
|
|
|
|
Oid re_collation; /* Collation that defines LC_CTYPE behavior */
|
|
|
|
/* the rest is opaque pointers to hidden innards */
|
|
|
|
char *re_guts; /* `char *' is more portable than `void *' */
|
|
|
|
char *re_fns;
|
|
|
|
} regex_t;
|
|
|
|
|
|
|
|
/* result reporting (may acquire more fields later) */
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
regoff_t rm_so; /* start of substring */
|
|
|
|
regoff_t rm_eo; /* end of substring */
|
|
|
|
} regmatch_t;
|
|
|
|
|
|
|
|
/* supplementary control and reporting */
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
regmatch_t rm_extend; /* see REG_EXPECT */
|
|
|
|
} rm_detail_t;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * regex compilation flags
 *
 * These are octal bit masks.  REG_ADVANCED is the combination of
 * REG_EXTENDED and REG_ADVF, and REG_NEWLINE is the combination of
 * REG_NLSTOP and REG_NLANCH.
 */
#define REG_BASIC	000000		/* BREs (convenience) */
#define REG_EXTENDED	000001	/* EREs */
#define REG_ADVF	000002		/* advanced features in EREs */
#define REG_ADVANCED	000003	/* AREs (which are also EREs) */
#define REG_QUOTE	000004		/* no special characters, none */
#define REG_NOSPEC	REG_QUOTE	/* historical synonym */
#define REG_ICASE	000010		/* ignore case */
#define REG_NOSUB	000020		/* don't care about subexpressions */
#define REG_EXPANDED	000040	/* expanded format, white space & comments */
#define REG_NLSTOP	000100		/* \n doesn't match . or [^ ] */
#define REG_NLANCH	000200		/* ^ matches after \n, $ before */
#define REG_NEWLINE	000300		/* newlines are line terminators */
#define REG_PEND	000400		/* ugh -- backward-compatibility hack */
#define REG_EXPECT	001000		/* report details on partial/limited matches */
#define REG_BOSONLY	002000		/* temporary kludge for BOS-only matches */
#define REG_DUMP	004000		/* none of your business :-) */
#define REG_FAKE	010000		/* none of your business :-) */
#define REG_PROGRESS	020000	/* none of your business :-) */



/*
 * regex execution flags
 *
 * These are octal bit masks, each occupying a distinct bit.
 */
#define REG_NOTBOL	0001		/* BOS is not BOL */
#define REG_NOTEOL	0002		/* EOS is not EOL */
#define REG_STARTEND	0004	/* backward compatibility kludge */
#define REG_FTRACE	0010		/* none of your business */
#define REG_MTRACE	0020		/* none of your business */
#define REG_SMALL	0040		/* none of your business */


/*
 * error reporting
 * Be careful if modifying the list of error codes -- the table used by
 * regerror() is generated automatically from this file!
 *
 * Note that the list below assigns no code the value 14.
 */
#define REG_OKAY	 0			/* no errors detected */
#define REG_NOMATCH  1			/* failed to match */
#define REG_BADPAT	 2			/* invalid regexp */
#define REG_ECOLLATE  3			/* invalid collating element */
#define REG_ECTYPE	 4			/* invalid character class */
#define REG_EESCAPE  5			/* invalid escape \ sequence */
#define REG_ESUBREG  6			/* invalid backreference number */
#define REG_EBRACK	 7			/* brackets [] not balanced */
#define REG_EPAREN	 8			/* parentheses () not balanced */
#define REG_EBRACE	 9			/* braces {} not balanced */
#define REG_BADBR	10			/* invalid repetition count(s) */
#define REG_ERANGE	11			/* invalid character range */
#define REG_ESPACE	12			/* out of memory */
#define REG_BADRPT	13			/* quantifier operand invalid */
#define REG_ASSERT	15			/* "can't happen" -- you found a bug */
#define REG_INVARG	16			/* invalid argument to regex function */
#define REG_MIXED	17			/* character widths of regex and string differ */
#define REG_BADOPT	18			/* invalid embedded option */
#define REG_ETOOBIG 19			/* regular expression is too complex */
#define REG_ECOLORS 20			/* too many colors */
#define REG_CANCEL	21			/* operation cancelled */
/* two specials for debugging and testing */
#define REG_ATOI	101			/* convert error-code name to number */
#define REG_ITOA	102			/* convert error-code number to name */
/* non-error result codes for pg_regprefix */
#define REG_PREFIX	(-1)		/* identified a common prefix */
#define REG_EXACT	(-2)		/* identified an exact match */



/*
|
|
|
|
* the prototypes for exported functions
|
|
|
|
*/
|
Partial implementation of SQL/JSON path language
SQL 2016 standards among other things contains set of SQL/JSON features for
JSON processing inside of relational database. The core of SQL/JSON is JSON
path language, allowing access parts of JSON documents and make computations
over them. This commit implements partial support JSON path language as
separate datatype called "jsonpath". The implementation is partial because
it's lacking datetime support and suppression of numeric errors. Missing
features will be added later by separate commits.
Support of SQL/JSON features requires implementation of separate nodes, and it
will be considered in subsequent patches. This commit includes following
set of plain functions, allowing to execute jsonpath over jsonb values:
* jsonb_path_exists(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_match(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query_array(jsonb, jsonpath[, jsonb, bool]).
* jsonb_path_query_first(jsonb, jsonpath[, jsonb, bool]).
This commit also implements "jsonb @? jsonpath" and "jsonb @@ jsonpath", which
are wrappers over jsonpath_exists(jsonb, jsonpath) and jsonpath_predicate(jsonb,
jsonpath) correspondingly. These operators will have an index support
(implemented in subsequent patches).
Catversion bumped, to add new functions and operators.
Code was written by Nikita Glukhov and Teodor Sigaev, revised by me.
Documentation was written by Oleg Bartunov and Liudmila Mantrova. The work
was inspired by Oleg Bartunov.
Discussion: https://postgr.es/m/fcc6fc6a-b497-f39a-923d-aa34d0c588e8%402ndQuadrant.com
Author: Nikita Glukhov, Teodor Sigaev, Alexander Korotkov, Oleg Bartunov, Liudmila Mantrova
Reviewed-by: Tomas Vondra, Andrew Dunstan, Pavel Stehule, Alexander Korotkov
6 years ago
|
|
|
|
|
|
|
/* regcomp.c */
|
|
|
|
extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
|
|
|
|
extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
|
|
|
|
extern int pg_regprefix(regex_t *, pg_wchar **, size_t *);
|
|
|
|
extern void pg_regfree(regex_t *);
|
|
|
|
extern size_t pg_regerror(int, const regex_t *, char *, size_t);
|
|
|
|
|
Partial implementation of SQL/JSON path language
SQL 2016 standards among other things contains set of SQL/JSON features for
JSON processing inside of relational database. The core of SQL/JSON is JSON
path language, allowing access parts of JSON documents and make computations
over them. This commit implements partial support JSON path language as
separate datatype called "jsonpath". The implementation is partial because
it's lacking datetime support and suppression of numeric errors. Missing
features will be added later by separate commits.
Support of SQL/JSON features requires implementation of separate nodes, and it
will be considered in subsequent patches. This commit includes following
set of plain functions, allowing to execute jsonpath over jsonb values:
* jsonb_path_exists(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_match(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query_array(jsonb, jsonpath[, jsonb, bool]).
* jsonb_path_query_first(jsonb, jsonpath[, jsonb, bool]).
This commit also implements "jsonb @? jsonpath" and "jsonb @@ jsonpath", which
are wrappers over jsonpath_exists(jsonb, jsonpath) and jsonpath_predicate(jsonb,
jsonpath) correspondingly. These operators will have an index support
(implemented in subsequent patches).
Catversion bumped, to add new functions and operators.
Code was written by Nikita Glukhov and Teodor Sigaev, revised by me.
Documentation was written by Oleg Bartunov and Liudmila Mantrova. The work
was inspired by Oleg Bartunov.
Discussion: https://postgr.es/m/fcc6fc6a-b497-f39a-923d-aa34d0c588e8%402ndQuadrant.com
Author: Nikita Glukhov, Teodor Sigaev, Alexander Korotkov, Oleg Bartunov, Liudmila Mantrova
Reviewed-by: Tomas Vondra, Andrew Dunstan, Pavel Stehule, Alexander Korotkov
6 years ago
|
|
|
/* regexp.c */
|
|
|
|
extern regex_t *RE_compile_and_cache(text *text_re, int cflags, Oid collation);
|
|
|
|
extern bool RE_compile_and_execute(text *text_re, char *dat, int dat_len,
|
|
|
|
int cflags, Oid collation,
|
|
|
|
int nmatch, regmatch_t *pmatch);
|
Partial implementation of SQL/JSON path language
SQL 2016 standards among other things contains set of SQL/JSON features for
JSON processing inside of relational database. The core of SQL/JSON is JSON
path language, allowing access parts of JSON documents and make computations
over them. This commit implements partial support JSON path language as
separate datatype called "jsonpath". The implementation is partial because
it's lacking datetime support and suppression of numeric errors. Missing
features will be added later by separate commits.
Support of SQL/JSON features requires implementation of separate nodes, and it
will be considered in subsequent patches. This commit includes following
set of plain functions, allowing to execute jsonpath over jsonb values:
* jsonb_path_exists(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_match(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query(jsonb, jsonpath[, jsonb, bool]),
* jsonb_path_query_array(jsonb, jsonpath[, jsonb, bool]).
* jsonb_path_query_first(jsonb, jsonpath[, jsonb, bool]).
This commit also implements "jsonb @? jsonpath" and "jsonb @@ jsonpath", which
are wrappers over jsonpath_exists(jsonb, jsonpath) and jsonpath_predicate(jsonb,
jsonpath) correspondingly. These operators will have an index support
(implemented in subsequent patches).
Catversion bumped, to add new functions and operators.
Code was written by Nikita Glukhov and Teodor Sigaev, revised by me.
Documentation was written by Oleg Bartunov and Liudmila Mantrova. The work
was inspired by Oleg Bartunov.
Discussion: https://postgr.es/m/fcc6fc6a-b497-f39a-923d-aa34d0c588e8%402ndQuadrant.com
Author: Nikita Glukhov, Teodor Sigaev, Alexander Korotkov, Oleg Bartunov, Liudmila Mantrova
Reviewed-by: Tomas Vondra, Andrew Dunstan, Pavel Stehule, Alexander Korotkov
6 years ago
|
|
|
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
8 years ago
#endif							/* _REGEX_H_ */