@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $ PostgreSQL : pgsql / src / backend / utils / adt / regexp . c , v 1.71 2007 / 03 / 28 22 : 59 : 37 neilc Exp $
* $ PostgreSQL : pgsql / src / backend / utils / adt / regexp . c , v 1.72 2007 / 08 / 11 03 : 56 : 24 tgl Exp $
*
* Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there ' s a good chance
@ -29,19 +29,42 @@
*/
# include "postgres.h"
# include "access/heapam.h"
# include "catalog/pg_type.h"
# include "funcapi.h"
# include "regex/regex.h"
# include "utils/builtins.h"
# include "utils/guc.h"
# include "utils/lsyscache.h"
/*
 * Yield argument number _n via PG_GETARG_TEXT_P when the call was made with
 * more than _n arguments; otherwise yield NULL.
 */
# define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
( PG_NARGS ( ) > ( _n ) ? PG_GETARG_TEXT_P ( _n ) : NULL )
/* GUC-settable flavor parameter */
static int regex_flavor = REG_ADVANCED ;
/*
 * The full set of options a regex function cares about, in parsed form.
 */
typedef struct pg_re_flags
{
	int			cflags;			/* compile-time flag bits for Spencer's regex code */
	bool		glob;			/* true: act on every occurrence (global) */
} pg_re_flags;
/*
 * State carried from one call of regexp_matches() to the next; regexp_split()
 * uses the same structure.
 */
typedef struct regexp_matches_ctx
{
	text	   *orig_str;		/* the data string, still in its TEXT form */
	int			nmatches;		/* how many places the pattern matched */
	int			npatterns;		/* how many capturing subpatterns there are */

	/*
	 * For each match we keep a start character index and an end+1 character
	 * index, so match_locs holds nmatches * npatterns * 2 entries.
	 */
	int		   *match_locs;		/* character indexes, counted from 0 */
	int			next_match;		/* index (from 0) of the next match to process */

	/* scratch space for build_regexp_matches_result() */
	Datum	   *elems;			/* npatterns elements */
	bool	   *nulls;			/* npatterns elements */
} regexp_matches_ctx;
/*
* We cache precompiled regular expressions using a " self organizing list "
* structure , in which recently - used items tend to be near the front .
@ -79,48 +102,18 @@ typedef struct cached_re_str
regex_t cre_re ; /* the compiled regular expression */
} cached_re_str ;
/*
 * Regex options in parsed form: a compile-flag word plus a "global" switch.
 *
 * NOTE(review): pg_re_flags, declared earlier in this view, carries the same
 * two members -- confirm which of the two types is the one actually in use.
 */
typedef struct re_comp_flags
{
int cflags ; /* flag word handed to RE_compile_and_cache() */
bool glob ; /* tested by setup_regexp_split() to reject the global option */
} re_comp_flags ;
/*
 * Per-query matching state that keeps the compiled pattern, the wide-character
 * copy of the input, and a running search offset.
 *
 * NOTE(review): a typedef with this same name but different members also
 * appears earlier in this view; the two cannot coexist in one translation
 * unit -- confirm which definition is current.
 */
typedef struct regexp_matches_ctx
{
text * orig_str ; /* input string in TEXT form */
size_t orig_len ; /* VARSIZE(orig_str) - VARHDRSZ */
pg_wchar * wide_str ; /* input converted by pg_mb2wchar_with_len() */
size_t wide_len ; /* length returned by pg_mb2wchar_with_len() */
regex_t * cpattern ; /* result of RE_compile_and_cache() */
regmatch_t * pmatch ; /* re_nsub + 1 entries, filled by RE_wchar_execute() */
size_t offset ; /* search start position passed to RE_wchar_execute() */
re_comp_flags flags ; /* options filled in by parse_re_comp_flags() */
} regexp_matches_ctx ;
/*
 * Working state for the regexp split functions, as filled in by
 * setup_regexp_split().
 *
 * NOTE(review): other code in this view declares the split functions' state
 * as regexp_matches_ctx instead -- confirm whether this type is still used.
 */
typedef struct regexp_split_ctx
{
text * orig_str ; /* input string in TEXT form */
size_t orig_len ; /* VARSIZE(orig_str) - VARHDRSZ */
pg_wchar * wide_str ; /* input converted by pg_mb2wchar_with_len() */
size_t wide_len ; /* length returned by pg_mb2wchar_with_len() */
regex_t * cpattern ; /* result of RE_compile_and_cache() */
regmatch_t match ; /* most recent match; starts out as rm_so = -1, rm_eo = -2 */
size_t offset ; /* search start position passed to RE_wchar_execute() */
re_comp_flags flags ; /* options filled in by parse_re_comp_flags() */
} regexp_split_ctx ;
static int num_res = 0 ; /* # of cached re's */
static cached_re_str re_array [ MAX_CACHED_RES ] ; /* cached re's */
/*
 * NOTE(review): setup_regexp_matches is declared twice below with different
 * parameter lists (three parameters here, six under "Local functions").
 * Only one of them can be in effect -- confirm which before relying on either.
 */
static regexp_matches_ctx * setup_regexp_matches ( text * orig_str , text * pattern ,
text * flags ) ;
static ArrayType * perform_regexp_matches ( regexp_matches_ctx * matchctx ) ;
static regexp_split_ctx * setup_regexp_split ( text * str , text * pattern ,
text * flags ) ;
static Datum get_next_split ( regexp_split_ctx * splitctx ) ;
/* Local functions */
static regexp_matches_ctx * setup_regexp_matches ( text * orig_str , text * pattern ,
text * flags ,
bool force_glob ,
bool use_subpatterns ,
bool ignore_degenerate ) ;
static ArrayType * build_regexp_matches_result ( regexp_matches_ctx * matchctx ) ;
static Datum build_regexp_split_result ( regexp_matches_ctx * splitctx ) ;
/*
@ -139,7 +132,7 @@ RE_compile_and_cache(text *text_re, int cflags)
{
int text_re_len = VARSIZE ( text_re ) ;
pg_wchar * pattern ;
size_t pattern_len ;
int pattern_len ;
int i ;
int regcomp_result ;
cached_re_str re_temp ;
@ -235,7 +228,7 @@ RE_compile_and_cache(text *text_re, int cflags)
}
/*
* RE_wchar_execute - execute a RE
* RE_wchar_execute - execute a RE on pg_wchar data
*
* Returns TRUE on match , FALSE on no match
*
@ -250,7 +243,7 @@ RE_compile_and_cache(text *text_re, int cflags)
*/
static bool
RE_wchar_execute ( regex_t * re , pg_wchar * data , int data_len ,
size_ t start_search , int nmatch , regmatch_t * pmatch )
in t start_search , int nmatch , regmatch_t * pmatch )
{
int regexec_result ;
char errMsg [ 100 ] ;
@ -295,7 +288,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
int nmatch , regmatch_t * pmatch )
{
pg_wchar * data ;
size_t data_len ;
int data_len ;
bool match ;
/* Convert data string to wide characters */
@ -304,6 +297,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
/* Perform RE match and return result */
match = RE_wchar_execute ( re , data , data_len , 0 , nmatch , pmatch ) ;
pfree ( data ) ;
return match ;
}
@ -334,17 +328,28 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
return RE_execute ( re , dat , dat_len , nmatch , pmatch ) ;
}
/*
 * parse_re_flags - parse the options argument of regexp_matches and friends
 *
 * flags --- output argument, filled with desired options
 * opts --- *untoasted* TEXT object, or NULL for defaults
 *
 * This accepts all the options allowed by any of the callers; callers that
 * don't want some have to reject them after the fact.
 */
static void
parse_re_comp_flags ( re_comp_flags * flags , text * opts )
parse_re_flags ( pg_ re_flags * flags , text * opts )
{
MemSet ( flags , 0 , sizeof ( re_comp_flags ) ) ;
/* regex_flavor is always folded into the compile flags */
flags - > cflags = regex_flavor ;
flags - > glob = false ;
if ( opts )
{
char * opt_p = VARDATA ( opts ) ;
size_t opt_len = VARSIZE ( opts ) - VARHDRSZ ;
int i ;
char * opt_p = VARDATA ( opts ) ;
int opt_len = VARSIZE ( opts ) - VARHDRSZ ;
int i ;
for ( i = 0 ; i < opt_len ; i + + )
{
@ -353,28 +358,49 @@ parse_re_comp_flags(re_comp_flags *flags, text *opts)
case ' g ' :
flags - > glob = true ;
break ;
case ' i ' :
case ' b ' : /* BREs (but why???) */
flags - > cflags & = ~ ( REG_ADVANCED | REG_EXTENDED | REG_QUOTE ) ;
break ;
case ' c ' : /* case sensitive */
flags - > cflags & = ~ REG_ICASE ;
break ;
case ' e ' : /* plain EREs */
flags - > cflags | = REG_EXTENDED ;
flags - > cflags & = ~ ( REG_ADVANCED | REG_QUOTE ) ;
break ;
case ' i ' : /* case insensitive */
flags - > cflags | = REG_ICASE ;
break ;
case ' m ' :
case ' n ' :
case ' m ' : /* Perloid synonym for n */
case ' n ' : /* \n affects ^ $ . [^ */
flags - > cflags | = REG_NEWLINE ;
break ;
case ' p ' :
case ' p ' : /* ~Perl, \n affects . [^ */
flags - > cflags | = REG_NLSTOP ;
flags - > cflags & = ~ REG_NLANCH ;
break ;
case ' w ' :
case ' q ' : /* literal string */
flags - > cflags | = REG_QUOTE ;
flags - > cflags & = ~ ( REG_ADVANCED | REG_EXTENDED ) ;
break ;
case ' s ' : /* single line, \n ordinary */
flags - > cflags & = ~ REG_NEWLINE ;
break ;
case ' t ' : /* tight syntax */
flags - > cflags & = ~ REG_EXPANDED ;
break ;
case ' w ' : /* weird, \n affects ^ $ only */
flags - > cflags & = ~ REG_NLSTOP ;
flags - > cflags | = REG_NLANCH ;
break ;
case ' x ' :
case ' x ' : /* expanded syntax */
flags - > cflags | = REG_EXPANDED ;
break ;
default :
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " invalid regexp option: %c " , opt_p [ i ] ) ) ) ;
errmsg ( " invalid regexp option: \" %c \" " ,
opt_p [ i ] ) ) ) ;
break ;
}
}
@ -409,6 +435,16 @@ assign_regex_flavor(const char *value, bool doit, GucSource source)
}
/*
 * regex_flavor_is_basic
 *		Report whether regex_flavor is currently BASIC.
 *
 * Returns true exactly when the file-level regex_flavor setting equals
 * REG_BASIC.
 */
bool
regex_flavor_is_basic(void)
{
	return (regex_flavor == REG_BASIC);
}
/*
* interface routines called by the function manager
*/
@ -605,16 +641,17 @@ textregexreplace(PG_FUNCTION_ARGS)
text * r = PG_GETARG_TEXT_P ( 2 ) ;
text * opt = PG_GETARG_TEXT_P ( 3 ) ;
regex_t * re ;
re_comp _flags flags ;
pg_ re_flags flags ;
parse_re_comp_ flags ( & flags , opt ) ;
parse_re_flags ( & flags , opt ) ;
re = RE_compile_and_cache ( p , flags . cflags ) ;
PG_RETURN_TEXT_P ( replace_text_regexp ( s , ( void * ) re , r , flags . glob ) ) ;
}
/* similar_escape()
/*
* similar_escape ( )
* Convert a SQL99 regexp pattern to POSIX style , so it can be used by
* our regexp engine .
*/
@ -735,185 +772,255 @@ similar_escape(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P ( result ) ;
}
/*
 * Yield argument number _n via PG_GETARG_TEXT_P when the call was made with
 * more than _n arguments; otherwise yield NULL.  The argument is parenthesized
 * in the comparison so that an expression argument cannot be re-associated by
 * operator precedence, and the replacement list is kept token-for-token the
 * same as the definition near the top of the file.
 */
# define PG_GETARG_TEXT_P_IF_EXISTS(_n) \
( PG_NARGS ( ) > ( _n ) ? PG_GETARG_TEXT_P ( _n ) : NULL )
/*
* regexp_matches ( )
* Return a table of matches of a pattern within a string .
*/
Datum
regexp_matches ( PG_FUNCTION_ARGS )
{
FuncCallContext * funcctx ;
MemoryContext oldcontext ;
regexp_matches_ctx * matchctx ;
if ( SRF_IS_FIRSTCALL ( ) )
{
text * pattern = PG_GETARG_TEXT_P ( 1 ) ;
text * flags = PG_GETARG_TEXT_P_IF_EXISTS ( 2 ) ;
MemoryContext oldcontext ;
funcctx = SRF_FIRSTCALL_INIT ( ) ;
oldcontext = MemoryContextSwitchTo ( funcctx - > multi_call_memory_ctx ) ;
/* be sure to copy the input string into the multi-call ctx */
matchctx = setup_regexp_matches ( PG_GETARG_TEXT_P_COPY ( 0 ) , pattern ,
flags ) ;
flags , false , true , false ) ;
/* Pre-create workspace that build_regexp_matches_result needs */
matchctx - > elems = ( Datum * ) palloc ( sizeof ( Datum ) * matchctx - > npatterns ) ;
matchctx - > nulls = ( bool * ) palloc ( sizeof ( bool ) * matchctx - > npatterns ) ;
MemoryContextSwitchTo ( oldcontext ) ;
funcctx - > user_fctx = ( void * ) matchctx ;
/*
* Avoid run - away function by making sure we never iterate
* more than the length of the text + 1 ( the number of matches
* an empty pattern will make is length + 1 )
*/
if ( matchctx - > flags . glob )
funcctx - > max_calls = matchctx - > wide_len + 1 ;
else
funcctx - > max_calls = 0 ;
}
funcctx = SRF_PERCALL_SETUP ( ) ;
matchctx = ( regexp_matches_ctx * ) funcctx - > user_fctx ;
if ( funcctx - > call_cntr > funcctx - > max_calls )
{
/*
* If max_calls = = 0 , then we are doing a non - global match , we
* should stop now , no problem . Otherwise , if we exceed
* max_calls something really wonky is going on , since it is
* returning more matches than there are characters in the
* string , which should not happen
*/
if ( funcctx - > max_calls ! = 0 )
elog ( ERROR , " set returning match function terminated after iterating %d times " ,
funcctx - > call_cntr ) ;
SRF_RETURN_DONE ( funcctx ) ;
}
if ( matchctx - > offset < matchctx - > wide_len )
if ( matchctx - > next_match < matchctx - > nmatches )
{
ArrayType * result_ary ;
if ( matchctx - > pmatch [ 0 ] . rm_so = = matchctx - > pmatch [ 0 ] . rm_eo )
matchctx - > offset + + ;
result_ary = perform_regexp_matches ( matchctx ) ;
if ( result_ary ! = NULL )
{
matchctx - > offset = matchctx - > pmatch [ 0 ] . rm_eo ;
SRF_RETURN_NEXT ( funcctx , PointerGetDatum ( result_ary ) ) ;
}
/* else fall through and return done */
result_ary = build_regexp_matches_result ( matchctx ) ;
matchctx - > next_match + + ;
SRF_RETURN_NEXT ( funcctx , PointerGetDatum ( result_ary ) ) ;
}
SRF_RETURN_DONE ( funcctx ) ;
}
/*
 * Forwards the call unchanged to regexp_matches().
 * This is separate to keep the opr_sanity regression test from complaining.
 * NOTE(review): presumably the entry point for the call form that omits the
 * flags argument -- confirm against the catalog entry.
 */
Datum
regexp_matches_no_flags ( PG_FUNCTION_ARGS )
{
return regexp_matches ( fcinfo ) ;
}
/*
 * setup_regexp_matches --- do the initial matching for regexp_matches()
 *		or regexp_split()
 *
 * To avoid having to re-find the compiled pattern on each call, we do
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
 * the locations of all the substrings matching the pattern.
 *
 * orig_str:		data string; the pointer is stored in the result, not copied
 * pattern:			regular expression text
 * flags:			option string, or NULL for defaults
 * force_glob:		reject a user-supplied 'g' option with an error, then find
 *					all matches anyway
 * use_subpatterns:	record the location of each capturing subpattern (when the
 *					pattern has any) instead of the whole match
 * ignore_degenerate: skip zero-length matches at the start or end of the
 *					string or just after a previous match
 *
 * The three bool parameters have only two patterns (one for each caller)
 * but it seems clearer to distinguish the functionality this way than to
 * key it all off one "is_split" flag.
 *
 * The result is zero-initialized apart from orig_str, nmatches, npatterns
 * and match_locs; in particular next_match starts at 0 and elems/nulls are
 * left NULL for the caller to fill in if it needs them.
 */
static regexp_matches_ctx *
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
					 bool force_glob, bool use_subpatterns,
					 bool ignore_degenerate)
{
	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
	int			orig_len;
	pg_wchar   *wide_str;
	int			wide_len;
	pg_re_flags re_flags;
	regex_t    *cpattern;
	regmatch_t *pmatch;
	int			pmatch_len;
	int			array_len;
	int			array_idx;
	int			prev_match_end;
	int			start_search;

	/* save original string --- we'll extract result substrings from it */
	matchctx->orig_str = orig_str;

	/* convert string to pg_wchar form for matching */
	orig_len = VARSIZE(orig_str) - VARHDRSZ;
	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
	wide_len = pg_mb2wchar_with_len(VARDATA(orig_str), wide_str, orig_len);

	/* determine options */
	parse_re_flags(&re_flags, flags);
	if (force_glob)
	{
		/* user mustn't specify 'g' for regexp_split */
		if (re_flags.glob)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("regexp_split does not support the global option")));
		/* but we find all the matches anyway */
		re_flags.glob = true;
	}

	/* set up the compiled pattern */
	cpattern = RE_compile_and_cache(pattern, re_flags.cflags);

	/* do we want to remember subpatterns? */
	if (use_subpatterns && cpattern->re_nsub > 0)
	{
		matchctx->npatterns = cpattern->re_nsub;
		pmatch_len = cpattern->re_nsub + 1;
	}
	else
	{
		use_subpatterns = false;
		matchctx->npatterns = 1;
		pmatch_len = 1;
	}

	/* temporary output space for RE package */
	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

	/* the real output space (grown dynamically if needed) */
	array_len = re_flags.glob ? 256 : 32;
	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
	array_idx = 0;

	/* search for the pattern, perhaps repeatedly */
	prev_match_end = 0;
	start_search = 0;
	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
							pmatch_len, pmatch))
	{
		/*
		 * If requested, ignore degenerate matches, which are zero-length
		 * matches occurring at the start or end of a string or just after
		 * a previous match.
		 */
		if (!ignore_degenerate ||
			(pmatch[0].rm_so < wide_len &&
			 pmatch[0].rm_eo > prev_match_end))
		{
			/* enlarge output space if needed */
			while (array_idx + matchctx->npatterns * 2 > array_len)
			{
				array_len *= 2;
				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
														sizeof(int) * array_len);
			}

			/* save this match's locations */
			if (use_subpatterns)
			{
				int			i;

				/* entry 0 is the whole match; subpatterns start at 1 */
				for (i = 1; i <= matchctx->npatterns; i++)
				{
					matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
					matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
				}
			}
			else
			{
				matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
				matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
			}
			matchctx->nmatches++;
		}
		prev_match_end = pmatch[0].rm_eo;

		/* if not glob, stop after one match */
		if (!re_flags.glob)
			break;

		/*
		 * Advance search position.  Normally we start just after the end
		 * of the previous match, but always advance at least one character
		 * (the special case can occur if the pattern matches zero characters
		 * just after the prior match or at the end of the string).
		 */
		if (start_search < pmatch[0].rm_eo)
			start_search = pmatch[0].rm_eo;
		else
			start_search++;
		if (start_search > wide_len)
			break;
	}

	/* Clean up temp storage */
	pfree(wide_str);
	pfree(pmatch);

	return matchctx;
}
/*
 * build_regexp_matches_result - build output array for current match
 *
 * Reads the npatterns (start, end+1) location pairs belonging to match number
 * next_match out of match_locs and turns each into a substring of orig_str.
 * A pair with a negative start or end becomes a NULL array element.  The
 * caller must have allocated matchctx->elems and matchctx->nulls with
 * npatterns entries each; this function does not advance next_match.
 *
 * Returns a one-dimensional text array with lower bound 1.
 */
static ArrayType *
build_regexp_matches_result(regexp_matches_ctx *matchctx)
{
	Datum	   *elems = matchctx->elems;
	bool	   *nulls = matchctx->nulls;
	int			dims[1];
	int			lbs[1];
	int			loc;
	int			i;

	/* Extract matching substrings from the original string */
	loc = matchctx->next_match * matchctx->npatterns * 2;
	for (i = 0; i < matchctx->npatterns; i++)
	{
		int			so = matchctx->match_locs[loc++];
		int			eo = matchctx->match_locs[loc++];

		if (so < 0 || eo < 0)
		{
			elems[i] = (Datum) 0;
			nulls[i] = true;
		}
		else
		{
			/* text_substr takes a 1-based start, hence so + 1 */
			elems[i] = DirectFunctionCall3(text_substr,
										   PointerGetDatum(matchctx->orig_str),
										   Int32GetDatum(so + 1),
										   Int32GetDatum(eo - so));
			nulls[i] = false;
		}
	}

	/* And form an array */
	dims[0] = matchctx->npatterns;
	lbs[0] = 1;
	/* XXX: this hardcodes assumptions about the text type */
	return construct_md_array(elems, nulls, 1, dims, lbs,
							  TEXTOID, -1, false, 'i');
}
/*
* regexp_split_to_table ( )
* Split the string at matches of the pattern , returning the
* split - out substrings as a table .
*/
Datum
regexp_split_to_table ( PG_FUNCTION_ARGS )
{
FuncCallContext * funcctx ;
regexp_split_ctx * splitctx ;
regexp_matches _ctx * splitctx ;
if ( SRF_IS_FIRSTCALL ( ) )
{
@ -924,168 +1031,102 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
funcctx = SRF_FIRSTCALL_INIT ( ) ;
oldcontext = MemoryContextSwitchTo ( funcctx - > multi_call_memory_ctx ) ;
splitctx = setup_regexp_split ( PG_GETARG_TEXT_P_COPY ( 0 ) , pattern , flags ) ;
/* be sure to copy the input string into the multi-call ctx */
splitctx = setup_regexp_matches ( PG_GETARG_TEXT_P_COPY ( 0 ) , pattern ,
flags , true , false , true ) ;
MemoryContextSwitchTo ( oldcontext ) ;
funcctx - > user_fctx = ( void * ) splitctx ;
/*
* Avoid run - away function by making sure we never iterate
* more than the length of the text
*/
funcctx - > max_calls = splitctx - > wide_len ;
}
funcctx = SRF_PERCALL_SETUP ( ) ;
splitctx = ( regexp_split _ctx * ) funcctx - > user_fctx ;
splitctx = ( regexp_matches_ctx * ) funcctx - > user_fctx ;
if ( funcctx - > call_cntr > funcctx - > max_call s)
if ( splitctx - > next_match < = splitctx - > nmatches )
{
/*
* If we exceed wide_len something really wonky is going on ,
* since it is returning more matches than there are
* characters in the string , which should not happen
*/
elog ( ERROR , " set returning split function terminated after iterating %d times " ,
funcctx - > call_cntr ) ;
Datum result = build_regexp_split_result ( splitctx ) ;
splitctx - > next_match + + ;
SRF_RETURN_NEXT ( funcctx , result ) ;
}
if ( splitctx - > offset < splitctx - > wide_len )
SRF_RETURN_NEXT ( funcctx , get_next_split ( splitctx ) ) ;
else
SRF_RETURN_DONE ( funcctx ) ;
SRF_RETURN_DONE ( funcctx ) ;
}
/*
 * Forwards the call unchanged to regexp_split_to_table().
 * This is separate to keep the opr_sanity regression test from complaining.
 * NOTE(review): presumably the entry point for the call form that omits the
 * flags argument -- confirm against the catalog entry.
 */
Datum regexp_split_to_table_no_flags ( PG_FUNCTION_ARGS )
{
return regexp_split_to_table ( fcinfo ) ;
}
/*
* regexp_split_to_array ( )
* Split the string at matches of the pattern , returning the
* split - out substrings as an array .
*/
Datum regexp_split_to_array ( PG_FUNCTION_ARGS )
{
ArrayBuildState * astate = NULL ;
regexp_split_ctx * splitctx ;
int nitems ;
regexp_matches_ctx * splitctx ;
splitctx = setup_regexp_split ( PG_GETARG_TEXT_P ( 0 ) ,
PG_GETARG_TEXT_P ( 1 ) ,
PG_GETARG_TEXT_P_IF_EXISTS ( 2 ) ) ;
splitctx = setup_regexp_matches ( PG_GETARG_TEXT_P ( 0 ) ,
PG_GETARG_TEXT_P ( 1 ) ,
PG_GETARG_TEXT_P_IF_EXISTS ( 2 ) ,
true , false , true ) ;
for ( nitems = 0 ; splitctx - > offset < splitctx - > wide_len ; nitems + + )
while ( splitctx - > next_match < = splitctx - > nmatches )
{
if ( nitems > splitctx - > wide_len )
elog ( ERROR , " split function terminated after iterating %d times " ,
nitems ) ;
astate = accumArrayResult ( astate ,
get_next_spli t( splitctx ) ,
build_regexp_split_result ( splitctx ) ,
false ,
TEXTOID ,
CurrentMemoryContext ) ;
splitctx - > next_match + + ;
}
PG_RETURN_ARRAYTYPE_P ( makeArrayResult ( astate , CurrentMemoryContext ) ) ;
}
/*
 * Forwards the call unchanged to regexp_split_to_array().
 * This is separate to keep the opr_sanity regression test from complaining.
 * NOTE(review): presumably the entry point for the call form that omits the
 * flags argument -- confirm against the catalog entry.
 */
Datum regexp_split_to_array_no_flags ( PG_FUNCTION_ARGS )
{
return regexp_split_to_array ( fcinfo ) ;
}
/*
 * setup_regexp_split --- build the working state for a regexp split
 *
 * str:		data string; the pointer is stored in the result, not copied
 * pattern:	regular expression text
 * flags:	option string, or NULL for defaults
 *
 * Parses the options (raising an error if the global option was given),
 * compiles the pattern, converts the string to pg_wchar form, and starts the
 * search offset at zero.
 *
 * NOTE(review): other code in this view drives the split functions through
 * setup_regexp_matches() instead -- confirm whether this routine still has
 * callers.
 */
static regexp_split_ctx *
setup_regexp_split(text *str, text *pattern, text *flags)
{
	regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx));

	splitctx->orig_str = str;
	splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ;

	parse_re_comp_flags(&splitctx->flags, flags);
	if (splitctx->flags.glob)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("regexp_split does not support the global option")));

	splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags);

	splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1));
	splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str),
											  splitctx->wide_str,
											  splitctx->orig_len);

	splitctx->offset = 0;

	splitctx->match.rm_so = -1;
	/* both < 0 but not equal */
	splitctx->match.rm_eo = -2;

	return splitctx;
}
/*
* build_regexp_split_result - build output string for current match
*
* We return the string between the current match and the previous one ,
* or the string after the last match when next_match = = nmatches .
*/
static Datum
get_next_split ( regexp_split _ctx * splitctx )
build_regexp_split_result ( regexp_matches_ctx * splitctx )
{
regmatch_t * pmatch = & ( splitctx - > match ) ;
for ( ; ; )
{
Datum result ;
int startpos = splitctx - > offset + 1 ;
/*
* If the last match was zero - length , we need to push the
* offset forward to avoid matching the same place forever
*/
if ( pmatch - > rm_so = = pmatch - > rm_eo )
splitctx - > offset + + ;
if ( RE_wchar_execute ( splitctx - > cpattern ,
splitctx - > wide_str ,
splitctx - > wide_len ,
splitctx - > offset ,
1 ,
pmatch ) )
{
int length = splitctx - > match . rm_so - startpos + 1 ;
/*
* If we are trying to match at the beginning of the string and
* we got a zero - length match , or if we just matched where we
* left off last time , go around the loop again and increment
* the offset . If we have incremented the offset already and
* it matched at the new offset , that ' s ok
*/
if ( length = = 0 )
continue ;
int startpos ;
int endpos ;
result = DirectFunctionCall3 ( text_substr ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos ) ,
Int32GetDatum ( length ) ) ;
/* set the offset to the end of this match for next time */
splitctx - > offset = pmatch - > rm_eo ;
return result ;
}
if ( splitctx - > next_match > 0 )
startpos = splitctx - > match_locs [ splitctx - > next_match * 2 - 1 ] ;
else
startpos = 0 ;
if ( startpos < 0 )
elog ( ERROR , " invalid match ending position " ) ;
if ( splitctx - > next_match < splitctx - > nmatches )
{
endpos = splitctx - > match_locs [ splitctx - > next_match * 2 ] ;
if ( endpos < startpos )
elog ( ERROR , " invalid match starting position " ) ;
return DirectFunctionCall3 ( text_substr ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos + 1 ) ,
Int32GetDatum ( endpos - startpos ) ) ;
}
else
{
/* no more matches, return rest of string */
result = DirectFunctionCall2 ( text_substr_no_len ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos ) ) ;
/* so we know we're done next time through */
splitctx - > offset = splitctx - > wide_len ;
return result ;
return DirectFunctionCall2 ( text_substr_no_len ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos + 1 ) ) ;
}
}
/*
 * regex_flavor_is_basic
 *		Report whether regex_flavor is currently BASIC.
 *
 * Returns true exactly when the file-level regex_flavor setting equals
 * REG_BASIC.
 *
 * NOTE(review): a function with this same name and body also appears earlier
 * in this view; a translation unit can hold only one definition -- confirm
 * which location is the current one.
 */
bool
regex_flavor_is_basic(void)
{
	return (regex_flavor == REG_BASIC);
}