@ -35,6 +35,7 @@
# include "regex/regex.h"
# include "utils/array.h"
# include "utils/builtins.h"
# include "utils/memutils.h"
# include "utils/varlena.h"
# define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
/* workspace for build_regexp_match_result() */
Datum * elems ; /* has npatterns elements */
bool * nulls ; /* has npatterns elements */
pg_wchar * wide_str ; /* wide-char version of original string */
char * conv_buf ; /* conversion buffer */
int conv_bufsiz ; /* size thereof */
} regexp_matches_ctx ;
/*
@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
pg_re_flags * flags ,
Oid collation ,
bool use_subpatterns ,
bool ignore_degenerate ) ;
static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx ) ;
bool ignore_degenerate ,
bool fetching_unmatched ) ;
static ArrayType * build_regexp_match_result ( regexp_matches_ctx * matchctx ) ;
static Datum build_regexp_split_result ( regexp_matches_ctx * splitctx ) ;
@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
errhint ( " Use the regexp_matches function instead. " ) ) ) ;
matchctx = setup_regexp_matches ( orig_str , pattern , & re_flags ,
PG_GET_COLLATION ( ) , true , false ) ;
PG_GET_COLLATION ( ) , true , false , false ) ;
if ( matchctx - > nmatches = = 0 )
PG_RETURN_NULL ( ) ;
@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
matchctx = setup_regexp_matches ( PG_GETARG_TEXT_P_COPY ( 0 ) , pattern ,
& re_flags ,
PG_GET_COLLATION ( ) ,
true , false ) ;
true , false , false ) ;
/* Pre-create workspace that build_regexp_match_result needs */
matchctx - > elems = ( Datum * ) palloc ( sizeof ( Datum ) * matchctx - > npatterns ) ;
@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
SRF_RETURN_NEXT ( funcctx , PointerGetDatum ( result_ary ) ) ;
}
/* release space in multi-call ctx to avoid intraquery memory leak */
cleanup_regexp_matches ( matchctx ) ;
SRF_RETURN_DONE ( funcctx ) ;
}
@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
* all the matching in one swoop . The returned regexp_matches_ctx contains
* the locations of all the substrings matching the pattern .
*
* The two bool parameters have only two patterns ( one for matching , one for
* The three bool parameters have only two patterns ( one for matching , one for
* splitting ) but it seems clearer to distinguish the functionality this way
* than to key it all off one " is_split " flag .
* than to key it all off one " is_split " flag . We don ' t currently assume that
* fetching_unmatched is exclusive of fetching the matched text too ; if it ' s
* set , the conversion buffer is large enough to fetch any single matched or
* unmatched string , but not any larger substring . ( In practice , when splitting
* the matches are usually small anyway , and it didn ' t seem worth complicating
* the code further . )
*/
static regexp_matches_ctx *
setup_regexp_matches ( text * orig_str , text * pattern , pg_re_flags * re_flags ,
Oid collation ,
bool use_subpatterns ,
bool ignore_degenerate )
bool ignore_degenerate ,
bool fetching_unmatched )
{
regexp_matches_ctx * matchctx = palloc0 ( sizeof ( regexp_matches_ctx ) ) ;
int eml = pg_database_encoding_max_length ( ) ;
int orig_len ;
pg_wchar * wide_str ;
int wide_len ;
@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
int array_idx ;
int prev_match_end ;
int start_search ;
int maxlen = 0 ; /* largest fetch length in characters */
/* save original string --- we'll extract result substrings from it */
matchctx - > orig_str = orig_str ;
@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
/* temporary output space for RE package */
pmatch = palloc ( sizeof ( regmatch_t ) * pmatch_len ) ;
/* the real output space (grown dynamically if needed) */
array_len = re_flags - > glob ? 256 : 32 ;
/*
* the real output space ( grown dynamically if needed )
*
* use values 2 ^ n - 1 , not 2 ^ n , so that we hit the limit at 2 ^ 28 - 1 rather
* than at 2 ^ 27
*/
array_len = re_flags - > glob ? 255 : 31 ;
matchctx - > match_locs = ( int * ) palloc ( sizeof ( int ) * array_len ) ;
array_idx = 0 ;
@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
pmatch [ 0 ] . rm_eo > prev_match_end ) )
{
/* enlarge output space if needed */
while ( array_idx + matchctx - > npatterns * 2 > array_len )
while ( array_idx + matchctx - > npatterns * 2 + 1 > array_len )
{
array_len * = 2 ;
array_len + = array_len + 1 ; /* 2^n-1 => 2^(n+1)-1 */
if ( array_len > MaxAllocSize / sizeof ( int ) )
ereport ( ERROR ,
( errcode ( ERRCODE_PROGRAM_LIMIT_EXCEEDED ) ,
errmsg ( " too many regular expression matches " ) ) ) ;
matchctx - > match_locs = ( int * ) repalloc ( matchctx - > match_locs ,
sizeof ( int ) * array_len ) ;
}
@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
for ( i = 1 ; i < = matchctx - > npatterns ; i + + )
{
matchctx - > match_locs [ array_idx + + ] = pmatch [ i ] . rm_so ;
matchctx - > match_locs [ array_idx + + ] = pmatch [ i ] . rm_eo ;
int so = pmatch [ i ] . rm_so ;
int eo = pmatch [ i ] . rm_eo ;
matchctx - > match_locs [ array_idx + + ] = so ;
matchctx - > match_locs [ array_idx + + ] = eo ;
if ( so > = 0 & & eo > = 0 & & ( eo - so ) > maxlen )
maxlen = ( eo - so ) ;
}
}
else
{
matchctx - > match_locs [ array_idx + + ] = pmatch [ 0 ] . rm_so ;
matchctx - > match_locs [ array_idx + + ] = pmatch [ 0 ] . rm_eo ;
int so = pmatch [ 0 ] . rm_so ;
int eo = pmatch [ 0 ] . rm_eo ;
matchctx - > match_locs [ array_idx + + ] = so ;
matchctx - > match_locs [ array_idx + + ] = eo ;
if ( so > = 0 & & eo > = 0 & & ( eo - so ) > maxlen )
maxlen = ( eo - so ) ;
}
matchctx - > nmatches + + ;
/*
* check length of unmatched portion between end of previous match
* and start of current one
*/
if ( fetching_unmatched & &
pmatch [ 0 ] . rm_so > = 0 & &
( pmatch [ 0 ] . rm_so - prev_match_end ) > maxlen )
maxlen = ( pmatch [ 0 ] . rm_so - prev_match_end ) ;
}
prev_match_end = pmatch [ 0 ] . rm_eo ;
@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
break ;
}
/*
* check length of unmatched portion between end of last match and end of
* input string
*/
if ( fetching_unmatched & &
( wide_len - prev_match_end ) > maxlen )
maxlen = ( wide_len - prev_match_end ) ;
/*
* Keep a note of the end position of the string for the benefit of
* splitting code .
*/
matchctx - > match_locs [ array_idx ] = wide_len ;
if ( eml > 1 )
{
int64 maxsiz = eml * ( int64 ) maxlen ;
int conv_bufsiz ;
/*
* Make the conversion buffer large enough for any substring of
* interest .
*
* Worst case : assume we need the maximum size ( maxlen * eml ) , but take
* advantage of the fact that the original string length in bytes is an
* upper bound on the byte length of any fetched substring ( and we know
* that len + 1 is safe to allocate because the varlena header is longer
* than 1 byte ) .
*/
if ( maxsiz > orig_len )
conv_bufsiz = orig_len + 1 ;
else
conv_bufsiz = maxsiz + 1 ; /* safe since maxsiz < 2^30 */
matchctx - > conv_buf = palloc ( conv_bufsiz ) ;
matchctx - > conv_bufsiz = conv_bufsiz ;
matchctx - > wide_str = wide_str ;
}
else
{
/* No need to keep the wide string if we're in a single-byte charset. */
pfree ( wide_str ) ;
matchctx - > wide_str = NULL ;
matchctx - > conv_buf = NULL ;
matchctx - > conv_bufsiz = 0 ;
}
/* Clean up temp storage */
pfree ( wide_str ) ;
pfree ( pmatch ) ;
return matchctx ;
}
/*
 * cleanup_regexp_matches - release memory of a regexp_matches_ctx
 *
 * Frees the context struct itself along with every allocation hanging off
 * it.  Note that orig_str is freed unconditionally, so this must only be
 * called on a context whose orig_str is a copy owned by the caller (not a
 * pointer to the function's input datum).
 *
 * The optional members are freed only when set: elems/nulls are workspace
 * that only some callers pre-create, and wide_str/conv_buf are kept in the
 * context only when a conversion buffer was set up (they are NULL otherwise).
 */
static void
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
{
	pfree(matchctx->orig_str);
	pfree(matchctx->match_locs);

	/* workspace for build_regexp_match_result(), if it was allocated */
	if (matchctx->elems)
		pfree(matchctx->elems);
	if (matchctx->nulls)
		pfree(matchctx->nulls);

	/* wide-char copy of the input and its conversion buffer, if retained */
	if (matchctx->wide_str)
		pfree(matchctx->wide_str);
	if (matchctx->conv_buf)
		pfree(matchctx->conv_buf);

	pfree(matchctx);
}
/*
* build_regexp_match_result - build output array for current match
*/
static ArrayType *
build_regexp_match_result ( regexp_matches_ctx * matchctx )
{
char * buf = matchctx - > conv_buf ;
int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx - > conv_bufsiz ;
Datum * elems = matchctx - > elems ;
bool * nulls = matchctx - > nulls ;
int dims [ 1 ] ;
@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
elems [ i ] = ( Datum ) 0 ;
nulls [ i ] = true ;
}
else if ( buf )
{
int len = pg_wchar2mb_with_len ( matchctx - > wide_str + so ,
buf ,
eo - so ) ;
Assert ( len < bufsiz ) ;
elems [ i ] = PointerGetDatum ( cstring_to_text_with_len ( buf , len ) ) ;
nulls [ i ] = false ;
}
else
{
elems [ i ] = DirectFunctionCall3 ( text_substr ,
@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
splitctx = setup_regexp_matches ( PG_GETARG_TEXT_P_COPY ( 0 ) , pattern ,
& re_flags ,
PG_GET_COLLATION ( ) ,
false , true ) ;
false , true , true ) ;
MemoryContextSwitchTo ( oldcontext ) ;
funcctx - > user_fctx = ( void * ) splitctx ;
@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
SRF_RETURN_NEXT ( funcctx , result ) ;
}
/* release space in multi-call ctx to avoid intraquery memory leak */
cleanup_regexp_matches ( splitctx ) ;
SRF_RETURN_DONE ( funcctx ) ;
}
@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
PG_GETARG_TEXT_PP ( 1 ) ,
& re_flags ,
PG_GET_COLLATION ( ) ,
false , true ) ;
false , true , true ) ;
while ( splitctx - > next_match < = splitctx - > nmatches )
{
@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
splitctx - > next_match + + ;
}
/*
* We don ' t call cleanup_regexp_matches here ; it would try to pfree the
* input string , which we didn ' t copy . The space is not in a long - lived
* memory context anyway .
*/
PG_RETURN_ARRAYTYPE_P ( makeArrayResult ( astate , CurrentMemoryContext ) ) ;
}
@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
static Datum
build_regexp_split_result ( regexp_matches_ctx * splitctx )
{
char * buf = splitctx - > conv_buf ;
int startpos ;
int endpos ;
@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
if ( startpos < 0 )
elog ( ERROR , " invalid match ending position " ) ;
if ( splitctx - > next_match < splitctx - > nmatches )
if ( buf )
{
int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx - > conv_bufsiz ;
int len ;
endpos = splitctx - > match_locs [ splitctx - > next_match * 2 ] ;
if ( endpos < startpos )
elog ( ERROR , " invalid match starting position " ) ;
return DirectFunctionCall3 ( text_substr ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos + 1 ) ,
Int32GetDatum ( endpos - startpos ) ) ;
len = pg_wchar2mb_with_len ( splitctx - > wide_str + startpos ,
buf ,
endpos - startpos ) ;
Assert ( len < bufsiz ) ;
return PointerGetDatum ( cstring_to_text_with_len ( buf , len ) ) ;
}
else
{
/* no more matches, return rest of string */
return DirectFunctionCall2 ( text_substr_no_len ,
endpos = splitctx - > match_locs [ splitctx - > next_match * 2 ] ;
if ( endpos < startpos )
elog ( ERROR , " invalid match starting position " ) ;
return DirectFunctionCall3 ( text_substr ,
PointerGetDatum ( splitctx - > orig_str ) ,
Int32GetDatum ( startpos + 1 ) ) ;
Int32GetDatum ( startpos + 1 ) ,
Int32GetDatum ( endpos - startpos ) ) ;
}
}