@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
* $ PostgreSQL : pgsql / src / backend / utils / adt / selfuncs . c , v 1.191 .2 .2 2006 / 05 / 02 04 : 34 : 24 tgl Exp $
* $ PostgreSQL : pgsql / src / backend / utils / adt / selfuncs . c , v 1.191 .2 .3 2007 / 01 / 03 22 : 39 : 42 tgl Exp $
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
@ -3367,7 +3367,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
* These routines support analysis of LIKE and regular - expression patterns
* by the planner / optimizer . It ' s important that they agree with the
* regular - expression code in backend / regex / and the LIKE code in
* backend / utils / adt / like . c .
* backend / utils / adt / like . c . Also , the computation of the fixed prefix
* must be conservative : if we report a string longer than the true fixed
* prefix , the query may produce actually wrong answers , rather than just
* getting a bad selectivity estimate !
*
* Note that the prefix - analysis functions are called from
* backend / optimizer / path / indxpath . c as well as from routines in this file .
@ -3399,6 +3402,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
Oid typeid = patt_const - > consttype ;
int pos ,
match_pos ;
bool is_multibyte = ( pg_database_encoding_max_length ( ) > 1 ) ;
/* the right-hand const is type text or bytea */
Assert ( typeid = = BYTEAOID | | typeid = = TEXTOID ) ;
@ -3448,11 +3452,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
}
/*
* XXX I suspect isalpha ( ) is not an adequately locale - sensitive test
* for characters that can vary under case folding ?
* XXX In multibyte character sets , we can ' t trust isalpha , so assume
* any multibyte char is potentially case - varying .
*/
if ( case_insensitive & & isalpha ( ( unsigned char ) patt [ pos ] ) )
break ;
if ( case_insensitive )
{
if ( is_multibyte & & ( unsigned char ) patt [ pos ] > = 0x80 )
break ;
if ( isalpha ( ( unsigned char ) patt [ pos ] ) )
break ;
}
/*
* NOTE : this code used to think that % % meant a literal % , but
@ -3498,11 +3507,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
int pos ,
match_pos ,
prev_pos ,
prev_match_pos ,
paren_depth ;
prev_match_pos ;
bool have_leading_paren ;
char * patt ;
char * rest ;
Oid typeid = patt_const - > consttype ;
bool is_basic = regex_flavor_is_basic ( ) ;
bool is_multibyte = ( pg_database_encoding_max_length ( ) > 1 ) ;
/*
* Should be unnecessary , there are no bytea regex operators defined . As
@ -3517,8 +3528,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
/* the right-hand const is type text for all of these */
patt = DatumGetCString ( DirectFunctionCall1 ( textout , patt_const - > constvalue ) ) ;
/*
* Check for ARE director prefix . It ' s worth our trouble to recognize
* this because similar_escape ( ) uses it .
*/
pos = 0 ;
if ( strncmp ( patt , " ***: " , 4 ) = = 0 )
{
pos = 4 ;
is_basic = false ;
}
/* Pattern must be anchored left */
if ( patt [ 0 ] ! = ' ^ ' )
if ( patt [ pos ] ! = ' ^ ' )
{
rest = patt ;
@ -3527,72 +3549,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
return Pattern_Prefix_None ;
}
pos + + ;
/*
* If unquoted | is present at paren level 0 in pattern , then there are
* multiple alternatives for the start of the string .
* If ' | ' is present in pattern , then there may be multiple alternatives
* for the start of the string . ( There are cases where this isn ' t so ,
* for instance if the ' | ' is inside parens , but detecting that reliably
* is too hard . )
*/
paren_depth = 0 ;
for ( pos = 1 ; patt [ pos ] ; pos + + )
if ( strchr ( patt + pos , ' | ' ) ! = NULL )
{
if ( patt [ pos ] = = ' | ' & & paren_depth = = 0 )
{
rest = patt ;
rest = patt ;
* prefix_const = NULL ;
* rest_const = string_to_const ( rest , typeid ) ;
* prefix_const = NULL ;
* rest_const = string_to_const ( rest , typeid ) ;
return Pattern_Prefix_None ;
}
else if ( patt [ pos ] = = ' ( ' )
paren_depth + + ;
else if ( patt [ pos ] = = ' ) ' & & paren_depth > 0 )
paren_depth - - ;
else if ( patt [ pos ] = = ' \\ ' )
{
/* backslash quotes the next character */
pos + + ;
if ( patt [ pos ] = = ' \0 ' )
break ;
}
return Pattern_Prefix_None ;
}
/* OK, allocate space for pattern */
match = palloc ( strlen ( patt ) + 1 ) ;
prev_match_pos = match_pos = 0 ;
/* note start at pos 1 to skip leading ^ */
for ( prev_pos = pos = 1 ; patt [ pos ] ; )
/*
* We special - case the syntax ' ^ ( . . . ) $ ' because psql uses it . But beware :
* in BRE mode these parentheses are just ordinary characters . Also ,
* sequences beginning " (? " are not what they seem , unless they ' re " (?: " .
* ( We should recognize that , too , because of similar_escape ( ) . )
*
* Note : it ' s a bit bogus to be depending on the current regex_flavor
* setting here , because the setting could change before the pattern is
* used . We minimize the risk by trusting the flavor as little as we can ,
* but perhaps it would be a good idea to get rid of the " basic " setting .
*/
have_leading_paren = false ;
if ( patt [ pos ] = = ' ( ' & & ! is_basic & &
( patt [ pos + 1 ] ! = ' ? ' | | patt [ pos + 2 ] = = ' : ' ) )
{
have_leading_paren = true ;
pos + = ( patt [ pos + 1 ] ! = ' ? ' ? 1 : 3 ) ;
}
/* Scan remainder of pattern */
prev_pos = pos ;
while ( patt [ pos ] )
{
int len ;
/*
* Check for characters that indicate multiple possible matches here .
* XXX I suspect isalpha ( ) is not an adequately locale - sensitive test
* for characters that can vary under case folding ?
* Also , drop out at ' ) ' or ' $ ' so the termination test works right .
*/
if ( patt [ pos ] = = ' . ' | |
patt [ pos ] = = ' ( ' | |
patt [ pos ] = = ' ) ' | |
patt [ pos ] = = ' [ ' | |
patt [ pos ] = = ' $ ' | |
( case_insensitive & & isalpha ( ( unsigned char ) patt [ pos ] ) ) )
patt [ pos ] = = ' ^ ' | |
patt [ pos ] = = ' $ ' )
break ;
/*
* In AREs , backslash followed by alphanumeric is an escape , not a
* quoted character . Must treat it as having multiple possible
* matches .
* XXX In multibyte character sets , we can ' t trust isalpha , so assume
* any multibyte char is potentially case - varying .
*/
if ( patt [ pos ] = = ' \\ ' & & isalnum ( ( unsigned char ) patt [ pos + 1 ] ) )
break ;
if ( case_insensitive )
{
if ( is_multibyte & & ( unsigned char ) patt [ pos ] > = 0x80 )
break ;
if ( isalpha ( ( unsigned char ) patt [ pos ] ) )
break ;
}
/*
* Check for quantifiers . Except for + , this means the preceding
* character is optional , so we must remove it from the prefix too !
* Note : in BREs , \ { is a quantifier .
*/
if ( patt [ pos ] = = ' * ' | |
patt [ pos ] = = ' ? ' | |
patt [ pos ] = = ' { ' )
patt [ pos ] = = ' { ' | |
( patt [ pos ] = = ' \\ ' & & patt [ pos + 1 ] = = ' { ' ) )
{
match_pos = prev_match_pos ;
pos = prev_pos ;
@ -3603,9 +3639,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
pos = prev_pos ;
break ;
}
/*
* Normally , backslash quotes the next character . But in AREs ,
* backslash followed by alphanumeric is an escape , not a quoted
* character . Must treat it as having multiple possible matches .
* In BREs , \ ( is a parenthesis , so don ' t trust that either .
* Note : since only ASCII alphanumerics are escapes , we don ' t have
* to be paranoid about multibyte here .
*/
if ( patt [ pos ] = = ' \\ ' )
{
/* backslash quotes the next character */
if ( isalnum ( ( unsigned char ) patt [ pos + 1 ] ) | | patt [ pos + 1 ] = = ' ( ' )
break ;
pos + + ;
if ( patt [ pos ] = = ' \0 ' )
break ;
@ -3623,6 +3669,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
match [ match_pos ] = ' \0 ' ;
rest = & patt [ pos ] ;
if ( have_leading_paren & & patt [ pos ] = = ' ) ' )
pos + + ;
if ( patt [ pos ] = = ' $ ' & & patt [ pos + 1 ] = = ' \0 ' )
{
rest = & patt [ pos + 1 ] ;