@ -10,11 +10,7 @@
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
# ifdef FRONTEND
# include "postgres_fe.h"
# else
# include "postgres.h"
# endif
# include "c.h"
# include "mb/pg_wchar.h"
@ -838,6 +834,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
return cnt ;
}
/* exported for direct use by conv.c */
int
pg_mule_mblen ( const unsigned char * s )
{
@ -1498,214 +1495,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
return true ;
}
# ifndef FRONTEND
/*
* Generic character incrementer function .
*
* Not knowing anything about the properties of the encoding in use , we just
* keep incrementing the last byte until we get a validly - encoded result ,
* or we run out of values to try . We don ' t bother to try incrementing
* higher - order bytes , so there ' s no growth in runtime for wider characters .
* ( If we did try to do that , we ' d need to consider the likelihood that 255
* is not a valid final byte in the encoding . )
*/
static bool
pg_generic_charinc ( unsigned char * charptr , int len )
{
unsigned char * lastbyte = charptr + len - 1 ;
mbverifier mbverify ;
/* We can just invoke the character verifier directly. */
mbverify = pg_wchar_table [ GetDatabaseEncoding ( ) ] . mbverify ;
while ( * lastbyte < ( unsigned char ) 255 )
{
( * lastbyte ) + + ;
if ( ( * mbverify ) ( charptr , len ) = = len )
return true ;
}
return false ;
}
/*
* UTF - 8 character incrementer function .
*
* For a one - byte character less than 0x7F , we just increment the byte .
*
* For a multibyte character , every byte but the first must fall between 0x80
* and 0xBF ; and the first byte must be between 0xC0 and 0xF4 . We increment
* the last byte that ' s not already at its maximum value . If we can ' t find a
* byte that ' s less than the maximum allowable value , we simply fail . We also
* need some special - case logic to skip regions used for surrogate pair
* handling , as those should not occur in valid UTF - 8.
*
* Note that we don ' t reset lower - order bytes back to their minimums , since
* we can ' t afford to make an exhaustive search ( see make_greater_string ) .
*/
static bool
pg_utf8_increment ( unsigned char * charptr , int length )
{
unsigned char a ;
unsigned char limit ;
switch ( length )
{
default :
/* reject lengths 5 and 6 for now */
return false ;
case 4 :
a = charptr [ 3 ] ;
if ( a < 0xBF )
{
charptr [ 3 ] + + ;
break ;
}
/* FALL THRU */
case 3 :
a = charptr [ 2 ] ;
if ( a < 0xBF )
{
charptr [ 2 ] + + ;
break ;
}
/* FALL THRU */
case 2 :
a = charptr [ 1 ] ;
switch ( * charptr )
{
case 0xED :
limit = 0x9F ;
break ;
case 0xF4 :
limit = 0x8F ;
break ;
default :
limit = 0xBF ;
break ;
}
if ( a < limit )
{
charptr [ 1 ] + + ;
break ;
}
/* FALL THRU */
case 1 :
a = * charptr ;
if ( a = = 0x7F | | a = = 0xDF | | a = = 0xEF | | a = = 0xF4 )
return false ;
charptr [ 0 ] + + ;
break ;
}
return true ;
}
/*
* EUC - JP character incrementer function .
*
* If the sequence starts with SS2 ( 0x8e ) , it must be a two - byte sequence
* representing JIS X 0201 characters with the second byte ranging between
* 0xa1 and 0xdf . We just increment the last byte if it ' s less than 0xdf ,
* and otherwise rewrite the whole sequence to 0xa1 0xa1 .
*
* If the sequence starts with SS3 ( 0x8f ) , it must be a three - byte sequence
* in which the last two bytes range between 0xa1 and 0xfe . The last byte
* is incremented if possible , otherwise the second - to - last byte .
*
* If the sequence starts with a value other than the above and its MSB
* is set , it must be a two - byte sequence representing JIS X 020 8 characters
* with both bytes ranging between 0xa1 and 0xfe . The last byte is
* incremented if possible , otherwise the second - to - last byte .
*
* Otherwise , the sequence is a single - byte ASCII character . It is
* incremented up to 0x7f .
*/
static bool
pg_eucjp_increment ( unsigned char * charptr , int length )
{
unsigned char c1 ,
c2 ;
int i ;
c1 = * charptr ;
switch ( c1 )
{
case SS2 : /* JIS X 0201 */
if ( length ! = 2 )
return false ;
c2 = charptr [ 1 ] ;
if ( c2 > = 0xdf )
charptr [ 0 ] = charptr [ 1 ] = 0xa1 ;
else if ( c2 < 0xa1 )
charptr [ 1 ] = 0xa1 ;
else
charptr [ 1 ] + + ;
break ;
case SS3 : /* JIS X 0212 */
if ( length ! = 3 )
return false ;
for ( i = 2 ; i > 0 ; i - - )
{
c2 = charptr [ i ] ;
if ( c2 < 0xa1 )
{
charptr [ i ] = 0xa1 ;
return true ;
}
else if ( c2 < 0xfe )
{
charptr [ i ] + + ;
return true ;
}
}
/* Out of 3-byte code region */
return false ;
default :
if ( IS_HIGHBIT_SET ( c1 ) ) /* JIS X 0208? */
{
if ( length ! = 2 )
return false ;
for ( i = 1 ; i > = 0 ; i - - )
{
c2 = charptr [ i ] ;
if ( c2 < 0xa1 )
{
charptr [ i ] = 0xa1 ;
return true ;
}
else if ( c2 < 0xfe )
{
charptr [ i ] + + ;
return true ;
}
}
/* Out of 2 byte code region */
return false ;
}
else
{ /* ASCII, single byte */
if ( c1 > 0x7e )
return false ;
( * charptr ) + + ;
}
break ;
}
return true ;
}
# endif /* !FRONTEND */
/*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -1758,13 +1547,6 @@ const pg_wchar_tbl pg_wchar_table[] = {
{ 0 , 0 , pg_sjis_mblen , pg_sjis_dsplen , pg_sjis_verifier , 2 } /* PG_SHIFT_JIS_2004 */
} ;
/* returns the byte length of a word for mule internal code */
int
pg_mic_mblen ( const unsigned char * mbstr )
{
return pg_mule_mblen ( mbstr ) ;
}
/*
* Returns the byte length of a multibyte character .
*/
@ -1810,232 +1592,3 @@ pg_encoding_max_length(int encoding)
return pg_wchar_table [ encoding ] . maxmblen ;
}
# ifndef FRONTEND
/*
* fetch maximum length of the encoding for the current database
*/
int
pg_database_encoding_max_length ( void )
{
return pg_wchar_table [ GetDatabaseEncoding ( ) ] . maxmblen ;
}
/*
* get the character incrementer for the encoding for the current database
*/
mbcharacter_incrementer
pg_database_encoding_character_incrementer ( void )
{
/*
* Eventually it might be best to add a field to pg_wchar_table [ ] , but for
* now we just use a switch .
*/
switch ( GetDatabaseEncoding ( ) )
{
case PG_UTF8 :
return pg_utf8_increment ;
case PG_EUC_JP :
return pg_eucjp_increment ;
default :
return pg_generic_charinc ;
}
}
/*
* Verify mbstr to make sure that it is validly encoded in the current
* database encoding . Otherwise same as pg_verify_mbstr ( ) .
*/
bool
pg_verifymbstr ( const char * mbstr , int len , bool noError )
{
return
pg_verify_mbstr_len ( GetDatabaseEncoding ( ) , mbstr , len , noError ) > = 0 ;
}
/*
* Verify mbstr to make sure that it is validly encoded in the specified
* encoding .
*/
bool
pg_verify_mbstr ( int encoding , const char * mbstr , int len , bool noError )
{
return pg_verify_mbstr_len ( encoding , mbstr , len , noError ) > = 0 ;
}
/*
* Verify mbstr to make sure that it is validly encoded in the specified
* encoding .
*
* mbstr is not necessarily zero terminated ; length of mbstr is
* specified by len .
*
* If OK , return length of string in the encoding .
* If a problem is found , return - 1 when noError is
* true ; when noError is false , ereport ( ) a descriptive message .
*/
int
pg_verify_mbstr_len ( int encoding , const char * mbstr , int len , bool noError )
{
mbverifier mbverify ;
int mb_len ;
Assert ( PG_VALID_ENCODING ( encoding ) ) ;
/*
* In single - byte encodings , we need only reject nulls ( \ 0 ) .
*/
if ( pg_encoding_max_length ( encoding ) < = 1 )
{
const char * nullpos = memchr ( mbstr , 0 , len ) ;
if ( nullpos = = NULL )
return len ;
if ( noError )
return - 1 ;
report_invalid_encoding ( encoding , nullpos , 1 ) ;
}
/* fetch function pointer just once */
mbverify = pg_wchar_table [ encoding ] . mbverify ;
mb_len = 0 ;
while ( len > 0 )
{
int l ;
/* fast path for ASCII-subset characters */
if ( ! IS_HIGHBIT_SET ( * mbstr ) )
{
if ( * mbstr ! = ' \0 ' )
{
mb_len + + ;
mbstr + + ;
len - - ;
continue ;
}
if ( noError )
return - 1 ;
report_invalid_encoding ( encoding , mbstr , len ) ;
}
l = ( * mbverify ) ( ( const unsigned char * ) mbstr , len ) ;
if ( l < 0 )
{
if ( noError )
return - 1 ;
report_invalid_encoding ( encoding , mbstr , len ) ;
}
mbstr + = l ;
len - = l ;
mb_len + + ;
}
return mb_len ;
}
/*
* check_encoding_conversion_args : check arguments of a conversion function
*
* " expected " arguments can be either an encoding ID or - 1 to indicate that
* the caller will check whether it accepts the ID .
*
* Note : the errors here are not really user - facing , so elog instead of
* ereport seems sufficient . Also , we trust that the " expected " encoding
* arguments are valid encoding IDs , but we don ' t trust the actuals .
*/
void
check_encoding_conversion_args ( int src_encoding ,
int dest_encoding ,
int len ,
int expected_src_encoding ,
int expected_dest_encoding )
{
if ( ! PG_VALID_ENCODING ( src_encoding ) )
elog ( ERROR , " invalid source encoding ID: %d " , src_encoding ) ;
if ( src_encoding ! = expected_src_encoding & & expected_src_encoding > = 0 )
elog ( ERROR , " expected source encoding \" %s \" , but got \" %s \" " ,
pg_enc2name_tbl [ expected_src_encoding ] . name ,
pg_enc2name_tbl [ src_encoding ] . name ) ;
if ( ! PG_VALID_ENCODING ( dest_encoding ) )
elog ( ERROR , " invalid destination encoding ID: %d " , dest_encoding ) ;
if ( dest_encoding ! = expected_dest_encoding & & expected_dest_encoding > = 0 )
elog ( ERROR , " expected destination encoding \" %s \" , but got \" %s \" " ,
pg_enc2name_tbl [ expected_dest_encoding ] . name ,
pg_enc2name_tbl [ dest_encoding ] . name ) ;
if ( len < 0 )
elog ( ERROR , " encoding conversion length must not be negative " ) ;
}
/*
* report_invalid_encoding : complain about invalid multibyte character
*
* note : len is remaining length of string , not length of character ;
* len must be greater than zero , as we always examine the first byte .
*/
void
report_invalid_encoding ( int encoding , const char * mbstr , int len )
{
int l = pg_encoding_mblen ( encoding , mbstr ) ;
char buf [ 8 * 5 + 1 ] ;
char * p = buf ;
int j ,
jlimit ;
jlimit = Min ( l , len ) ;
jlimit = Min ( jlimit , 8 ) ; /* prevent buffer overrun */
for ( j = 0 ; j < jlimit ; j + + )
{
p + = sprintf ( p , " 0x%02x " , ( unsigned char ) mbstr [ j ] ) ;
if ( j < jlimit - 1 )
p + = sprintf ( p , " " ) ;
}
ereport ( ERROR ,
( errcode ( ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ) ,
errmsg ( " invalid byte sequence for encoding \" %s \" : %s " ,
pg_enc2name_tbl [ encoding ] . name ,
buf ) ) ) ;
}
/*
* report_untranslatable_char : complain about untranslatable character
*
* note : len is remaining length of string , not length of character ;
* len must be greater than zero , as we always examine the first byte .
*/
void
report_untranslatable_char ( int src_encoding , int dest_encoding ,
const char * mbstr , int len )
{
int l = pg_encoding_mblen ( src_encoding , mbstr ) ;
char buf [ 8 * 5 + 1 ] ;
char * p = buf ;
int j ,
jlimit ;
jlimit = Min ( l , len ) ;
jlimit = Min ( jlimit , 8 ) ; /* prevent buffer overrun */
for ( j = 0 ; j < jlimit ; j + + )
{
p + = sprintf ( p , " 0x%02x " , ( unsigned char ) mbstr [ j ] ) ;
if ( j < jlimit - 1 )
p + = sprintf ( p , " " ) ;
}
ereport ( ERROR ,
( errcode ( ERRCODE_UNTRANSLATABLE_CHARACTER ) ,
errmsg ( " character with byte sequence %s in encoding \" %s \" has no equivalent in encoding \" %s \" " ,
buf ,
pg_enc2name_tbl [ src_encoding ] . name ,
pg_enc2name_tbl [ dest_encoding ] . name ) ) ) ;
}
# endif /* !FRONTEND */