@ -16,6 +16,7 @@
# include "catalog/pg_opfamily.h"
# include "catalog/pg_type.h"
# include "common/ip.h"
# include "lib/hyperloglog.h"
# include "libpq/libpq-be.h"
# include "libpq/pqformat.h"
# include "miscadmin.h"
@ -24,12 +25,37 @@
# include "nodes/supportnodes.h"
# include "utils/builtins.h"
# include "utils/fmgroids.h"
# include "utils/guc.h"
# include "utils/hashutils.h"
# include "utils/inet.h"
# include "utils/lsyscache.h"
# include "utils/sortsupport.h"
/*
* An IPv4 netmask size is a value in the range of 0 - 32 , which is
* represented with 6 bits in inet / cidr abbreviated keys where possible .
*
* An IPv4 inet / cidr abbreviated key can use up to 25 bits for subnet
* component .
*/
# define ABBREV_BITS_INET4_NETMASK_SIZE 6
# define ABBREV_BITS_INET4_SUBNET 25
/* sortsupport for inet/cidr */
typedef struct
{
int64 input_count ; /* number of non-null values seen */
bool estimating ; /* true if estimating cardinality */
hyperLogLogState abbr_card ; /* cardinality estimator */
} network_sortsupport_state ;
static int32 network_cmp_internal ( inet * a1 , inet * a2 ) ;
static int network_fast_cmp ( Datum x , Datum y , SortSupport ssup ) ;
static int network_cmp_abbrev ( Datum x , Datum y , SortSupport ssup ) ;
static bool network_abbrev_abort ( int memtupcount , SortSupport ssup ) ;
static Datum network_abbrev_convert ( Datum original , SortSupport ssup ) ;
static List * match_network_function ( Node * leftop ,
Node * rightop ,
int indexarg ,
@ -405,6 +431,379 @@ network_cmp(PG_FUNCTION_ARGS)
PG_RETURN_INT32 ( network_cmp_internal ( a1 , a2 ) ) ;
}
/*
* SortSupport strategy routine
*/
Datum
network_sortsupport ( PG_FUNCTION_ARGS )
{
SortSupport ssup = ( SortSupport ) PG_GETARG_POINTER ( 0 ) ;
ssup - > comparator = network_fast_cmp ;
ssup - > ssup_extra = NULL ;
if ( ssup - > abbreviate )
{
network_sortsupport_state * uss ;
MemoryContext oldcontext ;
oldcontext = MemoryContextSwitchTo ( ssup - > ssup_cxt ) ;
uss = palloc ( sizeof ( network_sortsupport_state ) ) ;
uss - > input_count = 0 ;
uss - > estimating = true ;
initHyperLogLog ( & uss - > abbr_card , 10 ) ;
ssup - > ssup_extra = uss ;
ssup - > comparator = network_cmp_abbrev ;
ssup - > abbrev_converter = network_abbrev_convert ;
ssup - > abbrev_abort = network_abbrev_abort ;
ssup - > abbrev_full_comparator = network_fast_cmp ;
MemoryContextSwitchTo ( oldcontext ) ;
}
PG_RETURN_VOID ( ) ;
}
/*
* SortSupport comparison func
*/
static int
network_fast_cmp ( Datum x , Datum y , SortSupport ssup )
{
inet * arg1 = DatumGetInetPP ( x ) ;
inet * arg2 = DatumGetInetPP ( y ) ;
return network_cmp_internal ( arg1 , arg2 ) ;
}
/*
* Abbreviated key comparison func
*/
static int
network_cmp_abbrev ( Datum x , Datum y , SortSupport ssup )
{
if ( x > y )
return 1 ;
else if ( x = = y )
return 0 ;
else
return - 1 ;
}
/*
* Callback for estimating effectiveness of abbreviated key optimization .
*
* We pay no attention to the cardinality of the non - abbreviated data , because
* there is no equality fast - path within authoritative inet comparator .
*/
static bool
network_abbrev_abort ( int memtupcount , SortSupport ssup )
{
network_sortsupport_state * uss = ssup - > ssup_extra ;
double abbr_card ;
if ( memtupcount < 10000 | | uss - > input_count < 10000 | | ! uss - > estimating )
return false ;
abbr_card = estimateHyperLogLog ( & uss - > abbr_card ) ;
/*
* If we have > 100 k distinct values , then even if we were sorting many
* billion rows we ' d likely still break even , and the penalty of undoing
* that many rows of abbrevs would probably not be worth it . At this point
* we stop counting because we know that we ' re now fully committed .
*/
if ( abbr_card > 100000.0 )
{
# ifdef TRACE_SORT
if ( trace_sort )
elog ( LOG ,
" network_abbrev: estimation ends at cardinality %f "
" after " INT64_FORMAT " values (%d rows) " ,
abbr_card , uss - > input_count , memtupcount ) ;
# endif
uss - > estimating = false ;
return false ;
}
/*
* Target minimum cardinality is 1 per ~ 2 k of non - null inputs . 0.5 row
* fudge factor allows us to abort earlier on genuinely pathological data
* where we ' ve had exactly one abbreviated value in the first 2 k
* ( non - null ) rows .
*/
if ( abbr_card < uss - > input_count / 2000.0 + 0.5 )
{
# ifdef TRACE_SORT
if ( trace_sort )
elog ( LOG ,
" network_abbrev: aborting abbreviation at cardinality %f "
" below threshold %f after " INT64_FORMAT " values (%d rows) " ,
abbr_card , uss - > input_count / 2000.0 + 0.5 , uss - > input_count ,
memtupcount ) ;
# endif
return true ;
}
# ifdef TRACE_SORT
if ( trace_sort )
elog ( LOG ,
" network_abbrev: cardinality %f after " INT64_FORMAT
" values (%d rows) " , abbr_card , uss - > input_count , memtupcount ) ;
# endif
return false ;
}
/*
* SortSupport conversion routine . Converts original inet / cidr representation
* to abbreviated key representation that works with simple 3 - way unsigned int
* comparisons . The network_cmp_internal ( ) rules for sorting inet / cidr datums
* are followed by abbreviated comparisons by an encoding scheme that
* conditions keys through careful use of padding .
*
* Some background : inet values have three major components ( take for example
* the address 1.2 .3 .4 / 24 ) :
*
* * A network , or netmasked bits ( 1.2 .3 .0 ) .
* * A netmask size ( / 24 ) .
* * A subnet , or bits outside of the netmask ( 0.0 .0 .4 ) .
*
* cidr values are the same except that with only the first two components - -
* all their subnet bits * must * be zero ( 1.2 .3 .0 / 24 ) .
*
* IPv4 and IPv6 are identical in this makeup , with the difference being that
* IPv4 addresses have a maximum of 32 bits compared to IPv6 ' s 64 bits , so in
* IPv6 each part may be larger .
*
* inet / cdir types compare using these sorting rules . If inequality is detected
* at any step , comparison is finished . If any rule is a tie , the algorithm
* drops through to the next to break it :
*
* 1. IPv4 always appears before IPv6 .
* 2. Network bits are compared .
* 3. Netmask size is compared .
* 4. All bits are compared ( having made it here , we know that both
* netmasked bits and netmask size are equal , so we ' re in effect only
* comparing subnet bits ) .
*
* When generating abbreviated keys for SortSupport , we pack as much as we can
* into a datum while ensuring that when comparing those keys as integers ,
* these rules will be respected . Exact contents depend on IP family and datum
* size .
*
* IPv4
* - - - -
*
* 4 byte datums :
*
* Start with 1 bit for the IP family ( IPv4 or IPv6 ; this bit is present in
* every case below ) followed by all but 1 of the netmasked bits .
*
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - +
* | 1 bit IP | 31 bits network | ( 1 bit network
* | family | ( truncated ) | omitted )
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - +
*
* 8 byte datums :
*
* We have space to store all netmasked bits , followed by the netmask size ,
* followed by 25 bits of the subnet ( 25 bits is usually more than enough in
* practice ) . cidr datums always have all - zero subnet bits .
*
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - +
* | 1 bit IP | 32 bits network | 6 bits | 25 bits subnet |
* | family | ( full ) | network size | ( truncated ) |
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - +
*
* IPv6
* - - - -
*
* 4 byte datums :
*
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - +
* | 1 bit IP | 31 bits network | ( up to 97 bits
* | family | ( truncated ) | network omitted )
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - +
*
* 8 byte datums :
*
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
* | 1 bit IP | 63 bits network | ( up to 65 bits
* | family | ( truncated ) | network omitted )
* + - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +
*/
static Datum
network_abbrev_convert ( Datum original , SortSupport ssup )
{
network_sortsupport_state * uss = ssup - > ssup_extra ;
inet * authoritative = DatumGetInetPP ( original ) ;
Datum res ,
ipaddr_datum ,
subnet_bitmask ,
network ;
int subnet_size ;
Assert ( ip_family ( authoritative ) = = PGSQL_AF_INET | |
ip_family ( authoritative ) = = PGSQL_AF_INET6 ) ;
/*
* Get an unsigned integer representation of the IP address by taking its
* first 4 or 8 bytes . Always take all 4 bytes of an IPv4 address . Take
* the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes
* otherwise .
*
* We ' re consuming an array of unsigned char , so byteswap on little endian
* systems ( an inet ' s ipaddr field stores the most significant byte
* first ) .
*/
if ( ip_family ( authoritative ) = = PGSQL_AF_INET )
{
uint32 ipaddr_datum32 ;
memcpy ( & ipaddr_datum32 , ip_addr ( authoritative ) , sizeof ( uint32 ) ) ;
/* Must byteswap on little-endian machines */
# ifndef WORDS_BIGENDIAN
ipaddr_datum = pg_bswap32 ( ipaddr_datum32 ) ;
# else
ipaddr_datum = ipaddr_datum32 ;
# endif
/* Initialize result without setting ipfamily bit */
res = ( Datum ) 0 ;
}
else
{
memcpy ( & ipaddr_datum , ip_addr ( authoritative ) , sizeof ( Datum ) ) ;
/* Must byteswap on little-endian machines */
ipaddr_datum = DatumBigEndianToNative ( ipaddr_datum ) ;
/* Initialize result with ipfamily (most significant) bit set */
res = ( ( Datum ) 1 ) < < ( SIZEOF_DATUM * BITS_PER_BYTE - 1 ) ;
}
/*
* ipaddr_datum must be " split " : high order bits go in " network " component
* of abbreviated key ( often with zeroed bits at the end due to masking ) ,
* while low order bits go in " subnet " component when there is space for
* one . This is often accomplished by generating a temp datum subnet
* bitmask , which we may reuse later when generating the subnet bits
* themselves . ( Note that subnet bits are only used with IPv4 datums on
* platforms where datum is 8 bytes . )
*
* The number of bits in subnet is used to generate a datum subnet
* bitmask . For example , with a / 24 IPv4 datum there are 8 subnet bits
* ( since 32 - 24 is 8 ) , so the final subnet bitmask is B ' 1111 1111 ' . We
* need explicit handling for cases where the ipaddr bits cannot all fit
* in a datum , though ( otherwise we ' d incorrectly mask the network
* component with IPv6 values ) .
*/
subnet_size = ip_maxbits ( authoritative ) - ip_bits ( authoritative ) ;
Assert ( subnet_size > = 0 ) ;
/* subnet size must work with prefix ipaddr cases */
subnet_size % = SIZEOF_DATUM * BITS_PER_BYTE ;
if ( ip_bits ( authoritative ) = = 0 )
{
/* Fit as many ipaddr bits as possible into subnet */
subnet_bitmask = ( ( Datum ) 0 ) - 1 ;
network = 0 ;
}
else if ( ip_bits ( authoritative ) < SIZEOF_DATUM * BITS_PER_BYTE )
{
/* Split ipaddr bits between network and subnet */
subnet_bitmask = ( ( ( Datum ) 1 ) < < subnet_size ) - 1 ;
network = ipaddr_datum & ~ subnet_bitmask ;
}
else
{
/* Fit as many ipaddr bits as possible into network */
subnet_bitmask = 0 ;
network = ipaddr_datum ;
}
# if SIZEOF_DATUM == 8
if ( ip_family ( authoritative ) = = PGSQL_AF_INET )
{
/*
* IPv4 with 8 byte datums : keep all 32 netmasked bits , netmask size ,
* and most significant 25 subnet bits
*/
Datum netmask_size = ( Datum ) ip_bits ( authoritative ) ;
Datum subnet ;
/*
* Shift left 31 bits : 6 bits netmask size + 25 subnet bits .
*
* We don ' t make any distinction between network bits that are zero
* due to masking and " true " / non - masked zero bits . An abbreviated
* comparison that is resolved by comparing a non - masked and non - zero
* bit to a masked / zeroed bit is effectively resolved based on
* ip_bits ( ) , even though the comparison won ' t reach the netmask_size
* bits .
*/
network < < = ( ABBREV_BITS_INET4_NETMASK_SIZE +
ABBREV_BITS_INET4_SUBNET ) ;
/* Shift size to make room for subnet bits at the end */
netmask_size < < = ABBREV_BITS_INET4_SUBNET ;
/* Extract subnet bits without shifting them */
subnet = ipaddr_datum & subnet_bitmask ;
/*
* If we have more than 25 subnet bits , we can ' t fit everything . Shift
* subnet down to avoid clobbering bits that are only supposed to be
* used for netmask_size .
*
* Discarding the least significant subnet bits like this is correct
* because abbreviated comparisons that are resolved at the subnet
* level must have had equal netmask_size / ip_bits ( ) values in order to
* get that far .
*/
if ( subnet_size > ABBREV_BITS_INET4_SUBNET )
subnet > > = subnet_size - ABBREV_BITS_INET4_SUBNET ;
/*
* Assemble the final abbreviated key without clobbering the ipfamily
* bit that must remain a zero .
*/
res | = network | netmask_size | subnet ;
}
else
# endif
{
/*
* 4 byte datums , or IPv6 with 8 byte datums : Use as many of the
* netmasked bits as will fit in final abbreviated key . Avoid
* clobbering the ipfamily bit that was set earlier .
*/
res | = network > > 1 ;
}
uss - > input_count + = 1 ;
/* Hash abbreviated key */
if ( uss - > estimating )
{
uint32 tmp ;
# if SIZEOF_DATUM == 8
tmp = ( uint32 ) res ^ ( uint32 ) ( ( uint64 ) res > > 32 ) ;
# else /* SIZEOF_DATUM != 8 */
tmp = ( uint32 ) res ;
# endif
addHyperLogLog ( & uss - > abbr_card , DatumGetUInt32 ( hash_uint32 ( tmp ) ) ) ;
}
return res ;
}
/*
* Boolean ordering tests .
*/