@ -64,34 +64,30 @@
* Phishing design documentation ,
( initially written at http : //wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
TODO : update this doc
*Warning*: if the flag *--phish-scan-alldomains* (or the equivalent clamd/clamav-milter config option) isn't given, then phishing scanning is done only for domains listed in daily.pdb.
If your daily.pdb is empty, then by default NO PHISHING scanning is DONE, UNLESS you give the *--phish-scan-alldomains* flag.
This is just a side-effect: daily.pdb is empty because it isn't yet officially in daily.cvd.
TODO : update this doc whenever behaviour changes
phishingCheck ( ) determines if @ displayedLink is a legit representation of @ realLink .
Steps :
1. if _realLink_ * = = * _displayLink_ = > * CLEAN *
1. if _realLink_ = = _displayLink_ = > CLEAN
2. url cleanup ( normalization )
- whitespace elimination
strip all spaces , and leading and trailing garbage .
When matching we have to take into account whether we stripped any spaces or not.
See str_fixup_spaces .
- html entity conversion
- handle hex - encoded characters
- convert hostname to lowercase
- normalize \ to /
If there is a dot after the last space , then all spaces are replaced with dots ,
otherwise spaces are stripped .
So both : ' Go to yahoo . com ' , and ' Go to e b a y . c o m ' , and ' Go to ebay . com ' will work .
3. Match the URLs against a _whitelist_:
a _realLink_ , _displayedLink_ pair is matched against the _whitelist_ .
the _whitelist_ is a list of pairs of realLink , displayedLink . Any of the elements of those pairs can be a _regex_ .
if url * is found * in _whitelist_ - - > * CLEAN *
4. URL is looked up in the _domainlist_ , unless disabled via flags ( _ - - phish - scan - alldomains_ ) .
4. URL is looked up in the _domainlist_
The _domainlist_ is a list of pairs of realLink , displayedLink ( any of which can be regex ) .
This is the list of domains we do phishing detection for ( such as ebay , paypal , chase , . . . . )
We can't decide at this point whether to stop processing, so we just set a flag.
@ -120,7 +116,6 @@ Checks if realLink is http, but displayedLink is https or viceversa.
12. Numeric IP detection .
If url is a numeric IP , then - > phish .
Maybe we should do DNS lookup ?
Maybe we should disable numericIP checks for - - phish - scan - alldomains ?
13. isURL ( displayedLink ) .
Checks if displayedLink is really a url .
@ -782,7 +777,6 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
if ( hrefs - > contents [ i ] ) {
struct url_check urls ;
enum phish_status rc ;
urls . always_check_flags = DOMAINLIST_REQUIRED ; /* required to work correctly */
urls . flags = strncmp ( ( char * ) hrefs - > tag [ i ] , href_text , href_text_len ) ? ( CL_PHISH_ALL_CHECKS & ~ CHECK_SSL ) : CL_PHISH_ALL_CHECKS ;
urls . link_type = 0 ;
if ( ! strncmp ( ( char * ) hrefs - > tag [ i ] , src_text , src_text_len ) ) {
@ -790,8 +784,6 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
continue ;
urls . link_type | = LINKTYPE_IMAGE ;
}
if ( ctx - > options & CL_SCAN_PHISHING_DOMAINLIST )
urls . flags | = DOMAINLIST_REQUIRED ;
if ( ctx - > options & CL_SCAN_PHISHING_BLOCKSSL ) {
urls . always_check_flags | = CHECK_SSL ;
}
@ -823,20 +815,6 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
switch ( rc ) /*TODO: support flags from ctx->options,*/
{
case CL_PHISH_CLEAN :
case CL_PHISH_CLEANUP_OK :
case CL_PHISH_HOST_OK :
case CL_PHISH_DOMAIN_OK :
case CL_PHISH_REDIR_OK :
case CL_PHISH_HOST_REDIR_OK :
case CL_PHISH_DOMAIN_REDIR_OK :
case CL_PHISH_HOST_REVERSE_OK :
case CL_PHISH_DOMAIN_REVERSE_OK :
case CL_PHISH_WHITELISTED :
case CL_PHISH_HOST_WHITELISTED :
case CL_PHISH_MAILTO_OK :
case CL_PHISH_TEXTURL :
case CL_PHISH_HOST_NOT_LISTED :
case CL_PHISH_CLEAN_CID :
continue ;
/* break;*/
case CL_PHISH_HEX_URL :
@ -1029,7 +1007,7 @@ static enum phish_status cleanupURLs(struct url_check* urls)
if ( ! urls - > displayLink . data | | ! urls - > realLink . data )
return CL_PHISH_NODECISION ;
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) )
return CL_PHISH_CLEANUP_OK ;
return CL_PHISH_CLEAN ;
}
return CL_PHISH_NODECISION ;
}
@ -1046,33 +1024,31 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
if ( ! start | | ! end ) {
string_assign_null ( host ) ;
}
else {
if ( ( rc = string_assign_dup ( host , start , end ) ) )
return rc ;
else if ( ( rc = string_assign_dup ( host , start , end ) ) ) {
return rc ;
}
cli_dbgmsg ( " Phishcheck:host:%s \n " , host - > data ) ;
if ( ! isReal ) {
url - > pre_fixup . host_start = start - URL ;
url - > pre_fixup . host_end = end - URL ;
}
if ( ! host - > data )
return CL_PHISH_CLEANUP_OK ;
if ( * phishy & REAL_IS_MAILTO )
return CL_PHISH_MAILTO_OK ;
if ( strchr ( host - > data , ' ' ) ) {
string_free ( host ) ;
return CL_PHISH_TEXTURL ;
if ( ! host - > data | | ( isReal & & host - > data [ 0 ] = = ' \0 ' ) | | * phishy & REAL_IS_MAILTO | | strchr ( host - > data , ' ' ) ) {
/* no host,
* link without domain , such as : href = " /isapi.dll?...
* mailto :
* spaces in hostname
*/
return CL_PHISH_CLEAN ;
}
if ( url - > flags & CHECK_CLOAKING & & ! cli_regexec ( & pchk - > preg_hexurl , host - > data , 0 , NULL , 0 ) ) {
/* uses a regex here, so that we don't accidentally block 0xacab.net style hosts */
string_free ( host ) ;
return CL_PHISH_HEX_URL ;
}
if ( isReal & & host - > data [ 0 ] = = ' \0 ' )
return CL_PHISH_CLEAN ; /* link without domain, such as: href="/isapi.dll?... */
if ( isNumeric ( host - > data ) ) {
* phishy | = PHISHY_NUMERIC_IP ;
}
if ( ! isReal ) {
url - > pre_fixup . host_start = start - URL ;
url - > pre_fixup . host_end = end - URL ;
}
return CL_PHISH_NODECISION ;
}
@ -1111,45 +1087,15 @@ static int whitelist_check(const struct cl_engine* engine,struct url_check* urls
return whitelist_match ( engine , urls - > realLink . data , urls - > displayLink . data , hostOnly ) ;
}
/*
 * Classifies a phish_status value as a phishing verdict or not.
 *
 * Returns 0 for the statuses enumerated in the switch below, and 1 for
 * every other value (including any value this switch does not list).
 */
static int isPhishing(enum phish_status rc)
{
	int verdict = 1; /* anything not explicitly listed below is reported as phishing */

	switch (rc) {
		/* exact / post-cleanup matches and whitelist hits */
		case CL_PHISH_CLEAN:
		case CL_PHISH_CLEAN_CID:
		case CL_PHISH_CLEANUP_OK:
		case CL_PHISH_WHITELISTED:
		case CL_PHISH_HOST_WHITELISTED:
		/* host / domain level matches */
		case CL_PHISH_HOST_OK:
		case CL_PHISH_DOMAIN_OK:
		case CL_PHISH_REDIR_OK:
		case CL_PHISH_HOST_REDIR_OK:
		case CL_PHISH_DOMAIN_REDIR_OK:
		case CL_PHISH_HOST_REVERSE_OK:
		case CL_PHISH_DOMAIN_REVERSE_OK:
		/* links that are not subject to a phishing decision */
		case CL_PHISH_MAILTO_OK:
		case CL_PHISH_TEXTURL:
		case CL_PHISH_HOST_NOT_LISTED:
			verdict = 0;
			break;
		default:
			/*
			 * CL_PHISH_HEX_URL, CL_PHISH_CLOAKED_NULL, CL_PHISH_SSL_SPOOF,
			 * CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP, CL_PHISH_NOMATCH
			 * and every unlisted value keep the initial verdict of 1.
			 */
			break;
	}

	return verdict;
}
/* urls can't contain null pointer, caller must ensure this */
static enum phish_status phishingCheck ( const struct cl_engine * engine , struct url_check * urls )
{
struct url_check host_url ;
enum phish_status rc = CL_PHISH_NODECISION ;
int rc = CL_PHISH_NODECISION ;
int phishy = 0 ;
const struct phishcheck * pchk = ( const struct phishcheck * ) engine - > phishcheck ;
if ( ! urls - > realLink . data )
if ( ! urls - > realLink . data | | urls - > displayLink . data [ 0 ] = = ' \0 ' )
return CL_PHISH_CLEAN ;
cli_dbgmsg ( " Phishcheck:Checking url %s->%s \n " , urls - > realLink . data ,
@ -1159,59 +1105,43 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
return CL_PHISH_CLEAN ; /* displayed and real URL are identical -> clean */
if ( ( rc = cleanupURLs ( urls ) ) ) {
if ( isPhishing ( rc ) ) /* not allowed to decide this is phishing */
return CL_PHISH_CLEAN ;
return rc ; /* URLs identical after cleanup */
/* it can only return an error, or say its clean;
* it is not allowed to decide it is phishing */
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( whitelist_check ( engine , urls , 0 ) )
return CL_PHISH_WHITELISTED ; /* if url is whitelist don't perform further checks */
if ( whitelist_check ( engine , urls , 0 ) )
return CL_PHISH_CLEAN ; /* if url is whitelisted don't perform further checks */
if ( ( ! isURL ( pchk , urls - > displayLink . data ) | | ! isRealURL ( pchk , urls - > realLink . data ) ) & &
if ( ( ! isURL ( pchk , urls - > displayLink . data ) | | ! isRealURL ( pchk , urls - > realLink . data ) ) & &
( ( phishy & PHISHY_NUMERIC_IP & & ! isNumericURL ( pchk , urls - > displayLink . data ) ) | |
! ( phishy & PHISHY_NUMERIC_IP ) ) ) {
cli_dbgmsg ( " Displayed 'url' is not url:%s \n " , urls - > displayLink . data ) ;
return CL_PHISH_TEXTURL ;
return CL_PHISH_CLEAN ;
}
if ( urls - > flags & DOMAINLIST_REQUIRED & & domainlist_match ( engine , urls - > realLink . data , urls - > displayLink . data , NULL , 0 , & urls - > flags ) )
if ( domainlist_match ( engine , urls - > realLink . data , urls - > displayLink . data , NULL , 0 , & urls - > flags ) ) {
phishy | = DOMAIN_LISTED ;
else {
} else {
/* although entire url is not listed, the host might be,
* so defer phishing decisions till we know if host is listed */
}
url_check_init ( & host_url ) ;
if ( ( rc = url_get_host ( pchk , urls , & host_url , DOMAIN_DISPLAY , & phishy ) ) ) {
if ( ( rc = url_get_host ( pchk , urls , & host_url , DOMAIN_DISPLAY , & phishy ) ) ) {
free_if_needed ( & host_url ) ;
if ( isPhishing ( rc ) )
return CL_PHISH_CLEAN ;
return rc ;
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( urls - > flags & DOMAINLIST_REQUIRED ) {
if ( ! ( phishy & DOMAIN_LISTED ) ) {
if ( domainlist_match ( engine , host_url . displayLink . data , host_url . realLink . data , & urls - > pre_fixup , 1 , & urls - > flags ) )
phishy | = DOMAIN_LISTED ;
else {
}
}
if ( ! ( phishy & DOMAIN_LISTED ) & &
! domainlist_match ( engine , host_url . displayLink . data , host_url . realLink . data , & urls - > pre_fixup , 1 , & urls - > flags ) ) {
return CL_PHISH_CLEAN ; /* domain not listed */
}
/* link type filtering must occur after last domainlist_match */
if ( urls - > link_type & LINKTYPE_IMAGE & & ! ( urls - > flags & CHECK_IMG_URL ) )
return CL_PHISH_HOST_NOT_LISTED ; /* its listed, but this link type is filtered */
if ( urls - > flags & DOMAINLIST_REQUIRED & & ! ( phishy & DOMAIN_LISTED ) ) {
urls - > flags & = urls - > always_check_flags ;
if ( ! urls - > flags ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HOST_NOT_LISTED ;
}
}
return CL_PHISH_CLEAN ; /* its listed, but this link type is filtered */
if ( urls - > flags & CHECK_CLOAKING ) {
/*Checks if URL is cloaked.
@ -1227,63 +1157,41 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
}
}
if ( urls - > displayLink . data [ 0 ] = = ' \0 ' ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_CLEAN ;
}
if ( urls - > flags & CHECK_SSL & & isSSL ( urls - > displayLink . data ) & & ! isSSL ( urls - > realLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_SSL_SPOOF ;
}
if ( ! urls - > flags & CHECK_CLOAKING & & urls - > flags & DOMAINLIST_REQUIRED & & ! ( phishy & DOMAIN_LISTED ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HOST_NOT_LISTED ;
}
if ( ( rc = url_get_host ( pchk , urls , & host_url , DOMAIN_REAL , & phishy ) ) )
{
free_if_needed ( & host_url ) ;
return rc ;
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( urls - > flags & DOMAINLIST_REQUIRED & & ! ( phishy & DOMAIN_LISTED ) ) {
if ( whitelist_check ( engine , & host_url , 1 ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HOST_NOT_LISTED ;
return CL_PHISH_CLEAN ;
}
if ( whitelist_check ( engine , & host_url , 1 ) ) {
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HOST_WHITELISTED ;
return CL_PHISH_CLEAN ;
}
if ( urls - > flags & HOST_SUFFICIENT ) {
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) ) {
{
struct url_check domain_url ;
url_check_init ( & domain_url ) ;
url_get_domain ( pchk , & host_url , & domain_url ) ;
if ( ! strcmp ( domain_url . realLink . data , domain_url . displayLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HOST_OK ;
}
if ( urls - > flags & DOMAIN_SUFFICIENT ) {
struct url_check domain_url ;
url_check_init ( & domain_url ) ;
url_get_domain ( pchk , & host_url , & domain_url ) ;
if ( ! strcmp ( domain_url . realLink . data , domain_url . displayLink . data ) ) {
free_if_needed ( & host_url ) ;
free_if_needed ( & domain_url ) ;
return CL_PHISH_DOMAIN_OK ;
}
free_if_needed ( & domain_url ) ;
return CL_PHISH_CLEAN ;
}
free_if_needed ( & domain_url ) ;
}
free_if_needed ( & host_url ) ;
} /*HOST_SUFFICIENT*/
free_if_needed ( & host_url ) ;
/*we failed to find a reason why the 2 URLs are different, this is definitely phishing*/
if ( urls - > flags & DOMAINLIST_REQUIRED & & ! ( phishy & DOMAIN_LISTED ) )
return CL_PHISH_HOST_NOT_LISTED ;
return phishy_map ( phishy , CL_PHISH_NOMATCH ) ;
}
@ -1292,28 +1200,6 @@ static const char* phishing_ret_toString(enum phish_status rc)
switch ( rc ) {
case CL_PHISH_CLEAN :
return " Clean " ;
case CL_PHISH_CLEANUP_OK :
return " URLs match after cleanup " ;
case CL_PHISH_WHITELISTED :
return " URL is whitelisted " ;
case CL_PHISH_HOST_WHITELISTED :
return " host part of URL is whitelist " ;
case CL_PHISH_HOST_OK :
return " Hosts match " ;
case CL_PHISH_DOMAIN_OK :
return " Domains match " ;
case CL_PHISH_REDIR_OK :
return " After redirecting realURL, they match " ;
case CL_PHISH_HOST_REDIR_OK :
return " After redirecting realURL, hosts match " ;
case CL_PHISH_DOMAIN_REDIR_OK :
return " After redirecting the domains match " ;
case CL_PHISH_MAILTO_OK :
return " URL is mailto " ;
case CL_PHISH_NUMERIC_IP :
return " IP address encountered in hostname " ;
case CL_PHISH_TEXTURL :
return " Displayed link is not an URL, can't check if phishing or not " ;
case CL_PHISH_CLOAKED_NULL :
return " Link URL is cloaked (null byte %00) " ;
case CL_PHISH_CLOAKED_UIU :
@ -1323,10 +1209,6 @@ static const char* phishing_ret_toString(enum phish_status rc)
return " Visible links is SSL, real link is not " ;
case CL_PHISH_NOMATCH :
return " URLs are way too different " ;
case CL_PHISH_HOST_NOT_LISTED :
return " Host not listed in .pdb -> not checked " ;
case CL_PHISH_CLEAN_CID :
return " Embedded image in mail -> clean " ;
case CL_PHISH_HEX_URL :
return " Embedded hex urls " ;
default :