/*
* Detect phishing , based on URL spoofing detection .
*
* Copyright ( C ) 2006 - 2007 T <EFBFBD> r <EFBFBD> k Edvin < edwin @ clamav . net >
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
*
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# ifndef CL_DEBUG
# define NDEBUG
# endif
# ifdef CL_THREAD_SAFE
# ifndef _REENTRANT
# define _REENTRANT
# endif
# endif
# include <stdio.h>
# include <string.h>
# include <ctype.h>
# include "clamav.h"
# include "others.h"
# include "mbox.h"
# include "message.h"
# include "htmlnorm.h"
# include "phishcheck.h"
# include "phish_domaincheck_db.h"
# include "phish_whitelist.h"
# include "iana_tld.h"
# define DOMAIN_REAL 1
# define DOMAIN_DISPLAY 0
# define PHISHY_USERNAME_IN_URL 1
# define PHISHY_NUMERIC_IP 2
# define REAL_IS_MAILTO 4
/* this is just a flag, so that the displayed url will be parsed as mailto too, for example
* < a href = ' mailto : somebody @ yahoo . com ' > to : somebody @ yahoo . com < / a > */
# define DOMAIN_LISTED 8
# define PHISHY_CLOAKED_NULL 16
# define PHISHY_HEX_URL 32
/*
* Phishing design documentation ,
( initially written at http : //wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
TODO : update this doc whenever behaviour changes
phishingCheck ( ) determines if @ displayedLink is a legit representation of @ realLink .
Steps :
1. if _realLink_ = = _displayLink_ = > CLEAN
2. url cleanup ( normalization )
- whitespace elimination
strip all spaces , and leading and trailing garbage .
When matching we have to keep in account whether we stripped any spaces or not .
See str_fixup_spaces .
- html entity conversion
- handle hex - encoded characters
- convert hostname to lowercase
- normalize \ to /
3. Matched the urls against a _whitelist_ :
a _realLink_ , _displayedLink_ pair is matched against the _whitelist_ .
the _whitelist_ is a list of pairs of realLink , displayedLink . Any of the elements of those pairs can be a _regex_ .
if url * is found * in _whitelist_ - - > * CLEAN *
4. URL is looked up in the _domainlist_
The _domainlist_ is a list of pairs of realLink , displayedLink ( any of which can be regex ) .
This is the list of domains we do phishing detection for ( such as ebay , paypal , chase , . . . . )
We can ' t decide to stop processing here or not , so we just set a flag .
Note ( * ! * ) : the flags are modified by the the domainlist checker . If domain is found , then the flags associated with it filter the default compile - time flags .
5. _Hostname_ is extracted from the _displayed URL_ .
It is checked against the _whitelist_ , and _domainlist_ .
6. Now we know if we want to stop processing .
If we are only scanning domains in the _domainlist_ ( default behaviour ) , and the url / domain
isn ' t found in it , we return ( and mark url as not_list / clean ) .
If we scan all domains , then the domainlist isn ' t even checked .
7. URL cloak check .
check for % 00 , and hex - encoded IPs in URL .
8. Skip empty displayedURLs
9. SSL mismatch detection .
Checks if realLink is http , but displayedLink is https or viceversa .
( by default the SSL detection is done for hrefs only , not for imgs )
10. Hostname of real URL is extracted .
12. Numeric IP detection .
If url is a numeric IP , then - > phish .
Maybe we should do DNS lookup ?
13. isURL ( displayedLink ) .
Checks if displayedLink is really a url .
if not - > clean
14. Hostnames of real , displayedLink are compared . If equal - > clean
15. Extract domain names , and compare . If equal - > clean
16. Do DNS lookups / reverse lookups . Disabled now ( too much load / too many lookups ) . *
For the Whitelist ( . wdb ) / Domainlist ( . pdb ) format see regex_list . c ( search for Flags )
*
*/
/* Constant strings and tables */
static char empty_string [ ] = " " ;
# define ANY_CLOAK "(0[xX][0-9a-fA-F]+|[0-9]+)"
# define CLOAKED_URL "^"ANY_CLOAK"(\\."ANY_CLOAK"){0,3}$"
static const char cloaked_host_regex [ ] = CLOAKED_URL ;
static const char tld_regex [ ] = " ^ " iana_tld " $ " ;
static const char cctld_regex [ ] = " ^ " iana_cctld " $ " ;
static const char dotnet [ ] = " .net " ;
static const char adonet [ ] = " ado.net " ;
static const char aspnet [ ] = " asp.net " ;
/* ; is replaced by ' ' so omit it here*/
static const char lt [ ] = " < " ;
static const char gt [ ] = " > " ;
static const char src_text [ ] = " src " ;
static const char href_text [ ] = " href " ;
static const char mailto [ ] = " mailto: " ;
static const char https [ ] = " https:// " ;
static const size_t href_text_len = sizeof ( href_text ) ;
static const size_t src_text_len = sizeof ( src_text ) ;
static const size_t dotnet_len = sizeof ( dotnet ) - 1 ;
static const size_t adonet_len = sizeof ( adonet ) - 1 ;
static const size_t aspnet_len = sizeof ( aspnet ) - 1 ;
static const size_t lt_len = sizeof ( lt ) - 1 ;
static const size_t gt_len = sizeof ( gt ) - 1 ;
static const size_t mailto_len = sizeof ( mailto ) - 1 ;
static const size_t https_len = sizeof ( https ) - 1 ;
/* for urls, including mailto: urls, and (broken) http:www... style urls*/
/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html
* Modifications : don ' t allow empty domains / subdomains , such as www . . com < - that is no url
* So the ' safe ' char class has been split up
* */
/* character classes */
# define URI_alpha "a-zA-Z"
# define URI_digit "0-9"
# define URI_safe_nodot "-$_@&"
# define URI_safe "-$_@.&"
# define URI_extra "!*\"'(),"
# define URI_reserved "=; / #?: "
# define URI_national "{}|[]\\^~"
# define URI_punctuation "<>"
# define URI_hex "[0-9a-fA-f]"
# define URI_escape "%"URI_hex"{2}"
# define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
# define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
# define URI_xalphas URI_xalpha"+"
# define URI_xalphas_nodot URI_xalpha_nodot"*"
# define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot""
# define URI_xpalpha URI_xalpha"|\\+"
# define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
# define URI_xpalphas "("URI_xpalpha")+"
# define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
# define optional_URI_xpalphas "("URI_xpalpha"|=)*"
# define URI_scheme URI_ialpha
# define URI_tld iana_tld
# define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
# define URI_path2 URI_tld
# define URI_path3 "( / "optional_URI_xpalphas")*"
# define URI_search "("URI_xalphas")*"
# define URI_fragmentid URI_xalphas
# define URI_IP_digits "["URI_digit"]{1,3}"
# define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?( / ("URI_xpalphas" / ?)*)?"
# define URI_numeric_URI "("URI_scheme":( //)?)?"URI_numeric_path"(\\?" URI_search")?"
# define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
# define URI_URI1 "("URI_scheme":( //)?)?"URI_path1
# define URI_URI2 URI_path2
# define URI_URI3 URI_path3"(\\?" URI_search")?"
# define URI_fragmentaddress1 URI_URI1
# define URI_fragmentaddress2 URI_URI2
# define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
# define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto): //.+"
/*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please.
* there is fragmentaddress1 , and 2 to work around the ISO limitation of 509 bytes max length for string constants */
static const char numeric_url_regex [ ] = " ^ * " URI_numeric_fragmentaddress " *$ " ;
/* generated by contrib/phishing/generate_tables.c */
static const short int hextable [ 256 ] = {
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x1 , 0x2 , 0x3 , 0x4 , 0x5 , 0x6 , 0x7 , 0x8 , 0x9 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0xa , 0xb , 0xc , 0xd , 0xe , 0xf , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0xa , 0xb , 0xc , 0xd , 0xe , 0xf , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 ,
0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0 , 0x0
} ;
/* Prototypes*/
static void string_init_c ( struct string * dest , char * data ) ;
static void string_assign_null ( struct string * dest ) ;
static char * rfind ( char * start , char c , size_t len ) ;
static char hex2int ( const unsigned char * src ) ;
static int isTLD ( const struct phishcheck * pchk , const char * str , int len ) ;
static enum phish_status phishingCheck ( const struct cl_engine * engine , struct url_check * urls ) ;
static const char * phishing_ret_toString ( enum phish_status rc ) ;
static void url_check_init ( struct url_check * urls )
{
string_init_c ( & urls - > realLink , NULL ) ;
string_init_c ( & urls - > displayLink , NULL ) ;
string_init_c ( & urls - > pre_fixup . pre_displayLink , NULL ) ;
}
/* string reference counting implementation,
* so that : we don ' t have to keep in mind who allocated what , and when needs to be freed ,
* and thus we won ' t leak memory */
static void string_free ( struct string * str )
{
for ( ; ; ) {
str - > refcount - - ;
if ( ! str - > refcount ) {
if ( str - > ref ) /* don't free, this is a portion of another string */
str = str - > ref ; /* try to free that one*/
else {
if ( str - > data )
free ( str - > data ) ;
break ;
}
}
else break ;
}
}
/* always use the string_assign when assigning to a string, this makes sure the old one's reference count is incremented*/
static void string_assign ( struct string * dest , struct string * src )
{
string_free ( dest ) ;
src - > refcount + + ;
dest - > data = src - > data ;
dest - > refcount = 1 ;
dest - > ref = src ;
}
/* data will be freed when string freed */
/* it doesn't free old string, use only for initialization
* Doesn ' t allow NULL pointers , they are replaced by pointer to empty string
* */
static void string_init_c ( struct string * dest , char * data )
{
dest - > refcount = data ? 1 : 0 ;
dest - > data = data ? data : empty_string ;
dest - > ref = NULL ;
}
/* make a copy of the string between start -> end*/
static int string_assign_dup ( struct string * dest , const char * start , const char * end )
{
char * ret = cli_malloc ( end - start + 1 ) ;
if ( ! ret )
return CL_EMEM ;
strncpy ( ret , start , end - start ) ;
ret [ end - start ] = ' \0 ' ;
string_free ( dest ) ;
dest - > data = ret ;
dest - > refcount = 1 ;
dest - > ref = NULL ;
return CL_SUCCESS ;
}
static void string_assign_null ( struct string * dest )
{
if ( dest ) {
string_free ( dest ) ;
dest - > data = empty_string ;
dest - > refcount = - 1 ; /* don't free it! */
dest - > ref = NULL ;
}
}
/* this string uses portion of another string*/
static void string_assign_ref ( struct string * dest , struct string * ref , char * data )
{
string_free ( dest ) ;
ref - > refcount + + ;
dest - > data = data ;
dest - > refcount = 1 ;
dest - > ref = ref ;
}
static void free_if_needed ( struct url_check * url )
{
string_free ( & url - > realLink ) ;
string_free ( & url - > displayLink ) ;
string_free ( & url - > pre_fixup . pre_displayLink ) ;
}
static int build_regex ( regex_t * preg , const char * regex , int nosub )
{
int rc ;
cli_dbgmsg ( " Phishcheck: Compiling regex: %s \n " , regex ) ;
rc = cli_regcomp ( preg , regex , REG_EXTENDED | REG_ICASE | ( nosub ? REG_NOSUB : 0 ) ) ;
if ( rc ) {
# ifdef C_WINDOWS
cli_errmsg ( " Phishcheck: Error in compiling regex, disabling phishing checks \n " ) ;
# else
size_t buflen = cli_regerror ( rc , preg , NULL , 0 ) ;
char * errbuf = cli_malloc ( buflen ) ;
if ( errbuf ) {
cli_regerror ( rc , preg , errbuf , buflen ) ;
cli_errmsg ( " Phishcheck: Error in compiling regex:%s \n Disabling phishing checks \n " , errbuf ) ;
free ( errbuf ) ;
} else
cli_errmsg ( " Phishcheck: Error in compiling regex, disabling phishing checks. Additionally an Out-of-memory error was encountered while generating a detailed error message \n " ) ;
# endif
return 1 ;
}
return CL_SUCCESS ;
}
/* allocates memory */
static int get_host ( const struct phishcheck * s , const char * URL , int isReal , int * phishy , const char * * hstart , const char * * hend )
{
int rc , ismailto = 0 ;
const char * start ;
const char * end = NULL ;
if ( ! URL ) {
* hstart = * hend = NULL ;
return 0 ;
}
start = strstr ( URL , " :// " ) ;
if ( ! start ) {
if ( ! strncmp ( URL , mailto , mailto_len ) ) {
start = URL + mailto_len ;
ismailto = 1 ;
}
else if ( ! isReal & & * phishy & REAL_IS_MAILTO ) {
/* it is not required to use mailto: in the displayed url, they might use to:, or whatever */
end = URL + strlen ( URL ) + 1 ;
start = URL + strcspn ( URL , " : " ) + 1 ;
if ( start = = end )
start = URL ;
ismailto = 1 ;
}
else {
start = URL ; /*URL without protocol*/
if ( isReal )
cli_dbgmsg ( " Phishcheck: Real URL without protocol: %s \n " , URL ) ;
else ismailto = 2 ; /*no-protocol, might be mailto, @ is no problem*/
}
}
else
start + = 3 ; /* :// */
if ( ! ismailto | | ! isReal ) {
const char * realhost , * tld ;
do {
end = start + strcspn ( start , " :/? " ) ;
realhost = strchr ( start , ' @ ' ) ;
if ( realhost = = NULL | | ( start ! = end & & realhost > end ) ) {
/*don't check beyond end of hostname*/
break ;
}
tld = strrchr ( realhost , ' . ' ) ;
rc = tld ? isTLD ( s , tld , tld - realhost - 1 ) : 0 ;
if ( rc < 0 )
return rc ;
if ( rc )
* phishy | = PHISHY_USERNAME_IN_URL ; /* if the url contains a username that is there just to fool people,
like http : //www.ebay.com@somevilplace.someevildomain.com/ */
start = realhost + 1 ; /*skip the username*/
} while ( realhost ) ; /*skip over multiple @ characters, text following last @ character is the real host*/
}
else if ( ismailto & & isReal )
* phishy | = REAL_IS_MAILTO ;
if ( ! end ) {
end = start + strcspn ( start , " :/? " ) ; /*especially important for mailto:somebody@yahoo.com?subject=...*/
if ( ! end )
end = start + strlen ( start ) ;
}
* hstart = start ;
* hend = end ;
return 0 ;
}
static int isCountryCode ( const struct phishcheck * s , const char * str )
{
return str ? ! cli_regexec ( & s - > preg_cctld , str , 0 , NULL , 0 ) : 0 ;
}
static int isTLD ( const struct phishcheck * pchk , const char * str , int len )
{
if ( ! str )
return 0 ;
else {
char * s = cli_malloc ( len + 1 ) ;
int rc ;
if ( ! s )
return CL_EMEM ;
strncpy ( s , str , len ) ;
s [ len ] = ' \0 ' ;
rc = ! cli_regexec ( & pchk - > preg_tld , s , 0 , NULL , 0 ) ;
free ( s ) ;
return rc ? 1 : 0 ;
}
}
/*
* memrchr isn ' t standard , so I use this
*/
static char *
rfind ( char * start , char c , size_t len )
{
char * p ;
if ( start = = NULL )
return NULL ;
for ( p = start + len ; ( p > = start ) & & ( * p ! = c ) ; p - - )
;
return ( p < start ) ? NULL : p ;
}
static void get_domain ( const struct phishcheck * pchk , struct string * dest , struct string * host )
{
char * domain ;
char * tld = strrchr ( host - > data , ' . ' ) ;
if ( ! tld ) {
cli_dbgmsg ( " Phishcheck: Encountered a host without a tld? (%s) \n " , host - > data ) ;
string_assign ( dest , host ) ;
return ;
}
if ( isCountryCode ( pchk , tld + 1 ) ) {
const char * countrycode = tld + 1 ;
tld = rfind ( host - > data , ' . ' , tld - host - > data - 1 ) ;
if ( ! tld ) {
cli_dbgmsg ( " Phishcheck: Weird, a name with only 2 levels (%s) \n " ,
host - > data ) ;
string_assign ( dest , host ) ;
return ;
}
if ( ! isTLD ( pchk , tld + 1 , countrycode - tld - 1 ) ) {
string_assign_ref ( dest , host , tld + 1 ) ;
return ; /*it was a name like: subdomain.domain.uk, return domain.uk*/
}
}
/*we need to strip one more level, this is the actual domain*/
domain = rfind ( host - > data , ' . ' , tld - host - > data - 1 ) ;
if ( ! domain ) {
string_assign ( dest , host ) ;
return ; /* it was like sourceforge.net?*/
}
string_assign_ref ( dest , host , domain + 1 ) ;
}
static int isNumeric ( const char * host )
{
int len = strlen ( host ) ;
int a , b , c , d , n = 0 ;
/* 1.2.3.4 -> 7*/
/* 127.127.127.127 -> 15*/
if ( len < 7 | | len > 15 )
return 0 ;
sscanf ( host , " %d.%d.%d.%d%n " , & a , & b , & c , & d , & n ) ;
if ( n = = len )
if ( a > = 0 & & a < = 256 & & b > = 0 & & b < = 256 & & c > = 0 & & c < = 256 & & d > = 0 & & d < = 256 )
return 1 ;
return 0 ;
}
static int isSSL ( const char * URL )
{
return URL ? ! strncmp ( https , URL , https_len ) : 0 ;
}
/* deletes @what from the string @begin.
* @ what_len : length of @ what , excluding the terminating \ 0 */
static void
str_hex_to_char ( char * * begin , const char * * end )
{
char * sbegin = * begin ;
const char * str_end = * end ;
if ( str_end < = sbegin )
return ;
if ( strlen ( sbegin ) < = 2 )
return ;
/* convert leading %xx*/
if ( sbegin [ 0 ] = = ' % ' ) {
sbegin [ 2 ] = hex2int ( ( unsigned char * ) sbegin + 1 ) ;
sbegin + = 2 ;
}
* begin = sbegin + + ;
while ( sbegin + 3 < str_end ) {
while ( sbegin + 3 < str_end & & sbegin [ 0 ] = = ' % ' ) {
const char * src = sbegin + 3 ;
* sbegin = hex2int ( ( unsigned char * ) sbegin + 1 ) ;
/* move string */
memmove ( sbegin + 1 , src , str_end - src + 1 ) ;
str_end - = 2 ;
}
sbegin + + ;
}
* end = str_end ;
}
/*
* deletes @ what from the string @ begin .
* @ what_len : length of @ what , excluding the terminating \ 0
*/
static void
str_strip ( char * * begin , const char * * end , const char * what , size_t what_len )
{
char * sbegin = * begin ;
const char * str_end = * end ;
const char * str_end_what ;
size_t cmp_len = what_len ;
if ( begin = = NULL | | str_end < = sbegin )
return ;
/*if(str_end < (sbegin + what_len))
return ; */
if ( strlen ( sbegin ) < what_len )
return ;
/* strip leading @what */
while ( cmp_len & & ! strncmp ( sbegin , what , cmp_len ) ) {
sbegin + = what_len ;
if ( cmp_len > what_len )
cmp_len - = what_len ;
else
cmp_len = 0 ;
}
/* strip trailing @what */
if ( what_len < = ( size_t ) ( str_end - sbegin ) ) {
str_end_what = str_end - what_len + 1 ;
while ( ( str_end_what > sbegin ) & &
( strncmp ( str_end_what , what , what_len ) = = 0 ) ) {
str_end - = what_len ;
str_end_what - = what_len ;
}
}
* begin = sbegin + + ;
while ( sbegin + what_len < = str_end ) {
while ( sbegin + what_len < = str_end & & ! strncmp ( sbegin , what , what_len ) ) {
const char * src = sbegin + what_len ;
/* move string */
memmove ( sbegin , src , str_end - src + 1 ) ;
str_end - = what_len ;
}
sbegin + + ;
}
* end = str_end ;
}
/* replace every occurrence of @c in @str with @r*/
static void str_replace ( char * str , const char * end , char c , char r )
{
for ( ; str < = end ; str + + ) {
if ( * str = = c )
* str = r ;
}
}
static void str_make_lowercase ( char * str , size_t len )
{
for ( ; len ; str + + , len - - ) {
* str = tolower ( * str ) ;
}
}
# define fix32(x) ((x)<32 ? 32 : (x))
static void clear_msb ( char * begin )
{
for ( ; * begin ; begin + + )
* begin = fix32 ( ( * begin ) & 0x7f ) ;
}
/*
* Particularly yahoo puts links like this in mails :
* http : / / mail . yahoo . com
* So first step : delete space between / /
*
* Next there could be possible links like this :
* < a href = " phishlink " > w w w . e b a y . c o m < / a >
* Here we need to strip spaces to get this picked up .
*
* Next there are links like :
* < a href = " www.yahoo.com " > Check out yahoo . com < / a >
* Here we add a . , so we get : check . out . yahoo . com ( it won ' t trigger )
*
* Old Rule for adding . : if substring from right contains dot , then add dot ,
* otherwise strip space
* New Rule : strip all spaces
* strip leading and trailing garbage
*
*/
static void
str_fixup_spaces ( char * * begin , const char * * end )
{
char * sbegin = * begin ;
const char * send = * end ;
if ( ! sbegin | | ! send | | send < sbegin )
return ;
/* strip spaces */
str_strip ( & sbegin , & send , " " , 1 ) ;
/* strip leading/trailing garbage */
while ( ! isalnum ( sbegin [ 0 ] ) & & sbegin < = send ) sbegin + + ;
while ( ! isalnum ( send [ 0 ] ) & & send > = sbegin ) send - - ;
* begin = sbegin ;
* end = send ;
}
/* allocates memory */
static int
cleanupURL ( struct string * URL , struct string * pre_URL , int isReal )
{
char * begin = URL - > data ;
const char * end ;
size_t len ;
clear_msb ( begin ) ;
/*if(begin == NULL)
return ; */
/*TODO: handle hex-encoded IPs*/
while ( isspace ( * begin ) )
begin + + ;
len = strlen ( begin ) ;
if ( len = = 0 ) {
string_assign_null ( URL ) ;
string_assign_null ( pre_URL ) ;
return 0 ;
}
end = begin + len - 1 ;
/*cli_dbgmsg("%d %d\n", end-begin, len);*/
if ( begin > = end ) {
string_assign_null ( URL ) ;
string_assign_null ( pre_URL ) ;
return 0 ;
}
while ( isspace ( * end ) )
end - - ;
/*TODO: convert \ to /, and stuff like that*/
/* From mailscanner, my comments enclosed in {} */
if ( ! strncmp ( begin , dotnet , dotnet_len ) | | ! strncmp ( begin , adonet , adonet_len ) | | ! strncmp ( begin , aspnet , aspnet_len ) ) {
string_assign_null ( URL ) ;
string_assign_null ( pre_URL ) ;
}
else {
size_t host_len ;
char * host_begin ;
int rc ;
str_replace ( begin , end , ' \\ ' , ' / ' ) ;
/* some broken MUAs put > in the href, and then
* we get a false positive , so remove them */
str_replace ( begin , end , ' < ' , ' ' ) ;
str_replace ( begin , end , ' > ' , ' ' ) ;
str_replace ( begin , end , ' \" ' , ' ' ) ;
str_replace ( begin , end , ' ; ' , ' ' ) ;
str_strip ( & begin , & end , lt , lt_len ) ;
str_strip ( & begin , & end , gt , gt_len ) ;
/* convert hostname to lowercase, but only hostname! */
host_begin = strchr ( begin , ' : ' ) ;
while ( host_begin & & host_begin [ 1 ] = = ' / ' ) host_begin + + ;
if ( ! host_begin ) host_begin = begin ;
else host_begin + + ;
host_len = strcspn ( host_begin , " /? " ) ;
str_make_lowercase ( host_begin , host_len ) ;
/* convert %xx to real value */
str_hex_to_char ( & begin , & end ) ;
if ( isReal ) {
/* htmlnorm converts \n to space, so we have to strip spaces */
str_strip ( & begin , & end , " " , 1 ) ;
}
else {
/* trim space */
while ( ( begin < = end ) & & ( begin [ 0 ] = = ' ' ) ) begin + + ;
while ( ( begin < = end ) & & ( end [ 0 ] = = ' ' ) ) end - - ;
}
if ( ( rc = string_assign_dup ( isReal ? URL : pre_URL , begin , end + 1 ) ) ) {
string_assign_null ( URL ) ;
return rc ;
}
if ( ! isReal ) {
str_fixup_spaces ( & begin , & end ) ;
if ( ( rc = string_assign_dup ( URL , begin , end + 1 ) ) ) {
return rc ;
}
}
/*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/
}
return 0 ;
}
/* -------end runtime disable---------*/
static int found_possibly_unwanted ( cli_ctx * ctx )
{
ctx - > found_possibly_unwanted = 1 ;
cli_dbgmsg ( " Phishcheck: found Possibly Unwanted: %s \n " , * ctx - > virname ) ;
return CL_CLEAN ;
}
int phishingScan ( message * m , const char * dir , cli_ctx * ctx , tag_arguments_t * hrefs )
{
int i ;
struct phishcheck * pchk = ( struct phishcheck * ) ctx - > engine - > phishcheck ;
/* check for status of whitelist fatal error, etc. */
if ( ! pchk | | pchk - > is_disabled )
return CL_CLEAN ;
if ( ! ctx - > found_possibly_unwanted )
* ctx - > virname = NULL ;
for ( i = 0 ; i < hrefs - > count ; i + + )
if ( hrefs - > contents [ i ] ) {
struct url_check urls ;
enum phish_status rc ;
urls . flags = strncmp ( ( char * ) hrefs - > tag [ i ] , href_text , href_text_len ) ? ( CL_PHISH_ALL_CHECKS & ~ CHECK_SSL ) : CL_PHISH_ALL_CHECKS ;
urls . link_type = 0 ;
if ( ! strncmp ( ( char * ) hrefs - > tag [ i ] , src_text , src_text_len ) ) {
if ( ! ( urls . flags & CHECK_IMG_URL ) )
continue ;
urls . link_type | = LINKTYPE_IMAGE ;
}
if ( ctx - > options & CL_SCAN_PHISHING_BLOCKSSL ) {
urls . always_check_flags | = CHECK_SSL ;
}
if ( ctx - > options & CL_SCAN_PHISHING_BLOCKCLOAK ) {
urls . always_check_flags | = CHECK_CLOAKING ;
}
string_init_c ( & urls . realLink , ( char * ) hrefs - > value [ i ] ) ;
string_init_c ( & urls . displayLink , ( char * ) blobGetData ( hrefs - > contents [ i ] ) ) ;
string_init_c ( & urls . pre_fixup . pre_displayLink , NULL ) ;
if ( urls . displayLink . data [ blobGetDataSize ( hrefs - > contents [ i ] ) - 1 ] ) {
cli_warnmsg ( " urls.displayLink.data[...] " ) ;
return CL_CLEAN ;
}
urls . realLink . refcount = - 1 ;
urls . displayLink . refcount = - 1 ; /*don't free these, caller will free*/
if ( strcmp ( ( char * ) hrefs - > tag [ i ] , " href " ) ) {
char * url ;
url = urls . realLink . data ;
urls . realLink . data = urls . displayLink . data ;
urls . displayLink . data = url ;
}
rc = phishingCheck ( ctx - > engine , & urls ) ;
if ( pchk - > is_disabled )
return CL_CLEAN ;
free_if_needed ( & urls ) ;
cli_dbgmsg ( " Phishcheck: Phishing scan result: %s \n " , phishing_ret_toString ( rc ) ) ;
switch ( rc ) /*TODO: support flags from ctx->options,*/
{
case CL_PHISH_CLEAN :
continue ;
/* break;*/
case CL_PHISH_HEX_URL :
* ctx - > virname = " Phishing.Heuristics.Email.HexURL " ;
return found_possibly_unwanted ( ctx ) ;
/* break;*/
case CL_PHISH_NUMERIC_IP :
* ctx - > virname = " Phishing.Heuristics.Email.Cloaked.NumericIP " ;
return found_possibly_unwanted ( ctx ) ;
case CL_PHISH_CLOAKED_NULL :
* ctx - > virname = " Phishing.Heuristics.Email.Cloaked.Null " ; /*http://www.real.com%01%00@www.evil.com*/
return found_possibly_unwanted ( ctx ) ;
case CL_PHISH_SSL_SPOOF :
* ctx - > virname = " Phishing.Heuristics.Email.SSL-Spoof " ;
return found_possibly_unwanted ( ctx ) ;
case CL_PHISH_CLOAKED_UIU :
* ctx - > virname = " Phishing.Heuristics.Email.Cloaked.Username " ; /*http://www.ebay.com@www.evil.com*/
return found_possibly_unwanted ( ctx ) ;
case CL_PHISH_NOMATCH :
default :
* ctx - > virname = " Phishing.Heuristics.Email.SpoofedDomain " ;
return found_possibly_unwanted ( ctx ) ;
}
}
else
if ( strcmp ( ( char * ) hrefs - > tag [ i ] , " href " ) )
cli_dbgmsg ( " Phishcheck: href with no contents? \n " ) ;
return CL_CLEAN ;
}
static char * str_compose ( const char * a , const char * b , const char * c )
{
const size_t a_len = strlen ( a ) ;
const size_t b_len = strlen ( b ) ;
const size_t c_len = strlen ( c ) ;
const size_t r_len = a_len + b_len + c_len + 1 ;
char * concated = cli_malloc ( r_len ) ;
if ( ! concated )
return NULL ;
strncpy ( concated , a , a_len ) ;
strncpy ( concated + a_len , b , b_len ) ;
strncpy ( concated + a_len + b_len , c , c_len ) ;
concated [ r_len - 1 ] = ' \0 ' ;
return concated ;
}
static char hex2int ( const unsigned char * src )
{
return ( src [ 0 ] = = ' 0 ' & & src [ 1 ] = = ' 0 ' ) ?
0x1 : /* don't convert %00 to \0, use 0x1
* this value is also used by cloak check */
hextable [ src [ 0 ] ] < < 4 | hextable [ src [ 1 ] ] ;
}
static void free_regex ( regex_t * p )
{
if ( p ) {
cli_regfree ( p ) ;
}
}
int phishing_init ( struct cl_engine * engine )
{
char * url_regex , * realurl_regex ;
struct phishcheck * pchk ;
if ( ! engine - > phishcheck ) {
pchk = engine - > phishcheck = cli_malloc ( sizeof ( struct phishcheck ) ) ;
if ( ! pchk )
return CL_EMEM ;
pchk - > is_disabled = 1 ;
}
else {
pchk = engine - > phishcheck ;
if ( ! pchk )
return CL_ENULLARG ;
if ( ! pchk - > is_disabled ) {
/* already initialized */
return CL_SUCCESS ;
}
}
cli_dbgmsg ( " Initializing phishcheck module \n " ) ;
if ( build_regex ( & pchk - > preg_hexurl , cloaked_host_regex , 1 ) ) {
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
if ( build_regex ( & pchk - > preg_cctld , cctld_regex , 1 ) ) {
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
if ( build_regex ( & pchk - > preg_tld , tld_regex , 1 ) ) {
free_regex ( & pchk - > preg_cctld ) ;
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
url_regex = str_compose ( " ^ *(( " URI_CHECK_PROTOCOLS " )|( " URI_fragmentaddress1 , URI_fragmentaddress2 , URI_fragmentaddress3 " )) *$ " ) ;
if ( build_regex ( & pchk - > preg , url_regex , 1 ) ) {
free_regex ( & pchk - > preg_cctld ) ;
free_regex ( & pchk - > preg_tld ) ;
free ( url_regex ) ;
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
free ( url_regex ) ;
realurl_regex = str_compose ( " ^ *(( " URI_CHECK_PROTOCOLS " )|( " URI_path1 , URI_fragmentaddress2 , URI_fragmentaddress3 " )) *$ " ) ;
if ( build_regex ( & pchk - > preg_realurl , realurl_regex , 1 ) ) {
free_regex ( & pchk - > preg_cctld ) ;
free_regex ( & pchk - > preg_tld ) ;
free_regex ( & pchk - > preg ) ;
free ( url_regex ) ;
free ( realurl_regex ) ;
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
free ( realurl_regex ) ;
if ( build_regex ( & pchk - > preg_numeric , numeric_url_regex , 1 ) ) {
free_regex ( & pchk - > preg_cctld ) ;
free_regex ( & pchk - > preg_tld ) ;
free_regex ( & pchk - > preg ) ;
free_regex ( & pchk - > preg_realurl ) ;
free ( pchk ) ;
engine - > phishcheck = NULL ;
return CL_EFORMAT ;
}
pchk - > is_disabled = 0 ;
cli_dbgmsg ( " Phishcheck module initialized \n " ) ;
return CL_SUCCESS ;
}
void phishing_done ( struct cl_engine * engine )
{
struct phishcheck * pchk = engine - > phishcheck ;
cli_dbgmsg ( " Cleaning up phishcheck \n " ) ;
if ( pchk & & ! pchk - > is_disabled ) {
free_regex ( & pchk - > preg ) ;
free_regex ( & pchk - > preg_hexurl ) ;
free_regex ( & pchk - > preg_cctld ) ;
free_regex ( & pchk - > preg_tld ) ;
free_regex ( & pchk - > preg_numeric ) ;
free_regex ( & pchk - > preg_realurl ) ;
pchk - > is_disabled = 1 ;
}
whitelist_done ( engine ) ;
domainlist_done ( engine ) ;
if ( pchk ) {
cli_dbgmsg ( " Freeing phishcheck struct \n " ) ;
free ( pchk ) ;
engine - > phishcheck = NULL ;
}
cli_dbgmsg ( " Phishcheck cleaned up \n " ) ;
}
/*
* Only those URLs are identified as URLs for which phishing detection can be performed .
*/
static int isURL ( const struct phishcheck * pchk , const char * URL )
{
return URL ? ! cli_regexec ( & pchk - > preg , URL , 0 , NULL , 0 ) : 0 ;
}
/*
* Check if this is a real URL , which basically means to check if it has a known URL scheme ( http , https , ftp ) .
* This prevents false positives with outbind : // and blocked:: links.
*/
static int isRealURL ( const struct phishcheck * pchk , const char * URL )
{
return URL ? ! cli_regexec ( & pchk - > preg_realurl , URL , 0 , NULL , 0 ) : 0 ;
}
static int isNumericURL ( const struct phishcheck * pchk , const char * URL )
{
return URL ? ! cli_regexec ( & pchk - > preg_numeric , URL , 0 , NULL , 0 ) : 0 ;
}
/* Cleans up @urls
* If URLs are identical after cleanup it will return CL_PHISH_CLEANUP_OK .
* */
static enum phish_status cleanupURLs ( struct url_check * urls )
{
if ( urls - > flags & CLEANUP_URL ) {
cleanupURL ( & urls - > realLink , NULL , 1 ) ;
cleanupURL ( & urls - > displayLink , & urls - > pre_fixup . pre_displayLink , 0 ) ;
if ( ! urls - > displayLink . data | | ! urls - > realLink . data )
return CL_PHISH_NODECISION ;
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) )
return CL_PHISH_CLEAN ;
}
return CL_PHISH_NODECISION ;
}
static int url_get_host ( const struct phishcheck * pchk , struct url_check * url , struct url_check * host_url , int isReal , int * phishy )
{
const char * start , * end ;
struct string * host = isReal ? & host_url - > realLink : & host_url - > displayLink ;
const char * URL = isReal ? url - > realLink . data : url - > displayLink . data ;
int rc ;
if ( ( rc = get_host ( pchk , URL , isReal , phishy , & start , & end ) ) ) {
return rc ;
}
if ( ! start | | ! end ) {
string_assign_null ( host ) ;
}
else if ( ( rc = string_assign_dup ( host , start , end ) ) ) {
return rc ;
}
cli_dbgmsg ( " Phishcheck:host:%s \n " , host - > data ) ;
if ( ! host - > data | | ( isReal & & host - > data [ 0 ] = = ' \0 ' ) | | * phishy & REAL_IS_MAILTO | | strchr ( host - > data , ' ' ) ) {
/* no host,
* link without domain , such as : href = " /isapi.dll?...
* mailto :
* spaces in hostname
*/
return CL_PHISH_CLEAN ;
}
if ( url - > flags & CHECK_CLOAKING & & ! cli_regexec ( & pchk - > preg_hexurl , host - > data , 0 , NULL , 0 ) ) {
/* uses a regex here, so that we don't accidentally block 0xacab.net style hosts */
return CL_PHISH_HEX_URL ;
}
if ( isNumeric ( host - > data ) ) {
* phishy | = PHISHY_NUMERIC_IP ;
}
if ( ! isReal ) {
url - > pre_fixup . host_start = start - URL ;
url - > pre_fixup . host_end = end - URL ;
}
return CL_PHISH_NODECISION ;
}
static void url_get_domain ( const struct phishcheck * pchk , struct url_check * url , struct url_check * domains )
{
get_domain ( pchk , & domains - > realLink , & url - > realLink ) ;
get_domain ( pchk , & domains - > displayLink , & url - > displayLink ) ;
domains - > flags = url - > flags ;
}
static enum phish_status phishy_map ( int phishy , enum phish_status fallback )
{
if ( phishy & PHISHY_USERNAME_IN_URL )
return CL_PHISH_CLOAKED_UIU ;
else if ( phishy & PHISHY_NUMERIC_IP )
return CL_PHISH_NUMERIC_IP ;
else
return fallback ;
}
static int isEncoded ( const char * url )
{
const char * start = url ;
size_t cnt = 0 ;
do {
cnt + + ;
start = strstr ( start , " &# " ) ;
if ( start )
start = strstr ( start , " ; " ) ;
} while ( start ) ;
return ( cnt - 1 > strlen ( url ) * 7 / 10 ) ; /*more than 70% made up of &#;*/
}
static int whitelist_check ( const struct cl_engine * engine , struct url_check * urls , int hostOnly )
{
return whitelist_match ( engine , urls - > realLink . data , urls - > displayLink . data , hostOnly ) ;
}
/* urls can't contain null pointer, caller must ensure this */
static enum phish_status phishingCheck ( const struct cl_engine * engine , struct url_check * urls )
{
struct url_check host_url ;
int rc = CL_PHISH_NODECISION ;
int phishy = 0 ;
const struct phishcheck * pchk = ( const struct phishcheck * ) engine - > phishcheck ;
if ( ! urls - > realLink . data | | urls - > displayLink . data [ 0 ] = = ' \0 ' )
return CL_PHISH_CLEAN ;
cli_dbgmsg ( " Phishcheck:Checking url %s->%s \n " , urls - > realLink . data ,
urls - > displayLink . data ) ;
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) )
return CL_PHISH_CLEAN ; /* displayed and real URL are identical -> clean */
if ( ( rc = cleanupURLs ( urls ) ) ) {
/* it can only return an error, or say its clean;
* it is not allowed to decide it is phishing */
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( whitelist_check ( engine , urls , 0 ) )
return CL_PHISH_CLEAN ; /* if url is whitelisted don't perform further checks */
if ( ( ! isURL ( pchk , urls - > displayLink . data ) | | ! isRealURL ( pchk , urls - > realLink . data ) ) & &
( ( phishy & PHISHY_NUMERIC_IP & & ! isNumericURL ( pchk , urls - > displayLink . data ) ) | |
! ( phishy & PHISHY_NUMERIC_IP ) ) ) {
cli_dbgmsg ( " Displayed 'url' is not url:%s \n " , urls - > displayLink . data ) ;
return CL_PHISH_CLEAN ;
}
if ( domainlist_match ( engine , urls - > realLink . data , urls - > displayLink . data , NULL , 0 , & urls - > flags ) ) {
phishy | = DOMAIN_LISTED ;
} else {
/* although entire url is not listed, the host might be,
* so defer phishing decisions till we know if host is listed */
}
url_check_init ( & host_url ) ;
if ( ( rc = url_get_host ( pchk , urls , & host_url , DOMAIN_DISPLAY , & phishy ) ) ) {
free_if_needed ( & host_url ) ;
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( ! ( phishy & DOMAIN_LISTED ) & &
! domainlist_match ( engine , host_url . displayLink . data , host_url . realLink . data , & urls - > pre_fixup , 1 , & urls - > flags ) ) {
return CL_PHISH_CLEAN ; /* domain not listed */
}
/* link type filtering must occur after last domainlist_match */
if ( urls - > link_type & LINKTYPE_IMAGE & & ! ( urls - > flags & CHECK_IMG_URL ) )
return CL_PHISH_CLEAN ; /* its listed, but this link type is filtered */
if ( urls - > flags & CHECK_CLOAKING ) {
/*Checks if URL is cloaked.
Should we check if it contains another http : //, https://?
No because we might get false positives from redirect services . */
if ( strchr ( urls - > realLink . data , 0x1 ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_CLOAKED_NULL ;
}
if ( isEncoded ( urls - > displayLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_HEX_URL ;
}
}
if ( urls - > flags & CHECK_SSL & & isSSL ( urls - > displayLink . data ) & & ! isSSL ( urls - > realLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_SSL_SPOOF ;
}
if ( ( rc = url_get_host ( pchk , urls , & host_url , DOMAIN_REAL , & phishy ) ) )
{
free_if_needed ( & host_url ) ;
return rc < 0 ? rc : CL_PHISH_CLEAN ;
}
if ( whitelist_check ( engine , & host_url , 1 ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_CLEAN ;
}
if ( ! strcmp ( urls - > realLink . data , urls - > displayLink . data ) ) {
free_if_needed ( & host_url ) ;
return CL_PHISH_CLEAN ;
}
{
struct url_check domain_url ;
url_check_init ( & domain_url ) ;
url_get_domain ( pchk , & host_url , & domain_url ) ;
if ( ! strcmp ( domain_url . realLink . data , domain_url . displayLink . data ) ) {
free_if_needed ( & host_url ) ;
free_if_needed ( & domain_url ) ;
return CL_PHISH_CLEAN ;
}
free_if_needed ( & domain_url ) ;
}
free_if_needed ( & host_url ) ;
/*we failed to find a reason why the 2 URLs are different, this is definitely phishing*/
return phishy_map ( phishy , CL_PHISH_NOMATCH ) ;
}
static const char * phishing_ret_toString ( enum phish_status rc )
{
switch ( rc ) {
case CL_PHISH_CLEAN :
return " Clean " ;
case CL_PHISH_CLOAKED_NULL :
return " Link URL is cloaked (null byte %00) " ;
case CL_PHISH_CLOAKED_UIU :
return " Link URL contains username, and real<->displayed hosts don't match. " ;
/*username is a legit domain, and after the @ comes the evil one*/
case CL_PHISH_SSL_SPOOF :
return " Visible links is SSL, real link is not " ;
case CL_PHISH_NOMATCH :
return " URLs are way too different " ;
case CL_PHISH_HEX_URL :
return " Embedded hex urls " ;
default :
return " Unknown return code " ;
}
}