ignore invalid URLs containing double dots, optimization: cut URL after hostname

git-svn: trunk@3569
remotes/push_mirror/metadata
Török Edvin 18 years ago
parent 8619da9120
commit f12c2e6825
  1. 5
      ChangeLog
  2. BIN
      docs/phishsigs_howto.pdf
  3. 7
      docs/phishsigs_howto.tex
  4. 71
      libclamav/phishcheck.c

@ -1,3 +1,8 @@
Thu Jan 31 17:44:35 EET 2008 (edwin)
------------------------------------
* libclamav/phishcheck.c, docs/phishsigs_howto.tex: ignore invalid URLs
containing double dots, optimization: cut URL after hostname
Thu Jan 31 16:33:56 CET 2008 (tk)
---------------------------------
* libclamav/vba_extract.c: minor code tidy; drop broken sigtouint32()

Binary file not shown.

@ -237,7 +237,10 @@ Furthermore you can restrict what checks are to be performed by specifying the 3
\subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}}
The html parser extracts pairs of \textsc{realURL}/\textsc{displayedURL}
based on the following rules:
based on the following rules.
Starting with version 0.93, after the URLs have been extracted they are normalized and cut after the hostname. For example,
\verb+http://test.example.com/path/somecgi?queryparameters+ becomes \verb+http://test.example.com/+.
\begin{description}
\item [{a}] (anchor) the \emph{href} is the \textsc{realURL}, its \emph{contents}
@ -588,4 +591,4 @@ Then see what urls are being checked, see if any of them is in a
whitelist, see if all urls are detected, etc.
\end{document}
\end{document}

@ -173,46 +173,34 @@ static const size_t https_len = sizeof(https)-1;
#define URI_safe_nodot "-$_@&"
#define URI_safe "-$_@.&"
#define URI_extra "!*\"'(),"
#define URI_reserved "=;/#?: "
#define URI_national "{}|[]\\^~"
#define URI_punctuation "<>"
/* Bracket expression matching exactly one hexadecimal digit; used by
 * URI_escape to match %xx sequences. The upper-case range must be A-F:
 * a range of A-f would also accept G-Z and the punctuation between 'Z'
 * and 'a'. */
#define URI_hex "[0-9a-fA-F]"
#define URI_escape "%"URI_hex"{2}"
#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
#define URI_xalphas URI_xalpha"+"
#define URI_xalphas_nodot URI_xalpha_nodot"*"
#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot""
#define URI_xpalpha URI_xalpha"|\\+"
#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
#define URI_xpalphas "("URI_xpalpha")+"
#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
#define optional_URI_xpalphas "("URI_xpalpha"|=)*"
#define URI_scheme URI_ialpha
#define URI_tld iana_tld
#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
#define URI_path2 URI_tld
#define URI_path3 "(/"optional_URI_xpalphas")*"
#define URI_search "("URI_xalphas")*"
#define URI_fragmentid URI_xalphas
#define URI_IP_digits "["URI_digit"]{1,3}"
#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
#define URI_path_start "[/?:]?"
#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
#define URI_numeric_fragmentaddress URI_numeric_URI
#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
#define URI_URI2 URI_path2
#define URI_URI3 URI_path3"(\\?" URI_search")?"
#define URI_URI2 URI_tld
#define URI_fragmentaddress1 URI_URI1
#define URI_fragmentaddress2 URI_URI2
#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
#define URI_fragmentaddress2 URI_URI2""URI_path_start
#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
@ -680,6 +668,9 @@ str_fixup_spaces(char **begin, const char **end)
/* strip leading/trailing garbage */
while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++;
while(!isalnum(send[0]) && send >= sbegin) send--;
/* keep terminating slash character*/
if(send[1] == '/') send++;
*begin = sbegin;
*end = send;
}
@ -715,7 +706,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
}
while(isspace(*end))
end--;
/*TODO: convert \ to /, and stuff like that*/
/* From mailscanner, my comments enclosed in {} */
if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
string_assign_null(URL);
@ -727,6 +717,32 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
int rc;
str_replace(begin,end,'\\','/');
/* find beginning of hostname, because:
* - we want to keep only protocol, host, and
* strip path & query parameter(s)
* - we want to make hostname lowercase*/
host_begin = strchr(begin,':');
while(host_begin && (host_begin < end) && (host_begin[1] == '/')) host_begin++;
if(!host_begin) host_begin=begin;
else host_begin++;
host_len = strcspn(host_begin,":/?");
if(host_begin + host_len > end + 1) {
/* prevent the hostname from extending beyond end; this can happen
 * if there are spaces at the end, and we don't want those to be part of
 * the hostname */
host_len = end - host_begin + 1;
} else {
/* cut the URL after the hostname */
/* @end points to last character we want to be part of the URL */
end = host_begin + host_len - 1;
}
/* terminate URL with a slash, except when we're at end of string */
if(host_begin[host_len]) {
host_begin[host_len] = '/';
end++;
}
/* convert hostname to lowercase, but only hostname! */
str_make_lowercase(host_begin, host_len);
/* some broken MUAs put > in the href, and then
* we get a false positive, so remove them */
str_replace(begin,end,'<',' ');
@ -735,13 +751,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
str_replace(begin,end,';',' ');
str_strip(&begin,&end,lt,lt_len);
str_strip(&begin,&end,gt,gt_len);
/* convert hostname to lowercase, but only hostname! */
host_begin = strchr(begin,':');
while(host_begin && host_begin[1]=='/') host_begin++;
if(!host_begin) host_begin=begin;
else host_begin++;
host_len = strcspn(host_begin,"/?");
str_make_lowercase(host_begin,host_len);
/* convert %xx to real value */
str_hex_to_char(&begin,&end);
if(isReal) {
@ -929,7 +938,7 @@ int phishing_init(struct cl_engine* engine)
engine->phishcheck = NULL;
return CL_EFORMAT;
}
url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
if(build_regex(&pchk->preg,url_regex,1)) {
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
@ -939,7 +948,7 @@ int phishing_init(struct cl_engine* engine)
return CL_EFORMAT;
}
free(url_regex);
realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
if(build_regex(&pchk->preg_realurl, realurl_regex,1)) {
free_regex(&pchk->preg_cctld);
free_regex(&pchk->preg_tld);
@ -1017,7 +1026,6 @@ static enum phish_status cleanupURLs(struct url_check* urls)
{
if(urls->flags&CLEANUP_URL) {
cleanupURL(&urls->realLink,NULL,1);
cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0);
if(!urls->displayLink.data || !urls->realLink.data)
return CL_PHISH_NODECISION;
@ -1045,12 +1053,14 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
cli_dbgmsg("Phishcheck:host:%s\n", host->data);
if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
if(!host->data || (isReal && (host->data[0]=='\0' || strstr(host->data,".."))) || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
/* no host,
* link without domain, such as: href="/isapi.dll?...
* mailto:
* spaces in hostname
* double dots
*/
cli_dbgmsg("Phishcheck:skipping invalid host\n");
return CL_PHISH_CLEAN;
}
if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) {
@ -1127,6 +1137,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
urls->displayLink.data);
if(whitelist_check(engine, urls, 0))
return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */

Loading…
Cancel
Save