ignore invalid URLs containing double dots, optimization: cut URL after hostname

git-svn: trunk@3569
18 years ago · f12c2e6825
parent 8619da9120
commit f12c2e6825
4 changed files with 51 additions and 32 deletions
--- a/5
+++ b/5
@@ -1,3 +1,8 @@
+Thu Jan 31 17:44:35 EET 2008 (edwin)
+------------------------------------
+  * libclamav/phishcheck.c, docs/phishsigs_howto.tex: ignore invalid URLs
+  containing double dots, optimization: cut URL after hostname
+
 Thu Jan 31 16:33:56 CET 2008 (tk)
 ---------------------------------
  * libclamav/vba_extract.c: minor code tidy; drop broken sigtouint32()
--- a/docs/phishsigs_howto.pdf
+++ b/docs/phishsigs_howto.pdf
--- a/docs/phishsigs_howto.tex
+++ b/docs/phishsigs_howto.tex
@@ -237,7 +237,10 @@ Furthermore you can restrict what checks are to be performed by specifying the 3
 \subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}}

 The html parser extracts pairs of \textsc{realURL}/\textsc{displayedURL}
-based on the following rules:
+based on the following rules.
+
+In version 0.93: after URLs have been extracted, they are normalized and cut after the hostname. For example,
+\verb+http://test.example.com/path/somecgi?queryparameters+ becomes \verb+http://test.example.com/+.

 \begin{description}
 \item [{a}] (anchor) the \emph{href} is the \textsc{realURL}, its \emph{contents}
@@ -588,4 +591,4 @@ Then see what urls are being checked, see if any of them is in a
 whitelist, see if all urls are detected, etc.


-\end{document}
+\end{document}
--- a/libclamav/phishcheck.c
+++ b/libclamav/phishcheck.c
@@ -173,46 +173,34 @@ static const size_t https_len  = sizeof(https)-1;
 #define URI_safe_nodot  "-$_@&"
 #define URI_safe	"-$_@.&"
 #define URI_extra	"!*\"'(),"
-#define URI_reserved    "=;/#?: "
-#define URI_national    "{}|[]\\^~"
-#define URI_punctuation "<>"

 #define URI_hex		 "[0-9a-fA-F]" /* one hex digit; upper range must end at 'F' -- ending it at 'f' would also admit G-Z and [\]^_` */
 #define URI_escape      "%"URI_hex"{2}"
 #define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
 #define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"

-#define URI_xalphas URI_xalpha"+"
 #define URI_xalphas_nodot URI_xalpha_nodot"*"

 #define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot""
 #define URI_xpalpha URI_xalpha"|\\+"
 #define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
-#define URI_xpalphas "("URI_xpalpha")+"
 #define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
-#define optional_URI_xpalphas "("URI_xpalpha"|=)*"

 #define URI_scheme URI_ialpha
 #define URI_tld iana_tld
 #define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
-#define URI_path2 URI_tld
-#define URI_path3 "(/"optional_URI_xpalphas")*"
-
-#define URI_search "("URI_xalphas")*"
-#define URI_fragmentid URI_xalphas

 #define URI_IP_digits "["URI_digit"]{1,3}"
-#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
-#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
-#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
+#define URI_path_start "[/?:]?"
+#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
+#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
+#define URI_numeric_fragmentaddress URI_numeric_URI

 #define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
-#define URI_URI2 URI_path2
-#define URI_URI3 URI_path3"(\\?" URI_search")?"
+#define URI_URI2 URI_tld

 #define URI_fragmentaddress1 URI_URI1
-#define URI_fragmentaddress2 URI_URI2
-#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
+#define URI_fragmentaddress2 URI_URI2""URI_path_start

 #define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"

@@ -680,6 +668,9 @@ str_fixup_spaces(char **begin, const char **end)
 	/* strip leading/trailing garbage */
 	while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++;
 	while(!isalnum(send[0]) && send >= sbegin) send--;
+
+	/* keep terminating slash character*/
+	if(send[1] == '/') send++;
 	*begin = sbegin;
 	*end = send;
 }
@@ -715,7 +706,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
 	}
 	while(isspace(*end))
 		end--;
-	/*TODO: convert \ to /, and stuff like that*/
 	/* From mailscanner, my comments enclosed in {} */
 	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
 		string_assign_null(URL);
@@ -727,6 +717,32 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
 		int rc;

 		str_replace(begin,end,'\\','/');
+		/* find beginning of hostname, because:
+		 * - we want to keep only protocol, host, and 
+		 *  strip path & query parameter(s) 
+		 * - we want to make hostname lowercase*/
+		host_begin = strchr(begin,':');
+		while(host_begin && (host_begin < end) && (host_begin[1] == '/'))  host_begin++;
+		if(!host_begin) host_begin=begin;
+		else host_begin++;
+		host_len = strcspn(host_begin,":/?");
+	        if(host_begin + host_len > end + 1) {
+			/* prevent hostname extending beyond end, it can happen
+			 * if we have spaces at the end, we don't want those part of 
+			 * the hostname */
+			host_len = end - host_begin + 1;
+		} else {
+			/* cut the URL after the hostname */
+			/* @end points to last character we want to be part of the URL */
+			end = host_begin + host_len - 1;
+		}
+		/* terminate URL with a slash, except when we're at end of string */
+		if(host_begin[host_len]) {
+			host_begin[host_len] = '/';
+			end++;
+		}
+		/* convert hostname to lowercase, but only hostname! */
+		str_make_lowercase(host_begin, host_len);
 		/* some broken MUAs put > in the href, and then
 		 * we get a false positive, so remove them */
 		str_replace(begin,end,'<',' ');
@@ -735,13 +751,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
 		str_replace(begin,end,';',' ');
 		str_strip(&begin,&end,lt,lt_len);
 		str_strip(&begin,&end,gt,gt_len);
-		/* convert hostname to lowercase, but only hostname! */
-		host_begin = strchr(begin,':');
-		while(host_begin && host_begin[1]=='/') host_begin++;
-		if(!host_begin) host_begin=begin;
-		else host_begin++;
-		host_len = strcspn(host_begin,"/?");
-		str_make_lowercase(host_begin,host_len);
 		/* convert %xx to real value */
 		str_hex_to_char(&begin,&end);
 		if(isReal) {
@@ -929,7 +938,7 @@ int phishing_init(struct cl_engine* engine)
 		engine->phishcheck = NULL;
 		return CL_EFORMAT;
 	}
-	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
+	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
 	if(build_regex(&pchk->preg,url_regex,1)) {
 		free_regex(&pchk->preg_cctld);
 		free_regex(&pchk->preg_tld);
@@ -939,7 +948,7 @@ int phishing_init(struct cl_engine* engine)
 		return CL_EFORMAT;
 	}
 	free(url_regex);
-	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
+	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
 	if(build_regex(&pchk->preg_realurl, realurl_regex,1)) {
 		free_regex(&pchk->preg_cctld);
 		free_regex(&pchk->preg_tld);
@@ -1017,7 +1026,6 @@ static enum phish_status cleanupURLs(struct url_check* urls)
 {
 	if(urls->flags&CLEANUP_URL) {
 		cleanupURL(&urls->realLink,NULL,1);
-
 		cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0);
 		if(!urls->displayLink.data || !urls->realLink.data)
 			return CL_PHISH_NODECISION;
@@ -1045,12 +1053,14 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str

 	cli_dbgmsg("Phishcheck:host:%s\n", host->data);

-	if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
+	if(!host->data || (isReal && (host->data[0]=='\0' || strstr(host->data,".."))) || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
 		/* no host,
 		 * link without domain, such as: href="/isapi.dll?...
 		 * mailto:
 		 * spaces in hostname
+		 * double dots
 		 */
+		cli_dbgmsg("Phishcheck:skipping invalid host\n");
 		return CL_PHISH_CLEAN;
 	}
 	if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) {
@@ -1127,6 +1137,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url

 	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
 		urls->displayLink.data);
+
 	if(whitelist_check(engine, urls, 0))
 		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */