From b02aff65f5e1a42307777d0a354406c482e47235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edvin?= Date: Mon, 16 Mar 2009 13:53:57 +0000 Subject: [PATCH] add comments, no functionality change. git-svn: trunk@4954 --- ChangeLog | 4 ++++ libclamav/phishcheck.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4764d0834..1feac384e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Mon Mar 16 15:53:52 EET 2009 (edwin) +------------------------------------ + * libclamav/phishcheck.c: add comments, no functionality change. + Mon Mar 16 15:41:17 EET 2009 (edwin) ------------------------------------ * docs/phishsigs_howto.tex, libclamav/phishcheck.c: document URL diff --git a/libclamav/phishcheck.c b/libclamav/phishcheck.c index d94a948fa..35ec40cf1 100644 --- a/libclamav/phishcheck.c +++ b/libclamav/phishcheck.c @@ -1235,18 +1235,22 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, urlbuff[dest_len] = urlbuff[dest_len+1] = urlbuff[dest_len+2] = '\0'; url = urlbuff; + /* canonicalize only real URLs, with a protocol */ host_begin = strchr(url, ':'); if(!host_begin) return CL_PHISH_CLEAN; ++host_begin; + /* ignore username in URL */ p = strchr(host_begin, '@'); if (p) host_begin = p+1; url = host_begin; + /* repeatedly % unescape characters */ str_hex_to_char(&url, &urlend); host_begin = url; len = urlend - url; + /* skip to beginning of hostname */ while((host_begin < urlend) && *host_begin == '/') ++host_begin; while(*host_begin == '.' && host_begin < urlend) ++host_begin; @@ -1255,11 +1259,13 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, while (p < urlend) { if (p+2 < urlend && *p == '/' && p[1] == '.' ) { if (p[2] == '/') { + /* remove /./ */ if (p + 3 < urlend) memmove(p+1, p+3, urlend - p - 3); urlend -= 2; } else if (p[2] == '.' && (p[3] == '/' || p[3] == '\0') && last) { + /* remove /component/../ */ if (p+4 < urlend) memmove(last+1, p+4, urlend - p - 4); urlend -= 3 + (p - last); @@ -1276,6 +1282,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, while (p < urlend && p+2 < url + dest_len) { unsigned char c = *p; if (c <= 32 || c >= 127 || c == '%' || c == '#') { + /* convert non-ascii characters back to % escaped */ const char hexchars[] = "0123456789ABCDEF"; memmove(p+3, p+1, urlend - p - 1); *p++ = '%'; @@ -1288,9 +1295,11 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, *p = '\0'; urlend = p; len = urlend - url; + /* determine end of hostname */ host_len = strcspn(host_begin, ":/?"); path_begin = host_begin + host_len; if(host_len < len) { + /* url without path, use a single / */ memmove(path_begin + 2, path_begin + 1, len - host_len); *path_begin++ = '/'; *path_begin++ = '\0'; @@ -1299,6 +1308,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, path_len = url + len - path_begin + 1; p = strchr(path_begin, '#'); if (p) { + /* ignore anchor */ *p = '\0'; path_len = p - path_begin; } @@ -1307,6 +1317,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, path_len = 0; *path = ""; } + /* lowercase entire URL */ str_make_lowercase(host_begin, host_len); *host = host_begin; *hostlen = host_len; @@ -1330,6 +1341,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, unsigned count; if(!rlist || !rlist->sha256_hashes.bm_patterns) { + /* no hashes loaded -> don't waste time canonicalizing and + * looking up */ return CL_SUCCESS; } if(!inurl) @@ -1338,6 +1351,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, rc = cli_url_canon(inurl, len, urlbuff, sizeof(urlbuff), &host_begin, &host_len, &path_begin, &path_len); if (rc == CL_PHISH_CLEAN) return rc; + + /* get last 5 components of hostname */ j=COMPONENTS; component = strrchr(host_begin, '.'); while(component && j > 0) { @@ -1351,6 +1366,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, } lp[j] = host_begin; + /* get first 5 components of path */ pp[0] = path_len; if(path_len) { pp[1] = strcspn(path_begin, "?"); @@ -1376,6 +1392,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, rlist->hostkey_prefix.bm_patterns; --ji; assert(pp[ki] <= path_len); + /* lookup prefix/suffix hashes of URL */ rc = hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki], need_prefixmatch ? &prefix_matched : NULL); if(rc) { @@ -1383,6 +1400,9 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, } count++; if (count == 2 && !prefix_matched && rlist->hostkey_prefix.bm_patterns) { + /* if hostkey is not matched, don't bother calculating + * hashes for other parts of the URL, they are not in the DB + */ cli_dbgmsg("hostkey prefix not matched, short-circuiting lookups\n"); return CL_SUCCESS; }