From 15021325efc5740d70a3789300c50df19ed85336 Mon Sep 17 00:00:00 2001 From: Nigel Horne Date: Tue, 14 Sep 2004 20:51:01 +0000 Subject: [PATCH] Use new normalise code git-svn: trunk@865 --- clamav-devel/ChangeLog | 7 ++ clamav-devel/libclamav/mbox.c | 163 ++++++++-------------------------- 2 files changed, 44 insertions(+), 126 deletions(-) diff --git a/clamav-devel/ChangeLog b/clamav-devel/ChangeLog index 41759b318..d73159c7d 100644 --- a/clamav-devel/ChangeLog +++ b/clamav-devel/ChangeLog @@ -1,3 +1,10 @@ +Tue Sep 14 21:48:36 BST 2004 (njh) +---------------------------------- + * libclamav/mbox.c: FOLLOWURL: now uses the new normalisation code to + find URLs to scan for trojans. This means + better scanning of HTML than the old FOLLOWURL + code and all is now done in RAM + Tue Sep 14 22:32:50 CEST 2004 (tk) ---------------------------------- * libclamav: do not print outdate warning for main.cvd diff --git a/clamav-devel/libclamav/mbox.c b/clamav-devel/libclamav/mbox.c index b5265e6da..898516f9f 100644 --- a/clamav-devel/libclamav/mbox.c +++ b/clamav-devel/libclamav/mbox.c @@ -17,6 +17,9 @@ * * Change History: * $Log: mbox.c,v $ + * Revision 1.119 2004/09/14 20:47:28 nigelhorne + * Use new normalise code + * * Revision 1.118 2004/09/14 12:09:37 nigelhorne * Include old normalise code * @@ -342,7 +345,7 @@ * Compilable under SCO; removed duplicate code with message.c * */ -static char const rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $"; +static char const rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $"; #if HAVE_CONFIG_H #include "clamav-config.h" @@ -427,6 +430,8 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; #ifdef FOLLOWURLS +#include "htmlnorm.h" + #define MAX_URLS 5 /* * Maximum number of URLs scanned in a message * part @@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir) } #ifdef FOLLOWURLS - -/* - * TODO: Use the newer normalise code - * This is the old normalise code which normalises in memory. The new - * code uses temporary files and has a different API. - * -* Normalize an HTML buffer using the following rules: - o Remove multiple contiguous spaces - o Remove spaces around '<' and '>' in tags - o Remove spaces around '=' in tags - o Replace single quote with double quote in tags - o Convert to lowercase - o Convert all white space to a space character -*/ -static unsigned char * -mbox_html_normalize(unsigned char *in_buff, off_t in_size) -{ - unsigned char *out_buff; - off_t out_size=0, i; - int had_space=FALSE, tag_depth=0, in_quote=FALSE; - - out_buff = (unsigned char *)cli_malloc(in_size+1); - if (!out_buff) { - cli_errmsg("malloc failed"); - return NULL; - } - - for (i=0 ; i < in_size ; i++) { - if (in_buff[i] == '<') { - out_buff[out_size++] = '<'; - tag_depth++; - if (tag_depth == 1) { - had_space=TRUE; /* consume spaces */ - } - } else if ((in_buff[i] == '=') && (tag_depth == 1)) { - /* Remove preceeding spaces */ - while ((out_size > 0) && - (out_buff[out_size-1] == ' ')) { - out_size--; - } - out_buff[out_size++] = '='; - had_space=TRUE; - } else if (isspace(in_buff[i])) { - if (!had_space) { - out_buff[out_size++] = ' '; - had_space=TRUE; - } - } else if (in_buff[i] == '>') { - /* Remove preceeding spaces */ - if (tag_depth == 1) { - while ((out_size > 0) && - (out_buff[out_size-1] == ' ')) { - out_size--; - } - } - out_buff[out_size++] = '>'; - tag_depth--; - } else if ((in_buff[i] == '\'') && (tag_depth==1)) { - /* Convert single quotes to double quotes */ - if (in_quote || out_buff[out_size-1] == '=') { - out_buff[out_size++] = '\"'; - in_quote = !in_quote; - } else { - out_buff[out_size++] = '\''; - } - } else { - out_buff[out_size++] = tolower(in_buff[i]); - had_space=FALSE; - } - } - out_buff[out_size] = '\0'; - return out_buff; -} - static void checkURLs(message *m, const char *dir) { blob *b = messageToBlob(m); - char *ptr, *normalised; size_t len; table_t *t; - int n; + int i, n; #if defined(WITH_CURL) && defined(CL_THREAD_SAFE) pthread_t tid[MAX_URLS]; struct arg args[MAX_URLS]; #endif + tag_arguments_t hrefs; if(b == NULL) return; @@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir) t = tableCreate(); - n = 0; - normalised = ptr = mbox_html_normalize(blobGetData(b), len); + memset(&hrefs, '\0', sizeof(hrefs)); + + cli_dbgmsg("checkURLs: calling html_normalise_mem\n"); + html_normalise_mem(blobGetData(b), len, NULL, &hrefs); + cli_dbgmsg("checkURLs: html_normalise_mem returned\n"); - if(normalised == NULL) { + /*if(href == NULL) { blobDestroy(b); tableDestroy(t); return; - } + }*/ /* TODO: Do we need to call remove_html_comments? */ - /* - * cli_memstr(ptr, len, " 0) && ((*p2 == '\"') || isspace(*p2))) { - len--; - p2++; - } - if(len == 0) - break; - ptr = p2; - while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) { - ptr++; - len--; - } - if(len == 0) - break; - *ptr = '\0'; - if(strncasecmp(p2, "mailto:", 7) == 0) - continue; - if(*p2 == '\0') - continue; - if(tableFind(t, p2) == 1) { - cli_dbgmsg("URL %s already downloaded\n", p2); + char name[NAME_MAX]; + + if(tableFind(t, url) == 1) { + cli_dbgmsg("URL %s already downloaded\n", url); continue; } if(n == MAX_URLS) { cli_warnmsg("Not all URLs will be scanned\n"); break; } - (void)tableInsert(t, p2, 1); - cli_dbgmsg("Downloading URL %s to be scanned\n", p2); - strncpy(name, p2, sizeof(name)); - for(p3 = name; *p3; p3++) - if(*p3 == '/') - *p3 = '_'; + (void)tableInsert(t, url, 1); + cli_dbgmsg("Downloading URL %s to be scanned\n", url); + strncpy(name, url, sizeof(name)); + for(ptr = name; *ptr; ptr++) + if(*ptr == '/') + *ptr = '_'; #ifdef WITH_CURL #ifdef CL_THREAD_SAFE - args[n].url = strdup(p2); + args[n].url = strdup(url); args[n].dir = strdup(dir); args[n].filename = strdup(name); pthread_create(&tid[n], NULL, getURL, &args[n]); #else - arg.url = p2; + arg.url = url; arg.dir = dir; arg.filename = name; getURL(&arg); @@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir) /* * TODO: maximum size and timeouts */ - snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name); + snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name); cli_dbgmsg("%s\n", cmd); #ifdef CL_THREAD_SAFE pthread_mutex_lock(&system_mutex); @@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir) snprintf(cmd, sizeof(cmd), "%s/%s", dir, name); if(stat(cmd, &statb) >= 0) if(statb.st_size == 0) { - cli_warnmsg("URL %s failed to download\n", p2); + cli_warnmsg("URL %s failed to download\n", url); /* * Don't bother scanning an empty file */ @@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir) #endif ++n; } - ptr++; - len--; } + html_tag_arg_free(&hrefs); blobDestroy(b); tableDestroy(t); - free(normalised); #if defined(WITH_CURL) && defined(CL_THREAD_SAFE) cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);