From 15021325efc5740d70a3789300c50df19ed85336 Mon Sep 17 00:00:00 2001
From: Nigel Horne <njh@clamav.net>
Date: Tue, 14 Sep 2004 20:51:01 +0000
Subject: [PATCH] Use new normalise code

git-svn: trunk@865
---
 clamav-devel/ChangeLog        |   7 ++
 clamav-devel/libclamav/mbox.c | 163 ++++++++--------------------------
 2 files changed, 44 insertions(+), 126 deletions(-)

diff --git a/clamav-devel/ChangeLog b/clamav-devel/ChangeLog
index 41759b318..d73159c7d 100644
--- a/clamav-devel/ChangeLog
+++ b/clamav-devel/ChangeLog
@@ -1,3 +1,10 @@
+Tue Sep 14 21:48:36 BST 2004 (njh)
+----------------------------------
+  * libclamav/mbox.c:	FOLLOWURL: now uses the new normalisation code to
+				find URLs to scan for trojans. This means
+				better scanning of HTML than the old FOLLOWURL
+				code and all is now done in RAM
+
 Tue Sep 14 22:32:50 CEST 2004 (tk)
 ----------------------------------
   * libclamav: do not print outdate warning for main.cvd
diff --git a/clamav-devel/libclamav/mbox.c b/clamav-devel/libclamav/mbox.c
index b5265e6da..898516f9f 100644
--- a/clamav-devel/libclamav/mbox.c
+++ b/clamav-devel/libclamav/mbox.c
@@ -17,6 +17,9 @@
  *
  * Change History:
  * $Log: mbox.c,v $
+ * Revision 1.119  2004/09/14 20:47:28  nigelhorne
+ * Use new normalise code
+ *
  * Revision 1.118  2004/09/14 12:09:37  nigelhorne
  * Include old normalise code
  *
@@ -342,7 +345,7 @@
  * Compilable under SCO; removed duplicate code with message.c
  *
  */
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $";
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $";
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
@@ -427,6 +430,8 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
 
 #ifdef	FOLLOWURLS
 
+#include "htmlnorm.h"
+
 #define	MAX_URLS	5	/*
 				 * Maximum number of URLs scanned in a message
 				 * part
@@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir)
 }
 
 #ifdef	FOLLOWURLS
-
-/*
- * TODO: Use the newer normalise code
- * This is the old normalise code which normalises in memory. The new
- * code uses temporary files and has a different API.
- * 
-* Normalize an HTML buffer using the following rules:
-	o Remove multiple contiguous spaces
-	o Remove spaces around '<' and '>' in tags
-	o Remove spaces around '=' in tags
-	o Replace single quote with double quote in tags
-	o Convert to lowercase
-	o Convert all white space to a space character
-*/
-static unsigned char *
-mbox_html_normalize(unsigned char *in_buff, off_t in_size)
-{
-	unsigned char *out_buff;
-	off_t out_size=0, i;
-	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
-	
-	out_buff = (unsigned char *)cli_malloc(in_size+1);
-	if (!out_buff) {
-		cli_errmsg("malloc failed");
-		return NULL;
-	}
-	
-	for (i=0 ; i < in_size ; i++) {
-		if (in_buff[i] == '<') {
-			out_buff[out_size++] = '<';
-			tag_depth++;
-			if (tag_depth == 1) {
-				had_space=TRUE; /* consume spaces */
-			}
-		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
-			/* Remove preceeding spaces */
-			while ((out_size > 0) &&
-				(out_buff[out_size-1] == ' ')) {
-				out_size--;
-			}
-			out_buff[out_size++] = '=';
-			had_space=TRUE;
-		} else if (isspace(in_buff[i])) {
-			if (!had_space) {
-				out_buff[out_size++] = ' ';
-				had_space=TRUE;
-			}
-		} else if (in_buff[i] == '>') {
-			/* Remove preceeding spaces */
-			if (tag_depth == 1) {
-				while ((out_size > 0) &&
-					(out_buff[out_size-1] == ' ')) {
-					out_size--;
-				}
-			}
-			out_buff[out_size++] = '>';
-			tag_depth--;	
-		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
-			/* Convert single quotes to double quotes */
-			if (in_quote || out_buff[out_size-1] == '=') {
-				out_buff[out_size++] = '\"';
-				in_quote = !in_quote;
-			} else {
-				out_buff[out_size++] = '\'';
-			}
-		} else {
-			out_buff[out_size++] = tolower(in_buff[i]);
-			had_space=FALSE;
-		}
-	}
-	out_buff[out_size] = '\0';
-	return out_buff;
-}
-
 static void
 checkURLs(message *m, const char *dir)
 {
 	blob *b = messageToBlob(m);
-	char *ptr, *normalised;
 	size_t len;
 	table_t *t;
-	int n;
+	int i, n;
 #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
 	pthread_t tid[MAX_URLS];
 	struct arg args[MAX_URLS];
 #endif
+	tag_arguments_t hrefs;
 
 	if(b == NULL)
 		return;
@@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir)
 
 	t = tableCreate();
 
-	n = 0;
-	normalised = ptr = mbox_html_normalize(blobGetData(b), len);
+	memset(&hrefs, '\0', sizeof(hrefs));
+
+	cli_dbgmsg("checkURLs: calling html_normalise_mem\n");
+	html_normalise_mem(blobGetData(b), len, NULL, &hrefs);
+	cli_dbgmsg("checkURLs: html_normalise_mem returned\n");
 
-	if(normalised == NULL) {
+	/*if(href == NULL) {
 		blobDestroy(b);
 		tableDestroy(t);
 		return;
-	}
+	}*/
 	/* TODO: Do we need to call remove_html_comments? */
 
-	/*
-	 * cli_memstr(ptr, len, "<a href=", 8)
-	 * Don't use cli_memstr() until bounds problem sorted
-	 * and it returns the place that the 'needle' was found
-	 */
-	while(len >= 8) {
-		if(strncasecmp(ptr, "<a href=", 8) == 0) {
+	n = 0;
+
+	for(i = 0; i < hrefs.count; i++) {
+		const char *url = hrefs.value[i];
+
+		if(strncasecmp("http://", url, 7) == 0) {
+			char *ptr;
 #ifdef	WITH_CURL
 #ifndef	CL_THREAD_SAFE
 			struct arg arg;
@@ -2242,52 +2176,31 @@ checkURLs(message *m, const char *dir)
 			struct stat statb;
 			char cmd[512];
 #endif	/*WITH_CURL*/
-			char *p2 = &ptr[8];
-			char *p3;
-			char name[512];
-
-			len -= 8;
-			while((len > 0) && ((*p2 == '\"') || isspace(*p2))) {
-				len--;
-				p2++;
-			}
-			if(len == 0)
-				break;
-			ptr = p2;
-			while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) {
-				ptr++;
-				len--;
-			}
-			if(len == 0)
-				break;
-			*ptr = '\0';
-			if(strncasecmp(p2, "mailto:", 7) == 0)
-				continue;
-			if(*p2 == '\0')
-				continue;
-			if(tableFind(t, p2) == 1) {
-				cli_dbgmsg("URL %s already downloaded\n", p2);
+			char name[NAME_MAX];
+
+			if(tableFind(t, url) == 1) {
+				cli_dbgmsg("URL %s already downloaded\n", url);
 				continue;
 			}
 			if(n == MAX_URLS) {
 				cli_warnmsg("Not all URLs will be scanned\n");
 				break;
 			}
-			(void)tableInsert(t, p2, 1);
-			cli_dbgmsg("Downloading URL %s to be scanned\n", p2);
-			strncpy(name, p2, sizeof(name));
-			for(p3 = name; *p3; p3++)
-				if(*p3 == '/')
-					*p3 = '_';
+			(void)tableInsert(t, url, 1);
+			cli_dbgmsg("Downloading URL %s to be scanned\n", url);
+			strncpy(name, url, sizeof(name));
+			for(ptr = name; *ptr; ptr++)
+				if(*ptr == '/')
+					*ptr = '_';
 
 #ifdef	WITH_CURL
 #ifdef	CL_THREAD_SAFE
-			args[n].url = strdup(p2);
+			args[n].url = strdup(url);
 			args[n].dir = strdup(dir);
 			args[n].filename = strdup(name);
 			pthread_create(&tid[n], NULL, getURL, &args[n]);
 #else
-			arg.url = p2;
+			arg.url = url;
 			arg.dir = dir;
 			arg.filename = name;
 			getURL(&arg);
@@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir)
 			/*
 			 * TODO: maximum size and timeouts
 			 */
-			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name);
+			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name);
 			cli_dbgmsg("%s\n", cmd);
 #ifdef	CL_THREAD_SAFE
 			pthread_mutex_lock(&system_mutex);
@@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir)
 			snprintf(cmd, sizeof(cmd), "%s/%s", dir, name);
 			if(stat(cmd, &statb) >= 0)
 				if(statb.st_size == 0) {
-					cli_warnmsg("URL %s failed to download\n", p2);
+					cli_warnmsg("URL %s failed to download\n", url);
 					/*
 					 * Don't bother scanning an empty file
 					 */
@@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir)
 #endif
 			++n;
 		}
-		ptr++;
-		len--;
 	}
+	html_tag_arg_free(&hrefs);
 	blobDestroy(b);
 	tableDestroy(t);
-	free(normalised);
 
 #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
 	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);