From 90bb9c3e8fb0dbd0f7a8365348adcab239e15f46 Mon Sep 17 00:00:00 2001 From: Nigel Horne Date: Tue, 14 Sep 2004 12:14:29 +0000 Subject: [PATCH] Include old normalise code git-svn: trunk@863 --- clamav-devel/ChangeLog | 9 ++++ clamav-devel/libclamav/mbox.c | 83 +++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/clamav-devel/ChangeLog b/clamav-devel/ChangeLog index 619c436e0..f3dbc612c 100644 --- a/clamav-devel/ChangeLog +++ b/clamav-devel/ChangeLog @@ -1,3 +1,12 @@ +Tue Sep 14 13:10:38 BST 2004 (njh) +---------------------------------- + * libclamav/mbox.c: FOLLOWURL: include the text of the old HTML + normalisation code that works in RAM until the + code for the new HTML API that uses temporary + files is added to mbox.c. This allows clamAV to + link and work until the new code is called + from mbox.c. + Tue Sep 14 11:30:43 BST 2004 (njh) ---------------------------------- * libclamav/untar.c: Fix compilation error on AIX and OSF diff --git a/clamav-devel/libclamav/mbox.c b/clamav-devel/libclamav/mbox.c index a2a568636..b5265e6da 100644 --- a/clamav-devel/libclamav/mbox.c +++ b/clamav-devel/libclamav/mbox.c @@ -17,6 +17,9 @@ * * Change History: * $Log: mbox.c,v $ + * Revision 1.118 2004/09/14 12:09:37 nigelhorne + * Include old normalise code + * * Revision 1.117 2004/09/13 16:44:01 kojm * minor cleanup * @@ -339,7 +342,7 @@ * Compilable under SCO; removed duplicate code with message.c * */ -static char const rcsid[] = "$Id: mbox.c,v 1.117 2004/09/13 16:44:01 kojm Exp $"; +static char const rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $"; #if HAVE_CONFIG_H #include "clamav-config.h" @@ -424,8 +427,6 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; #ifdef FOLLOWURLS -#include "htmlnorm.h" - #define MAX_URLS 5 /* * Maximum number of URLs scanned in a message * part @@ -2109,6 +2110,80 @@ saveTextPart(message *m, const char *dir) } #ifdef FOLLOWURLS + +/* + * TODO: Use the newer normalise code + * This is the old normalise code which normalises in memory. The new + * code uses temporary files and has a different API. + * +* Normalize an HTML buffer using the following rules: + o Remove multiple contiguous spaces + o Remove spaces around '<' and '>' in tags + o Remove spaces around '=' in tags + o Replace single quote with double quote in tags + o Convert to lowercase + o Convert all white space to a space character +*/ +static unsigned char * +mbox_html_normalize(unsigned char *in_buff, off_t in_size) +{ + unsigned char *out_buff; + off_t out_size=0, i; + int had_space=FALSE, tag_depth=0, in_quote=FALSE; + + out_buff = (unsigned char *)cli_malloc(in_size+1); + if (!out_buff) { + cli_errmsg("malloc failed"); + return NULL; + } + + for (i=0 ; i < in_size ; i++) { + if (in_buff[i] == '<') { + out_buff[out_size++] = '<'; + tag_depth++; + if (tag_depth == 1) { + had_space=TRUE; /* consume spaces */ + } + } else if ((in_buff[i] == '=') && (tag_depth == 1)) { + /* Remove preceeding spaces */ + while ((out_size > 0) && + (out_buff[out_size-1] == ' ')) { + out_size--; + } + out_buff[out_size++] = '='; + had_space=TRUE; + } else if (isspace(in_buff[i])) { + if (!had_space) { + out_buff[out_size++] = ' '; + had_space=TRUE; + } + } else if (in_buff[i] == '>') { + /* Remove preceeding spaces */ + if (tag_depth == 1) { + while ((out_size > 0) && + (out_buff[out_size-1] == ' ')) { + out_size--; + } + } + out_buff[out_size++] = '>'; + tag_depth--; + } else if ((in_buff[i] == '\'') && (tag_depth==1)) { + /* Convert single quotes to double quotes */ + if (in_quote || out_buff[out_size-1] == '=') { + out_buff[out_size++] = '\"'; + in_quote = !in_quote; + } else { + out_buff[out_size++] = '\''; + } + } else { + out_buff[out_size++] = tolower(in_buff[i]); + had_space=FALSE; + } + } + out_buff[out_size] = '\0'; + return out_buff; +} + static void checkURLs(message *m, const char *dir) { @@ -2139,7 +2214,7 @@ checkURLs(message *m, const char *dir) t = tableCreate(); n = 0; - normalised = ptr = html_normalize(blobGetData(b), len); + normalised = ptr = mbox_html_normalize(blobGetData(b), len); if(normalised == NULL) { blobDestroy(b);