Use new normalise code

git-svn: trunk@865
remotes/push_mirror/metadata
Nigel Horne 21 years ago
parent 37743d67ea
commit 15021325ef
  1. 7
      clamav-devel/ChangeLog
  2. 163
      clamav-devel/libclamav/mbox.c

@ -1,3 +1,10 @@
Tue Sep 14 21:48:36 BST 2004 (njh)
----------------------------------
* libclamav/mbox.c: FOLLOWURL: now uses the new normalisation code to
find URLs to scan for trojans. This means
better scanning of HTML than the old FOLLOWURL
code and all is now done in RAM
Tue Sep 14 22:32:50 CEST 2004 (tk)
----------------------------------
* libclamav: do not print outdate warning for main.cvd

@ -17,6 +17,9 @@
*
* Change History:
* $Log: mbox.c,v $
* Revision 1.119 2004/09/14 20:47:28 nigelhorne
* Use new normalise code
*
* Revision 1.118 2004/09/14 12:09:37 nigelhorne
* Include old normalise code
*
@ -342,7 +345,7 @@
* Compilable under SCO; removed duplicate code with message.c
*
*/
static char const rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $";
static char const rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $";
#if HAVE_CONFIG_H
#include "clamav-config.h"
@ -427,6 +430,8 @@ typedef enum { FALSE = 0, TRUE = 1 } bool;
#ifdef FOLLOWURLS
#include "htmlnorm.h"
#define MAX_URLS 5 /*
* Maximum number of URLs scanned in a message
* part
@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir)
}
#ifdef FOLLOWURLS
/*
* TODO: Use the newer normalise code
* This is the old normalise code which normalises in memory. The new
* code uses temporary files and has a different API.
*
* Normalize an HTML buffer using the following rules:
o Remove multiple contiguous spaces
o Remove spaces around '<' and '>' in tags
o Remove spaces around '=' in tags
o Replace single quote with double quote in tags
o Convert to lowercase
o Convert all white space to a space character
*/
/*
 * Helper for mbox_html_normalize(): back 'end' up over any run of ' '
 * characters that immediately precedes it, never moving before 'start'.
 * Returns the adjusted end pointer.
 */
static unsigned char *
mbox_drop_trailing_spaces(const unsigned char *start, unsigned char *end)
{
	while ((end > start) && (end[-1] == ' '))
		end--;

	return end;
}

/*
 * Produce a normalised copy of the first in_size bytes of in_buff.
 *
 * The copy is written into a fresh buffer of in_size + 1 bytes obtained
 * from cli_malloc(); every input byte yields at most one output byte, so
 * that is always large enough. The result is NUL terminated.
 *
 * Returns the new buffer, or NULL (after logging) if the allocation fails.
 * The input buffer is only read, never modified.
 */
static unsigned char *
mbox_html_normalize(unsigned char *in_buff, off_t in_size)
{
	const unsigned char *src = in_buff;
	unsigned char *result, *dst;
	off_t remaining;
	int skip_space = FALSE;	/* TRUE while further white space is swallowed */
	int depth = 0;		/* count of '<' seen minus count of '>' seen */
	int quoted = FALSE;	/* TRUE between a converted opening and closing quote */

	result = (unsigned char *)cli_malloc(in_size+1);
	if (result == NULL) {
		cli_errmsg("malloc failed");
		return NULL;
	}
	dst = result;

	for (remaining = in_size; remaining > 0; remaining--) {
		const unsigned char ch = *src++;

		if (ch == '<') {
			*dst++ = '<';
			/* On entering an outermost tag, eat the spaces after '<' */
			if (++depth == 1)
				skip_space = TRUE;
			continue;
		}

		if ((ch == '=') && (depth == 1)) {
			/* No spaces on either side of '=' inside a tag */
			dst = mbox_drop_trailing_spaces(result, dst);
			*dst++ = '=';
			skip_space = TRUE;
			continue;
		}

		if (isspace(ch)) {
			/* Any white space run collapses to a single ' ' */
			if (!skip_space) {
				*dst++ = ' ';
				skip_space = TRUE;
			}
			continue;
		}

		if (ch == '>') {
			/* No spaces before the '>' that closes an outermost tag */
			if (depth == 1)
				dst = mbox_drop_trailing_spaces(result, dst);
			*dst++ = '>';
			depth--;
			continue;
		}

		if ((ch == '\'') && (depth == 1)) {
			/*
			 * A single quote becomes a double quote when it opens a
			 * value (directly after '=') or closes one opened that
			 * way; any other single quote is copied unchanged.
			 */
			if (quoted || (dst[-1] == '=')) {
				*dst++ = '\"';
				quoted = !quoted;
			} else {
				*dst++ = '\'';
			}
			continue;
		}

		/* Everything else is copied in lower case */
		*dst++ = tolower(ch);
		skip_space = FALSE;
	}

	*dst = '\0';

	return result;
}
static void
checkURLs(message *m, const char *dir)
{
blob *b = messageToBlob(m);
char *ptr, *normalised;
size_t len;
table_t *t;
int n;
int i, n;
#if defined(WITH_CURL) && defined(CL_THREAD_SAFE)
pthread_t tid[MAX_URLS];
struct arg args[MAX_URLS];
#endif
tag_arguments_t hrefs;
if(b == NULL)
return;
@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir)
t = tableCreate();
n = 0;
normalised = ptr = mbox_html_normalize(blobGetData(b), len);
memset(&hrefs, '\0', sizeof(hrefs));
cli_dbgmsg("checkURLs: calling html_normalise_mem\n");
html_normalise_mem(blobGetData(b), len, NULL, &hrefs);
cli_dbgmsg("checkURLs: html_normalise_mem returned\n");
if(normalised == NULL) {
/*if(href == NULL) {
blobDestroy(b);
tableDestroy(t);
return;
}
}*/
/* TODO: Do we need to call remove_html_comments? */
/*
* cli_memstr(ptr, len, "<a href=", 8)
* Don't use cli_memstr() until bounds problem sorted
* and it returns the place that the 'needle' was found
*/
while(len >= 8) {
if(strncasecmp(ptr, "<a href=", 8) == 0) {
n = 0;
for(i = 0; i < hrefs.count; i++) {
const char *url = hrefs.value[i];
if(strncasecmp("http://", url, 7) == 0) {
char *ptr;
#ifdef WITH_CURL
#ifndef CL_THREAD_SAFE
struct arg arg;
@ -2242,52 +2176,31 @@ checkURLs(message *m, const char *dir)
struct stat statb;
char cmd[512];
#endif /*WITH_CURL*/
char *p2 = &ptr[8];
char *p3;
char name[512];
len -= 8;
while((len > 0) && ((*p2 == '\"') || isspace(*p2))) {
len--;
p2++;
}
if(len == 0)
break;
ptr = p2;
while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) {
ptr++;
len--;
}
if(len == 0)
break;
*ptr = '\0';
if(strncasecmp(p2, "mailto:", 7) == 0)
continue;
if(*p2 == '\0')
continue;
if(tableFind(t, p2) == 1) {
cli_dbgmsg("URL %s already downloaded\n", p2);
char name[NAME_MAX];
if(tableFind(t, url) == 1) {
cli_dbgmsg("URL %s already downloaded\n", url);
continue;
}
if(n == MAX_URLS) {
cli_warnmsg("Not all URLs will be scanned\n");
break;
}
(void)tableInsert(t, p2, 1);
cli_dbgmsg("Downloading URL %s to be scanned\n", p2);
strncpy(name, p2, sizeof(name));
for(p3 = name; *p3; p3++)
if(*p3 == '/')
*p3 = '_';
(void)tableInsert(t, url, 1);
cli_dbgmsg("Downloading URL %s to be scanned\n", url);
strncpy(name, url, sizeof(name));
for(ptr = name; *ptr; ptr++)
if(*ptr == '/')
*ptr = '_';
#ifdef WITH_CURL
#ifdef CL_THREAD_SAFE
args[n].url = strdup(p2);
args[n].url = strdup(url);
args[n].dir = strdup(dir);
args[n].filename = strdup(name);
pthread_create(&tid[n], NULL, getURL, &args[n]);
#else
arg.url = p2;
arg.url = url;
arg.dir = dir;
arg.filename = name;
getURL(&arg);
@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir)
/*
* TODO: maximum size and timeouts
*/
snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name);
snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name);
cli_dbgmsg("%s\n", cmd);
#ifdef CL_THREAD_SAFE
pthread_mutex_lock(&system_mutex);
@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir)
snprintf(cmd, sizeof(cmd), "%s/%s", dir, name);
if(stat(cmd, &statb) >= 0)
if(statb.st_size == 0) {
cli_warnmsg("URL %s failed to download\n", p2);
cli_warnmsg("URL %s failed to download\n", url);
/*
* Don't bother scanning an empty file
*/
@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir)
#endif
++n;
}
ptr++;
len--;
}
html_tag_arg_free(&hrefs);
blobDestroy(b);
tableDestroy(t);
free(normalised);
#if defined(WITH_CURL) && defined(CL_THREAD_SAFE)
cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);

Loading…
Cancel
Save