extract URLs from mail body (bb #1482).

git-svn: trunk@5014
remotes/push_mirror/0.95
Török Edvin 17 years ago
parent 18b8c7e9ee
commit f2b71eb961
  1. 5
      ChangeLog
  2. 2
      libclamav/htmlnorm.c
  3. 1
      libclamav/htmlnorm.h
  4. 37
      libclamav/mbox.c
  5. 45
      libclamav/phishcheck.c

@ -1,3 +1,8 @@
Thu Apr 2 22:59:30 EEST 2009 (edwin)
-------------------------------------
* libclamav/htmlnorm.c, libclamav/htmlnorm.h, libclamav/mbox.c,
libclamav/phishcheck.c: extract URLs from mail body (bb #1482).
Thu Apr 2 19:30:19 CEST 2009 (tk)
----------------------------------
* libclamav/cab.c: fix compiler warnings (bb#1494)

@ -346,7 +346,7 @@ static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char
}
return;
}
static void html_tag_arg_add(tag_arguments_t *tags,
void html_tag_arg_add(tag_arguments_t *tags,
const char *tag, char *value)
{
int len, i;

@ -40,6 +40,7 @@ int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirnam
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
void html_tag_arg_free(tag_arguments_t *tags);
int html_screnc_decode(int fd, const char *dirname);
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);
#endif

@ -3962,6 +3962,36 @@ hrefs_done(blob *b, tag_arguments_t *hrefs)
html_tag_arg_free(hrefs);
}
/*
 * Extract URLs from static (non-HTML) text.
 *
 * Scans mem[0..len) for the prefixes "http:", "https:" and "ftp:" (matched
 * case-insensitively). Every match is copied up to, but not including, the
 * next space, newline or tab -- or up to the end of the buffer, or up to
 * sizeof(url) - 1 bytes, whichever comes first -- and is handed to
 * html_tag_arg_add() as the value of an "href" tag.
 *
 * mem   - text to scan
 * len   - number of bytes available at mem
 * hrefs - tag list that receives the URLs that were found
 */
static void extract_text_urls(const unsigned char *mem, size_t len, tag_arguments_t *hrefs)
{
    char url[1024];
    size_t off;

    /* The "+ 10" margin keeps the 4-byte read and the look-ahead at
     * mem[off + 4] / mem[off + 5] inside the buffer. */
    for (off = 0; off + 10 < len; off++) {
        size_t url_len;
        /* read 4 bytes at once to check whether this is the start of a URL */
        int32_t proto = cli_readint32(mem + off);

        /* convert to lowercase */
        proto |= 0x20202020;

        if (proto == 0x70747468) {
            /* "http" in little-endian: must be followed by ':' or by "s:".
             * The 's' sits right after "http" (off + 4) and its ':' at
             * off + 5; it is case-folded like the rest of the scheme. */
            if (mem[off + 4] != ':' &&
                !((mem[off + 4] | 0x20) == 's' && mem[off + 5] == ':'))
                continue;
        } else if (proto != 0x3a707466) {
            /* not "ftp:" in little-endian either */
            continue;
        }

        /* The first 4 bytes are already known to belong to the URL; extend
         * it until whitespace, end of input, or until url[] is full. */
        for (url_len = 4; off + url_len < len && url_len < (sizeof(url) - 1); url_len++) {
            unsigned char c = mem[off + url_len];
            /* smart compilers will compile this if into
             * a single bt + jb instruction */
            if (c == ' ' || c == '\n' || c == '\t')
                break;
        }

        memcpy(url, mem + off, url_len);
        url[url_len] = '\0';
        html_tag_arg_add(hrefs, "href", url);

        /* Continue after the URL; the loop's own off++ additionally steps
         * over the byte that terminated it. */
        off += url_len;
    }
}
/*
* This used to be part of checkURLs, split out, because phishingScan needs it
* too, and phishingScan might be used in situations where checkURLs is
@ -3970,6 +4000,7 @@ hrefs_done(blob *b, tag_arguments_t *hrefs)
static blob *
getHrefs(message *m, tag_arguments_t *hrefs)
{
unsigned char *mem;
blob *b = messageToBlob(m, 0);
size_t len;
@ -3995,11 +4026,15 @@ getHrefs(message *m, tag_arguments_t *hrefs)
hrefs->contents = NULL;
cli_dbgmsg("getHrefs: calling html_normalise_mem\n");
if(!html_normalise_mem(blobGetData(b), (off_t)len, NULL, hrefs,m->ctx->dconf)) {
mem = blobGetData(b);
if(!html_normalise_mem(mem, (off_t)len, NULL, hrefs,m->ctx->dconf)) {
blobDestroy(b);
return NULL;
}
cli_dbgmsg("getHrefs: html_normalise_mem returned\n");
if (!hrefs->count && hrefs->scanContents) {
extract_text_urls(mem, len, hrefs);
}
/* TODO: Do we need to call remove_html_comments? */
return b;

@ -146,9 +146,9 @@ static const char src_text[] = "src";
static const char href_text[] = "href";
static const char mailto[] = "mailto:";
static const char mailto_proto[] = "mailto://";
static const char https[]="https://";
static const char http[]="http://";
static const char ftp[] = "ftp://";
static const char https[]="https:";
static const char http[]="http:";
static const char ftp[] = "ftp:";
static const size_t href_text_len = sizeof(href_text);
static const size_t src_text_len = sizeof(src_text);
@ -774,8 +774,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs)
fclose(f);
return 0;
#endif
for(i=0;i<hrefs->count;i++)
if(hrefs->contents[i]) {
for(i=0;i<hrefs->count;i++) {
struct url_check urls;
enum phish_status rc;
urls.flags = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS;
@ -841,10 +840,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs)
break;
}
return cli_found_possibly_unwanted(ctx);
}
else
if(strcmp((char*)hrefs->tag[i],"href"))
cli_dbgmsg("Phishcheck: href with no contents?\n");
}
return CL_CLEAN;
}
@ -1015,33 +1011,34 @@ static int isURL(char* URL, int accept_anyproto)
switch (URL[0]) {
case 'h':
if (strncmp(URL, https, https_len) == 0)
start = URL + https_len;
start = URL + https_len - 1;
else if (strncmp(URL, http, http_len) == 0)
start = URL + http_len;
start = URL + http_len - 1;
break;
case 'f':
if (strncmp(URL, ftp, ftp_len) == 0)
start = URL + ftp_len;
start = URL + ftp_len - 1;
break;
case 'm':
if (strncmp(URL, mailto_proto, mailto_proto_len) == 0)
start = URL + mailto_proto_len;
start = URL + mailto_proto_len - 1;
break;
}
if(start) {
if(start[0] == '\0')
return 0;/* empty URL */
if(start && start[1] == '/' && start[2] == '/') {
/* has a valid protocol, it is a URL */
return 1;
}
start = accept_anyproto ? strchr(URL, ':') : NULL;
start = accept_anyproto ? strchr(URL, ':') : start;
if(start) {
/* validate URI scheme */
if(validate_uri_ialpha(URL, start)) {
if(start[1] == '/' && start[2] == '/')
start += 3; /* skip :// */
else
/* skip :// */
if (start[1] == '/') {
start += 2;
if (*start == '/')
start++;
} else
start++;
}
else
start = URL; /* scheme invalid */
@ -1298,7 +1295,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
/* determine end of hostname */
host_len = strcspn(host_begin, ":/?");
path_begin = host_begin + host_len;
if(host_len < len) {
if(host_len <= len) {
/* url without path, use a single / */
memmove(path_begin + 2, path_begin + 1, len - host_len);
*path_begin++ = '/';
@ -1419,7 +1416,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
int phishy=0, blacklisted=0;
const struct phishcheck* pchk = (const struct phishcheck*) engine->phishcheck;
if(!urls->realLink.data || urls->displayLink.data[0]=='\0')
if(!urls->realLink.data)
return CL_PHISH_CLEAN;
cli_dbgmsg("Phishcheck:Checking url %s->%s\n", urls->realLink.data,
@ -1466,6 +1463,10 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
if (blacklisted)
return blacklisted;
if (urls->displayLink.data[0] == '\0') {
return CL_PHISH_CLEAN;
}
url_check_init(&host_url);
if((rc = url_get_host(urls, &host_url, DOMAIN_DISPLAY, &phishy))) {

Loading…
Cancel
Save