Phishing merge - htmlnorm

mbox.c pending...


git-svn: trunk@2246
remotes/push_mirror/metadata
aCaB 19 years ago
parent 67118e923d
commit 333d724dcf
  1. 6
      clamav-devel/ChangeLog
  2. 239
      clamav-devel/libclamav/htmlnorm.c

@ -1,3 +1,9 @@
Wed Sep 13 00:03:30 CEST 2006 (acab)
------------------------------------
* libclamav: Merge of the html normaliser part of the phishing module from
Torok Edvin <edwintorok*gmail.com>
Part of the Google Summer of Code program
Tue Sep 12 22:52:14 CEST 2006 (tk)
----------------------------------
* clamd: apply w32 patches from NJH

@ -47,7 +47,12 @@
#include "others.h"
#include "htmlnorm.h"
#ifdef CL_EXPERIMENTAL
#include "mbox.h"
#endif
#define HTML_STR_LENGTH 1024
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
typedef enum {
HTML_BAD_STATE,
@ -307,6 +312,16 @@ static void html_tag_arg_add(tag_arguments_t *tags,
if (!tags->value) {
goto abort;
}
#ifdef CL_EXPERIMENTAL
if(tags->scanContents) {
tags->contents= (blob **) cli_realloc(tags->contents,
tags->count*sizeof(*tags->contents));
if(!tags->contents) {
goto abort;
}
tags->contents[tags->count-1]=NULL;
}
#endif
tags->tag[tags->count-1] = strdup(tag);
if (value) {
if (*value == '"') {
@ -333,6 +348,12 @@ abort:
if (tags->value) {
free(tags->value[i]);
}
#ifdef CL_EXPERIMENTAL
if(tags->contents) {
if(tags->contents[i])
blobDestroy(tags->contents[i]);
}
#endif
}
if (tags->tag) {
free(tags->tag);
@ -340,6 +361,11 @@ abort:
if (tags->value) {
free(tags->value);
}
#ifdef CL_EXPERIMENTAL
if (tags->contents)
free(tags->contents);
tags->contents=NULL;
#endif
tags->tag = tags->value = NULL;
tags->count = 0;
return;
@ -375,6 +401,11 @@ void html_tag_arg_free(tag_arguments_t *tags)
if (tags->value[i]) {
free(tags->value[i]);
}
#ifdef CL_EXPERIMENTAL
if(tags->contents)
if (tags->contents[i])
blobDestroy(tags->contents[i]);
#endif
}
if (tags->tag) {
free(tags->tag);
@ -382,14 +413,59 @@ void html_tag_arg_free(tag_arguments_t *tags)
if (tags->value) {
free(tags->value);
}
#ifdef CL_EXPERIMENTAL
if(tags->contents)
free(tags->contents);
tags->contents = NULL;
#endif
tags->tag = tags->value = NULL;
tags->count = 0;
}
#ifdef CL_EXPERIMENTAL
/**
* this is used for img, and iframe tags. If they are inside an <a href> tag, then set the contents of the image|iframe to the real URL.
*/
static inline void html_tag_set_inahref(tag_arguments_t *tags,int idx,int in_ahref)
{
char x[]="";
tags->contents[idx-1]=blobCreate();
blobAddData(tags->contents[idx-1],tags->value[in_ahref-1],strlen(tags->value[in_ahref-1]));
blobAddData(tags->contents[idx-1],x,1);
blobClose(tags->contents[idx-1]);
}
/**
* the displayed text for an <a href> tag
*/
static inline void html_tag_contents_append(tag_arguments_t *tags,int idx,const unsigned char* begin,const unsigned char *end)
{
if(begin<end) {
blobAddData(tags->contents[idx-1],begin,end-begin);
}
}
static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
{
char x[]="";
blobAddData(tags->contents[idx-1],x,1);/*append NULL character*/
blobClose(tags->contents[idx-1]);
}
static inline void html_tag_contents_length_check(tag_arguments_t *tags,int* idx)
{
if (blobGetDataSize(tags->contents[*idx-1])>MAX_TAG_CONTENTS_LENGTH) {
html_tag_contents_done(tags,*idx);
*idx=0;/*in_ahref=0;*/
}
}
#endif
static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs)
{
int fd_tmp, tag_length, tag_arg_length, binary;
int retval=FALSE, escape, value, hex, tag_val_length, table_pos, in_script=FALSE;
int retval=FALSE, escape, value, hex, tag_val_length=0, table_pos, in_script=FALSE;
FILE *stream_in;
html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
@ -400,6 +476,13 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
unsigned long length;
file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script;
file_buff_t *file_tmp_o1;
#ifdef CL_EXPERIMENTAL
int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/
unsigned char* ptrend=NULL;/*end of <a> contents*/
unsigned char* in_form_action = NULL;/* the action URL of the current <form> tag, if any*/
tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
#endif
if (!m_area) {
if (fd < 0) {
@ -421,6 +504,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
tag_args.count = 0;
tag_args.tag = NULL;
tag_args.value = NULL;
#ifdef CL_EXPERIMENTAL
tag_args.contents = NULL;
#endif
if (dirname) {
snprintf(filename, 1024, "%s/rfc2397", dirname);
@ -498,6 +584,10 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
ptr = line = cli_readline(stream_in, m_area, 8192);
while (line) {
#ifdef CL_EXPERIMENTAL
if(href_contents_begin)
href_contents_begin=ptr;/*start of a new line, last line already appended to contents see below*/
#endif
while (*ptr && isspace(*ptr)) {
ptr++;
}
@ -542,10 +632,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
break;
case HTML_NORM:
if (*ptr == '<') {
#ifdef CL_EXPERIMENTAL
ptrend=ptr-1; /* for use by scanContents */
#endif
html_output_c(file_buff_o1, file_buff_o2, '<');
if (in_script) {
html_output_c(file_buff_script, NULL, '<');
}
#ifdef CL_EXPERIMENTAL
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
/*append this text portion to the contents of <a>*/
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
html_tag_contents_length_check(hrefs,&in_ahref);
href_contents_begin=NULL;/*We just encountered another tag inside <a>, so skip it*/
}
#endif
ptr++;
state = HTML_SKIP_WS;
tag_length=0;
@ -815,6 +916,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
in_script=FALSE;
html_output_c(file_buff_script, NULL, '\n');
}
#ifdef CL_EXPERIMENTAL
if (hrefs && hrefs->scanContents && in_ahref) {
if(strcmp(tag,"/a") == 0) {
html_tag_contents_done(hrefs,in_ahref);
in_ahref=0;/* we are no longer inside an <a href>
nesting <a> tags not supported, and shouldn't be supported*/
}
href_contents_begin=ptr;
}
if (strcmp(tag, "/form") == 0) {
if (in_form_action)
free(in_form_action);
in_form_action = NULL;
}
#endif
} else if (strcmp(tag, "script") == 0) {
arg_value = html_tag_arg_value(&tag_args, "language");
if (arg_value && (strcasecmp(arg_value, "jscript.encode") == 0)) {
@ -830,26 +946,130 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
}
html_output_tag(file_buff_script, tag, &tag_args);
} else if (hrefs) {
#ifdef CL_EXPERIMENTAL
if(in_ahref && !href_contents_begin)
href_contents_begin=ptr;
#endif
if (strcmp(tag, "a") == 0) {
arg_value = html_tag_arg_value(&tag_args, "href");
if (arg_value && strlen(arg_value) > 0) {
#ifdef CL_EXPERIMENTAL
if (hrefs->scanContents) {
const unsigned char* arg_value_title = html_tag_arg_value(&tag_args,"title");
/*beginning of an <a> tag*/
if (in_ahref)
/*we encountered nested <a> tags, pretend previous closed*/
if (href_contents_begin) {
html_tag_contents_append(hrefs,in_ahref,
href_contents_begin,ptrend);
/*add pending contents between tags*/
html_tag_contents_done(hrefs,in_ahref);
in_ahref=0;
}
if (arg_value_title) {
/* title is a 'displayed link'*/
html_tag_arg_add(hrefs,"href_title",arg_value_title);
hrefs->contents[hrefs->count-1]=blobCreate();
html_tag_contents_append(hrefs,hrefs->count,arg_value,
arg_value+strlen(arg_value));
html_tag_contents_done(hrefs,hrefs->count);
}
if (in_form_action) {
/* form action is the real URL, and href is the 'displayed' */
html_tag_arg_add(hrefs,"form",arg_value);
hrefs->contents[hrefs->count-1] = blobCreate();
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
in_form_action + strlen(in_form_action));
html_tag_contents_done(hrefs,hrefs->count);
}
}
#endif
html_tag_arg_add(hrefs, "href", arg_value);
#ifdef CL_EXPERIMENTAL
if (hrefs->scanContents) {
in_ahref=hrefs->count; /* index of this tag (counted from 1) */
href_contents_begin=ptr;/* contents begin after <a ..> ends */
hrefs->contents[hrefs->count-1]=blobCreate();
}
#endif
}
#ifdef CL_EXPERIMENTAL
} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action");
if (arg_action_value)
in_form_action = strdup(arg_action_value);
#endif
} else if (strcmp(tag, "img") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
if (arg_value && strlen(arg_value) > 0) {
html_tag_arg_add(hrefs, "src", arg_value);
#ifdef CL_EXPERIMENTAL
if(hrefs->scanContents && in_ahref)
/* "contents" of an img tag, is the URL of its parent <a> tag */
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
if (in_form_action) {
/* form action is the real URL, and href is the 'displayed' */
html_tag_arg_add(hrefs,"form",arg_value);
hrefs->contents[hrefs->count-1] = blobCreate();
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
in_form_action + strlen(in_form_action));
html_tag_contents_done(hrefs,hrefs->count);
}
#endif
}
arg_value = html_tag_arg_value(&tag_args, "dynsrc");
if (arg_value && strlen(arg_value) > 0) {
html_tag_arg_add(hrefs, "dynsrc", arg_value);
#ifdef CL_EXPERIMENTAL
if(hrefs->scanContents && in_ahref)
/* see above */
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
if (in_form_action) {
/* form action is the real URL, and href is the 'displayed' */
html_tag_arg_add(hrefs,"form",arg_value);
hrefs->contents[hrefs->count-1] = blobCreate();
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
in_form_action + strlen(in_form_action));
html_tag_contents_done(hrefs,hrefs->count);
}
#endif
}
} else if (strcmp(tag, "iframe") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
if (arg_value && strlen(arg_value) > 0) {
html_tag_arg_add(hrefs, "iframe", arg_value);
#ifdef CL_EXPERIMENTAL
if(hrefs->scanContents && in_ahref)
/* see above */
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
if (in_form_action) {
/* form action is the real URL, and href is the 'displayed' */
html_tag_arg_add(hrefs,"form",arg_value);
hrefs->contents[hrefs->count-1] = blobCreate();
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
in_form_action + strlen(in_form_action));
html_tag_contents_done(hrefs,hrefs->count);
}
}
} else if (strcmp(tag,"area") == 0) {
arg_value = html_tag_arg_value(&tag_args,"href");
if (arg_value && strlen(arg_value) > 0) {
html_tag_arg_add(hrefs, "area", arg_value);
if(hrefs->scanContents && in_ahref)
/* see above */
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
if (in_form_action) {
/* form action is the real URL, and href is the 'displayed' */
html_tag_arg_add(hrefs,"form",arg_value);
hrefs->contents[hrefs->count-1] = blobCreate();
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
in_form_action + strlen(in_form_action));
html_tag_contents_done(hrefs,hrefs->count);
}
#endif
}
}
/* TODO:imagemaps can have urls too */
}
html_tag_arg_free(&tag_args);
break;
@ -871,6 +1091,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
ptr++;
} else if (*ptr == ';') {
html_output_c(file_buff_o1, file_buff_o2, value);
#ifdef CL_EXPERIMENTAL
if (tag_val_length < HTML_STR_LENGTH) {
tag_val[tag_val_length++] = value; /* store encoded values too */
}
#endif
state = next_state;
next_state = HTML_BAD_STATE;
ptr++;
@ -1155,12 +1380,24 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
break;
}
}
#ifdef CL_EXPERIMENTAL
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
/* end of line, append contents now, resume on next line */
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr-1);
ptrend = NULL;
#endif
free(line);
ptr = line = cli_readline(stream_in, m_area, 8192);
}
retval = TRUE;
abort:
#ifdef CL_EXPERIMENTAL
if (in_form_action)
free(in_form_action);
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs,in_ahref);
#endif
html_tag_arg_free(&tag_args);
if (!m_area) {
fclose(stream_in);

Loading…
Cancel
Save