/*
* Normalise HTML text.
* Decode MS Script Encoder protection.
*
* Copyright (C) 2007-2008 Sourcefire, Inc.
*
* Authors: Trog
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include
#ifdef HAVE_UNISTD_H
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#if HAVE_MMAP
#if HAVE_SYS_MMAN_H
#include
#else /* HAVE_SYS_MMAN_H */
#undef HAVE_MMAP
#endif
#endif
#include "others.h"
#include "htmlnorm.h"
/*
 * Object class tags.
 * NOTE(review): declared immediately before blob.h is included, presumably
 * because that header relies on object_type - confirm before moving it.
 */
typedef enum {
INVALIDCLASS, BLOBCLASS
} object_type;
#include "blob.h"
#include "entconv.h"
#include "jsparse/js-norm.h"
/* Capacity, not counting the terminating NUL, of the tag, tag_arg and tag_val
 * scratch buffers declared in cli_html_normalise() */
#define HTML_STR_LENGTH 1024
/* Cap applied by html_tag_contents_append() when it adds data to the blob
 * holding a tag's contents */
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
/*
 * Parser states.
 * cli_html_normalise() keeps two variables of this type: `state`, initialised
 * to HTML_NORM, and `next_state`, initialised to HTML_BAD_STATE.
 * NOTE(review): the JSDECODE states presumably belong to the MS Script Encoder
 * decoding mentioned in the file header and the RFC2397 states to data: URL
 * handling - the transitions are outside this part of the file, confirm there.
 */
typedef enum {
HTML_BAD_STATE,
HTML_NORM,
HTML_COMMENT,
HTML_CHAR_REF,
HTML_ENTITY_REF_DECODE,
HTML_SKIP_WS,
HTML_TRIM_WS,
HTML_TAG,
HTML_TAG_ARG,
HTML_TAG_ARG_VAL,
HTML_TAG_ARG_EQUAL,
HTML_PROCESS_TAG,
HTML_CHAR_REF_DECODE,
HTML_SKIP_LENGTH,
HTML_JSDECODE,
HTML_JSDECODE_LENGTH,
HTML_JSDECODE_DECRYPT,
HTML_SPECIAL_CHAR,
HTML_RFC2397_TYPE,
HTML_RFC2397_INIT,
HTML_RFC2397_DATA,
HTML_RFC2397_FINISH,
HTML_RFC2397_ESC,
HTML_ESCAPE_CHAR
} html_state;
/*
 * Quoting style, stored by cli_html_normalise() in its `quoted` local.
 * NOTE(review): presumably describes how the tag argument value currently
 * being parsed is quoted - confirm against the state machine.
 */
typedef enum {
SINGLE_QUOTED,
DOUBLE_QUOTED,
NOT_QUOTED
} quoted_state;
/* Size in bytes of the staging buffer embedded in file_buff_t */
#define HTML_FILE_BUFF_LEN 8192
/* Buffered output sink filled by html_output_c()/html_output_str() and
 * emptied by html_output_flush() */
typedef struct file_buff_tag {
int fd; /* descriptor the buffered bytes are handed to via cli_writen() */
unsigned char buffer[HTML_FILE_BUFF_LEN]; /* bytes waiting to be written */
int length; /* number of bytes currently held in buffer */
} file_buff_t;
/*
 * Base64 decoding table indexed by byte value (ASCII layout):
 * 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
 * '/' to 63. Every other byte maps to -1.
 */
static const int base64_chars[256] = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
/*
 * 64 entries, each 0, 1 or 2. The leading zeros make these octal literals,
 * which does not change their values.
 * NOTE(review): presumably selects which of the three decrypt_tables rows
 * applies at each position while decoding Script Encoder data - the use is
 * outside this part of the file, confirm there.
 */
int table_order[] = {
00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02
};
/*
 * Three substitution tables of 128 entries each, indexed by a 7-bit value.
 * In every table indices 0x00-0x1F hold their own value except index 0x09,
 * and index 0x40 holds 0xFF.
 * NOTE(review): index 0x0D of the third table holds 0x06 rather than 0x0D,
 * unlike the first two tables - check against the reference tables.
 * NOTE(review): presumably the character maps for MS Script Encoder decoding
 * (see file header); the lookup code is outside this part of the file.
 */
int decrypt_tables[3][128] = {
{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},
{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},
{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}
};
/*
 * Find a whitespace boundary near the end of a chunk.
 *
 * Scans backwards from chunk[len - 1] for a whitespace byte (as defined by
 * isspace()) and returns the length of the prefix that ends with that byte.
 * The byte at index 0 is never used as a boundary: if the scan gets down to
 * it without finding whitespace, the original @len is returned so the
 * caller leaves the chunk untouched. A @len of 0 yields 0.
 */
static inline unsigned int rewind_tospace(const unsigned char* chunk, unsigned int len)
{
    unsigned int count = len;

    /* check the bound before touching the buffer, so chunk[len - 1] is
     * never evaluated with len == 0 (the index would wrap around) */
    while ((len > 1) && !isspace(chunk[len - 1])) {
        len--;
    }
    if (len == 1) {
        return count;
    }
    return len;
}
/* read at most @max_len of data from @m_area or @stream, skipping NULL chars.
 * This used to be called cli_readline, but we don't stop at end-of-line anymore
 *
 * @m_area is used when it is non-NULL, otherwise @stream is read.
 * At most @max_len - 1 bytes are stored, followed by a terminating NUL.
 * When the chunk was cut short by the size limit it is trimmed so that it
 * ends right after its last whitespace byte (if rewind_tospace() finds one)
 * and the source position is moved back accordingly.
 *
 * Returns a buffer obtained from cli_malloc() that the caller must free(),
 * or NULL when the allocation fails, when neither source is usable, or when
 * no more data is available. */
static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
{
    unsigned char *chunk, *start, *ptr, *end;
    unsigned int chunk_len, count;

    chunk = (unsigned char *) cli_malloc(max_len);
    if (!chunk) {
        return NULL;
    }

    /* Try and use the memory buffer first */
    if (m_area) {
        start = ptr = m_area->buffer + m_area->offset;
        end = m_area->buffer + m_area->length;
        if (start >= end) {
            free(chunk);
            return NULL;
        }
        /* maximum we can copy into the buffer,
         * we could have less than max_len bytes available */
        chunk_len = MIN(end-start, max_len-1);
        /* look for NULL chars */
        ptr = memchr(start, 0, chunk_len);
        if(!ptr) {
            /* no NULL chars found, copy all */
            memcpy(chunk, start, chunk_len);
            chunk[chunk_len] = '\0';
            m_area->offset += chunk_len;
            /* point ptr to end of chunk,
             * so we can check and rewind to a space below */
            ptr = start + chunk_len;
        } else {
            /* copy portion that doesn't contain NULL chars */
            chunk_len = ptr - start;
            if(chunk_len < max_len) {
                memcpy(chunk, start, chunk_len);
            } else {
                chunk_len = 0;
                ptr = start;
            }
            /* we have unknown number of NULL chars,
             * copy char-by-char and skip them */
            while((ptr < end) && (chunk_len < max_len-1)) {
                const unsigned char c = *ptr++;
                if(c) {
                    chunk[chunk_len++] = c;
                }
            }
            chunk[chunk_len] = '\0';
            /* we can't use chunk_len to determine how many bytes we read, since
             * we skipped chars */
            m_area->offset = ptr - m_area->buffer;
        }
        if(ptr && ptr < end && !isspace(*ptr)) {
            /* we hit max_len, rewind to a space */
            count = rewind_tospace(chunk, chunk_len);
            if(count < chunk_len) {
                chunk[count] = '\0';
                m_area->offset -= chunk_len - count;
            }
        }
    } else {
        if (!stream) {
            cli_dbgmsg("No HTML stream\n");
            free(chunk);
            return NULL;
        }
        chunk_len = fread(chunk, 1, max_len-1, stream);
        if(!chunk_len || chunk_len > max_len-1) {
            /* EOF, or prevent overflow */
            free(chunk);
            return NULL;
        }
        /* Look for NULL chars */
        ptr = memchr(chunk, 0, chunk_len);
        if(ptr) {
            /* NULL char found */
            /* save buffer limit */
            end = chunk + chunk_len;
            /* start of NULL chars, we will copy non-NULL characters
             * to this position */
            chunk_len = ptr - chunk;
            /* find first non-NULL char */
            while((ptr < end) && !(*ptr)) {
                ptr++;
            }
            /* skip over NULL chars, and move back the rest */
            while((ptr < end) && (chunk_len < max_len-1)) {
                const unsigned char c = *ptr++;
                if(c) {
                    chunk[chunk_len++] = c;
                }
            }
        }
        /* fread() does not terminate the data, so terminate it here
         * whether or not any NULL chars had to be squeezed out */
        chunk[chunk_len] = '\0';
        if(chunk_len == max_len - 1) {
            /* rewind to a space (which includes newline) */
            count = rewind_tospace(chunk, chunk_len);
            if(count < chunk_len) {
                /* seek-back to space. The distance is computed as a positive
                 * unsigned value and negated afterwards: (count - chunk_len)
                 * would wrap to a large positive offset where long is wider
                 * than unsigned int. Only cut the chunk once the stream has
                 * really been repositioned, otherwise the tail would be lost. */
                if(fseek(stream, -(long)(chunk_len - count), SEEK_CUR) == 0) {
                    chunk[count] = '\0';
                }
            }
        }
    }
    return chunk;
}
/*
 * Write the bytes pending in @fbuff to its descriptor and mark the buffer
 * empty. Does nothing for a NULL @fbuff or when nothing is buffered.
 * The result of cli_writen() is not examined.
 */
static void html_output_flush(file_buff_t *fbuff)
{
    if (!fbuff || fbuff->length <= 0) {
        return;
    }
    cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
    fbuff->length = 0;
}
/*
 * Append the single byte @c to @fbuff1, flushing first when the buffer is
 * already full. A NULL @fbuff1 is ignored.
 */
static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)
{
    if (!fbuff1) {
        return;
    }
    if (fbuff1->length == HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff1);
    }
    fbuff1->buffer[fbuff1->length] = c;
    fbuff1->length++;
}
/*
 * Append @len bytes starting at @str to @fbuff.
 * The buffer is flushed first if the new data would fill it. Data that is at
 * least as large as the whole buffer bypasses it and is passed straight to
 * cli_writen(). A NULL @fbuff is ignored.
 */
static void html_output_str(file_buff_t *fbuff, const unsigned char *str, int len)
{
    if (!fbuff) {
        return;
    }
    if (HTML_FILE_BUFF_LEN <= (fbuff->length + len)) {
        html_output_flush(fbuff);
    }
    if (len < HTML_FILE_BUFF_LEN) {
        /* fits: stage it behind whatever is already buffered */
        memcpy(fbuff->buffer + fbuff->length, str, len);
        fbuff->length += len;
        return;
    }
    /* too large to stage: write it out directly */
    html_output_flush(fbuff);
    cli_writen(fbuff->fd, str, len);
}
/*
 * Look up the argument named @tag in @tags (exact strcmp() match).
 * Returns the stored value of the first match, which may itself be NULL, or
 * NULL when no argument has that name. The returned pointer still belongs
 * to @tags.
 */
static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
{
    int idx = 0;

    while (idx < tags->count) {
        if (!strcmp(tags->tag[idx], tag)) {
            return tags->value[idx];
        }
        idx++;
    }
    return NULL;
}
/*
 * Replace the value of the first argument in @tags whose name equals @tag
 * with a fresh copy of @value; the previous value is freed.
 * Nothing happens when no argument has that name.
 */
static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        if (strcmp(tags->tag[idx], tag) != 0) {
            continue;
        }
        free(tags->value[idx]);
        tags->value[idx] = cli_strdup(value);
        break;
    }
}
/*
 * Append one argument to @tags.
 *
 * Grows the tag and value arrays (and the contents array when
 * tags->scanContents is set) by one slot, then stores a copy of @tag and,
 * when @value is non-NULL, a copy of @value. A value that starts with a
 * double quote is stored without that quote and without its last character.
 *
 * If growing any array fails, every argument already stored is released
 * and @tags is reset to empty.
 * NOTE(review): a failed copy of @tag leaves a NULL name in the new slot;
 * code that strcmp()s or strlen()s the names should be checked for that.
 */
static void html_tag_arg_add(tag_arguments_t *tags,
        const unsigned char *tag, unsigned char *value)
{
    int i;
    size_t len;

    tags->count++;
    tags->tag = (unsigned char **) cli_realloc2(tags->tag,
            tags->count * sizeof(char *));
    if (!tags->tag) {
        goto abort;
    }
    tags->value = (unsigned char **) cli_realloc2(tags->value,
            tags->count * sizeof(char *));
    if (!tags->value) {
        goto abort;
    }
    if(tags->scanContents) {
        tags->contents= (blob **) cli_realloc2(tags->contents,
                tags->count*sizeof(*tags->contents));
        if(!tags->contents) {
            goto abort;
        }
        tags->contents[tags->count-1]=NULL;
    }
    tags->tag[tags->count-1] = cli_strdup(tag);
    if (value) {
        if (*value == '"') {
            tags->value[tags->count-1] = cli_strdup(value+1);
            len = strlen(value+1);
            /* drop the final character of the copy; the copy can be NULL
             * when the duplication failed, so it must be checked first */
            if ((len > 0) && tags->value[tags->count-1]) {
                tags->value[tags->count-1][len-1] = '\0';
            }
        } else {
            tags->value[tags->count-1] = cli_strdup(value);
        }
    } else {
        tags->value[tags->count-1] = NULL;
    }
    return;

abort:
    /* Bad error - can't do 100% recovery */
    tags->count--;
    for (i=0; i < tags->count; i++) {
        if (tags->tag) {
            free(tags->tag[i]);
        }
        if (tags->value) {
            free(tags->value[i]);
        }
        if(tags->contents) {
            if(tags->contents[i])
                blobDestroy(tags->contents[i]);
        }
    }
    if (tags->tag) {
        free(tags->tag);
    }
    if (tags->value) {
        free(tags->value);
    }
    if (tags->contents)
        free(tags->contents);
    tags->contents=NULL;
    tags->tag = tags->value = NULL;
    tags->count = 0;
    return;
}
/*
 * Write a tag to @fbuff in the form <tag name="value" name ...>.
 *
 * Each argument of @tags is preceded by a single space. Arguments whose
 * value is NULL are written as the bare name; otherwise the value is written
 * between double quotes.
 * NOTE(review): the per-character loop over the value was damaged in this
 * copy of the file and has been restored; the lower-casing of the value via
 * tolower() should be confirmed against a pristine copy.
 */
static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
{
    int i, j, len;

    html_output_c(fbuff, '<');
    html_output_str(fbuff, tag, strlen(tag));
    for (i=0; i < tags->count; i++) {
        html_output_c(fbuff, ' ');
        html_output_str(fbuff, tags->tag[i], strlen(tags->tag[i]));
        if (tags->value[i]) {
            html_output_str(fbuff, "=\"", 2);
            len = strlen(tags->value[i]);
            for (j=0 ; j<len ; j++) {
                html_output_c(fbuff, tolower(tags->value[i][j]));
            }
            html_output_c(fbuff, '"');
        }
    }
    html_output_c(fbuff, '>');
}
/*
 * Release everything owned by @tags: each stored name and value, each
 * contents blob (when a contents array exists), and the arrays themselves.
 * Afterwards @tags is empty: count is 0 and all three array pointers are NULL.
 */
void html_tag_arg_free(tag_arguments_t *tags)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        free(tags->tag[idx]);
        /* free(NULL) is a no-op, so a missing value needs no special case */
        free(tags->value[idx]);
        if (tags->contents && tags->contents[idx]) {
            blobDestroy(tags->contents[idx]);
        }
    }
    free(tags->tag);
    free(tags->value);
    free(tags->contents);
    tags->contents = NULL;
    tags->tag = tags->value = NULL;
    tags->count = 0;
}
/**
 * this is used for img, and iframe tags. If they are inside an <a href> tag,
 * then set the contents of the image|iframe to the real URL.
 *
 * A new blob is created in contents slot @idx (1-based) and filled with the
 * value stored in slot @in_ahref (1-based), followed by a NUL byte; the blob
 * is then closed.
 */
static inline void html_tag_set_inahref(tag_arguments_t *tags,int idx,int in_ahref)
{
    const int dst = idx - 1;
    const int src = in_ahref - 1;

    tags->contents[dst] = blobCreate();
    blobAddData(tags->contents[dst], tags->value[src], strlen(tags->value[src]));
    /* terminating NUL */
    blobAddData(tags->contents[dst], "", 1);
    blobClose(tags->contents[dst]);
}
/**
 * the displayed text for an <a href> tag
 *
 * Appends the bytes in [@begin, @end) to the contents blob in slot @idx
 * (1-based), adding no more than the room left below
 * MAX_TAG_CONTENTS_LENGTH. Nothing is added when @end is NULL, when the
 * range is empty, or when the blob has already reached the cap.
 * NOTE(review): the opening of this function was damaged in this copy of the
 * file and has been restored; the blob size getter is assumed to be
 * blobGetDataSize() from blob.h - confirm against a pristine copy.
 */
static inline void html_tag_contents_append(tag_arguments_t *tags,int idx,const unsigned char* begin,const unsigned char *end)
{
    if(end && (begin<end)) {
        const size_t blob_len = blobGetDataSize(tags->contents[idx-1]);
        const size_t blob_sizeleft = blob_len <= MAX_TAG_CONTENTS_LENGTH ? (MAX_TAG_CONTENTS_LENGTH - blob_len) : 0;
        const size_t str_len = end - begin;
        if(blob_sizeleft)
            blobAddData(tags->contents[idx-1],begin, blob_sizeleft < str_len ? blob_sizeleft : str_len );
    }
}
/*
 * Finish the contents blob in slot @idx (1-based): add a terminating NUL
 * byte to it and close it.
 */
static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
{
    const int slot = idx - 1;

    blobAddData(tags->contents[slot], "", 1);
    blobClose(tags->contents[slot]);
}
static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
{
int fd_tmp, tag_length, tag_arg_length, binary;
int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE, text_space_written=FALSE;
FILE *stream_in = NULL;
html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
char tag_val[HTML_STR_LENGTH+1], *tmp_file;
unsigned char *line, *ptr, *arg_value;
tag_arguments_t tag_args;
quoted_state quoted;
unsigned long length;
file_buff_t *file_buff_o2, *file_buff_text;
file_buff_t *file_tmp_o1;
int in_ahref=0;/* index of tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of */
unsigned char* href_contents_begin=NULL;/*beginning of the next portion of contents*/
unsigned char* ptrend=NULL;/*end of contents*/
unsigned char* in_form_action = NULL;/* the action URL of the current