|
|
|
/*
|
|
|
|
* Normalise HTML text.
|
|
|
|
* Decode MS Script Encoder protection.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2004 trog@uncon.org
|
|
|
|
*
|
|
|
|
* The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
|
|
* MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if HAVE_CONFIG_H
|
|
|
|
#include "clamav-config.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
|
|
|
|
#if HAVE_MMAP
|
|
|
|
#if HAVE_SYS_MMAN_H
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#else /* HAVE_SYS_MMAN_H */
|
|
|
|
#undef HAVE_MMAP
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "others.h"
|
|
|
|
#include "htmlnorm.h"
|
|
|
|
|
|
|
|
/*
 * Object class tags.  Declared ahead of the blob.h include that follows;
 * presumably blob.h relies on this type -- confirm against that header.
 */
typedef enum {
    INVALIDCLASS = 0,
    BLOBCLASS    = 1
} object_type;
|
|
|
|
#include "blob.h"
|
|
|
|
|
|
|
|
#include "entconv.h"
|
|
|
|
|
|
|
|
/* Capacity (not counting the terminating NUL) of the tag name, attribute
 * name and attribute value scratch buffers used by the normaliser. */
#define HTML_STR_LENGTH 1024
|
|
|
|
/* Once the collected contents of an <a> tag grow beyond this many bytes the
 * contents blob is closed and collection stops. */
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
|
|
|
|
|
|
|
|
/*
 * States of the HTML normaliser's character-driven state machine.
 * The numeric values are the ones the compiler would assign implicitly;
 * they are spelled out only to make debugging output easier to read.
 */
typedef enum {
    HTML_BAD_STATE          = 0,  /* engine error: processing is aborted */
    HTML_NORM               = 1,  /* plain text outside of any tag */
    HTML_COMMENT            = 2,  /* inside <! ... >, copied until '>' */
    HTML_CHAR_REF           = 3,  /* just consumed '&' */
    HTML_ENTITY_REF_DECODE  = 4,
    HTML_SKIP_WS            = 5,  /* drop whitespace, then go to next_state */
    HTML_TRIM_WS            = 6,  /* collapse whitespace to a single space */
    HTML_TAG                = 7,  /* reading the tag name */
    HTML_TAG_ARG            = 8,  /* reading an attribute name */
    HTML_TAG_ARG_VAL        = 9,  /* reading an attribute value */
    HTML_TAG_ARG_EQUAL      = 10, /* after an attribute name, expecting '=' */
    HTML_PROCESS_TAG        = 11, /* complete tag collected, act on it */
    HTML_CHAR_REF_DECODE    = 12,
    HTML_SKIP_LENGTH        = 13, /* skip a fixed number of input bytes */
    HTML_JSDECODE           = 14,
    HTML_JSDECODE_LENGTH    = 15,
    HTML_JSDECODE_DECRYPT   = 16,
    HTML_SPECIAL_CHAR       = 17, /* never expected as the current state */
    HTML_RFC2397_TYPE       = 18,
    HTML_RFC2397_INIT       = 19,
    HTML_RFC2397_DATA       = 20,
    HTML_RFC2397_FINISH     = 21,
    HTML_RFC2397_ESC        = 22,
    HTML_ESCAPE_CHAR        = 23
} html_state;
|
|
|
|
|
|
|
|
/* How the attribute value currently being read was opened. */
typedef enum {
    SINGLE_QUOTED = 0,  /* value started with ' */
    DOUBLE_QUOTED = 1,  /* value started with " */
    NOT_QUOTED    = 2   /* bare value, ends at whitespace or '>' */
} quoted_state;
|
|
|
|
|
|
|
|
|
|
|
|
/* Number of bytes staged in memory before an output file is written to. */
#define HTML_FILE_BUFF_LEN 8192

/* An output file descriptor together with its write-behind staging buffer. */
typedef struct file_buff_tag {
    int           fd;                          /* destination descriptor */
    unsigned char buffer[HTML_FILE_BUFF_LEN];  /* bytes not yet written */
    int           length;                      /* count of valid bytes in buffer */
} file_buff_t;
|
|
|
|
|
|
|
|
/*
 * Base64 decode table indexed by byte value: yields the 6-bit value of the
 * character ('A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61,
 * '+' -> 62, '/' -> 63) or -1 for any byte outside the alphabet.
 */
static const int base64_chars[256] = {
    /* 0x00 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0x10 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0x20 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    /* 0x30 */ 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    /* 0x40 */ -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    /* 0x50 */ 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    /* 0x60 */ -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    /* 0x70 */ 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
    /* 0x80 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0x90 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xA0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xB0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xC0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xD0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xE0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    /* 0xF0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
};
|
|
|
|
|
|
|
|
/*
 * 64-entry sequence of table numbers (each 0, 1 or 2).
 * NOTE(review): presumably selects which row of decrypt_tables applies at
 * each position of the encoded script stream -- the consumer is not in this
 * part of the file, confirm there.
 */
int table_order[] = {
    0, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2, 0, 1, 2, 1,
    0, 1, 2, 1, 0, 0, 2, 1, 1, 2, 0, 1, 2, 1, 1, 2,
    0, 0, 1, 2, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 1, 2,
    0, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 2, 1, 0, 1, 2
};
|
|
|
|
|
|
|
|
/*
 * Three 128-entry byte substitution tables, indexed by [table][encoded byte].
 * Entries 0x00-0x1F map to themselves except 0x09 (and 0x0D in table 2);
 * entry 0x40 is 0xFF in every table.
 * NOTE(review): table 2 maps 0x0D to 0x06 whereas tables 0 and 1 map it to
 * 0x0D -- confirm this is intentional and not a transcription slip.
 */
int decrypt_tables[3][128] = {
    /* table 0 */
    {
        /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        /* 0x20 */ 0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
        /* 0x30 */ 0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
        /* 0x40 */ 0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
        /* 0x50 */ 0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
        /* 0x60 */ 0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
        /* 0x70 */ 0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36
    },

    /* table 1 */
    {
        /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        /* 0x20 */ 0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
        /* 0x30 */ 0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
        /* 0x40 */ 0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
        /* 0x50 */ 0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
        /* 0x60 */ 0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
        /* 0x70 */ 0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57
    },

    /* table 2 */
    {
        /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
        /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        /* 0x20 */ 0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
        /* 0x30 */ 0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
        /* 0x40 */ 0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
        /* 0x50 */ 0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
        /* 0x60 */ 0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
        /* 0x70 */ 0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65
    }
};
|
|
|
|
|
|
|
|
/*
 * Read the next line, or for over-long lines the next chunk, of input.
 *
 * stream   input stream, used only when m_area is NULL
 * m_area   in-memory input; when non-NULL the data is taken from
 *          m_area->buffer starting at m_area->offset, and the offset is
 *          advanced past the bytes returned
 * max_len  size of the buffer to allocate; at most max_len-1 bytes are
 *          returned
 *
 * A line that does not fit is cut after the last whitespace character
 * inside the window so the remainder is delivered by the next call; if the
 * window contains no usable whitespace it is returned whole.  A trailing
 * '\n' is kept in the returned data.
 *
 * Returns a NUL-terminated buffer obtained from cli_malloc() which the
 * caller must free(), or NULL on allocation failure, when max_len is too
 * small to hold any data, or when there is no more input.
 */
unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len)
{
	unsigned char *line, *ptr, *start, *end;
	unsigned int line_len, count;
	long seek_back;

	/* A buffer of fewer than two bytes cannot hold one data byte plus the
	 * terminator; the memory branch below would write out of bounds. */
	if (max_len < 2) {
		return NULL;
	}

	line = (unsigned char *) cli_malloc(max_len);
	if (!line) {
		return NULL;
	}

	/* Try and use the memory buffer first */
	if (m_area) {
		start = ptr = m_area->buffer + m_area->offset;
		end = m_area->buffer + m_area->length;
		if (start >= end) {
			free(line);
			return NULL;
		}

		/* line_len runs one ahead of ptr so that, when the scan stops on
		 * a '\n', the newline itself is included in the copy. */
		line_len = 1;
		while ((ptr < end) && (*ptr != '\n') && (line_len < (max_len-1))) {
			ptr++;
			line_len++;
		}

		if (ptr == end) {
			/* Ran out of data: there is no byte at ptr to include */
			line_len--;
		} else if (*ptr != '\n') {
			/* Hit max_len: back up to the nearest whitespace, keeping
			 * the full window if none is found */
			count = line_len;
			while (!isspace(*ptr) && (line_len > 1)) {
				ptr--;
				line_len--;
			}
			if (line_len == 1) {
				line_len = count;
			}
		}
		memcpy(line, start, line_len);
		line[line_len] = '\0';
		m_area->offset += line_len;
		return line;
	}

	if (!stream) {
		cli_dbgmsg("No HTML stream\n");
		free(line);
		return NULL;
	}
	if (fgets((char *) line, (int) max_len, stream) == NULL) {
		free(line);
		return NULL;
	}

	line_len = strlen((const char *) line);
	if (line_len == 0) {
		free(line);
		return NULL;
	}
	if (line_len == max_len-1) {
		/* didn't find a whole line - rewind to a space.
		 * The distance is kept in a signed long: fseek() takes a long
		 * offset, and a wrapped-around unsigned value would turn the
		 * intended backward seek into a huge forward one wherever long
		 * is wider than unsigned int. */
		seek_back = 0;
		while (!isspace(line[--line_len])) {
			seek_back--;
			if (line_len == 0) {
				/* no whitespace at all: hand back the whole chunk */
				return line;
			}
		}
		fseek(stream, seek_back, SEEK_CUR);
		line[line_len+1] = '\0';
	}
	return line;
}
|
|
|
|
|
|
|
|
/*
 * Write whatever is staged in fbuff to its file descriptor and mark the
 * buffer empty.  Does nothing for a NULL buffer or when nothing is staged.
 * The result of cli_writen() is not examined.
 */
static void html_output_flush(file_buff_t *fbuff)
{
	if (!fbuff || fbuff->length <= 0) {
		return;
	}
	cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
	fbuff->length = 0;
}
|
|
|
|
|
|
|
|
/*
 * Append the single byte c to each of the two output buffers.
 * Either buffer may be NULL, in which case it is skipped.  A buffer that
 * is already full is flushed before the byte is stored.
 */
static void html_output_c(file_buff_t *fbuff1, file_buff_t *fbuff2, unsigned char c)
{
	file_buff_t *targets[2];
	int k;

	targets[0] = fbuff1;
	targets[1] = fbuff2;

	for (k = 0; k < 2; k++) {
		file_buff_t *out = targets[k];

		if (!out) {
			continue;
		}
		if (out->length == HTML_FILE_BUFF_LEN) {
			html_output_flush(out);
		}
		out->buffer[out->length++] = c;
	}
}
|
|
|
|
|
|
|
|
/*
 * Append len bytes starting at str to the output buffer fbuff.
 * A NULL fbuff is ignored.  The buffer is flushed first when the new data
 * would fill it; data at least as large as the whole buffer bypasses the
 * buffer and is written straight to the file descriptor.
 */
static void html_output_str(file_buff_t *fbuff, const unsigned char *str, int len)
{
	if (!fbuff) {
		return;
	}

	/* Make room if the staged bytes plus the new ones reach capacity */
	if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
		html_output_flush(fbuff);
	}

	if (len < HTML_FILE_BUFF_LEN) {
		memcpy(fbuff->buffer + fbuff->length, str, len);
		fbuff->length += len;
		return;
	}

	/* Too big to stage: write it out directly */
	html_output_flush(fbuff);
	cli_writen(fbuff->fd, str, len);
}
|
|
|
|
|
|
|
|
/*
 * Look up the attribute named tag and return its stored value.
 * Returns the value of the first exact (case-sensitive) name match, which
 * may itself be NULL for an attribute without a value, or NULL when no
 * attribute of that name exists.  The returned string is owned by tags.
 */
static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
{
	int idx = 0;

	while (idx < tags->count) {
		if (!strcmp(tags->tag[idx], tag)) {
			return tags->value[idx];
		}
		idx++;
	}
	return NULL;
}
|
|
|
|
|
|
|
|
/*
 * Replace the value of the first attribute named tag with a copy of value.
 * The previous value is freed.  Nothing happens when no attribute of that
 * name exists.
 */
static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
{
	int idx;

	for (idx = 0; idx < tags->count; idx++) {
		if (strcmp(tags->tag[idx], tag) != 0) {
			continue;
		}
		free(tags->value[idx]);
		tags->value[idx] = cli_strdup(value);
		break;
	}
}
|
|
|
|
/*
 * Append an attribute (name plus optional value) to tags.
 *
 * tags   attribute list to extend
 * tag    attribute name; a copy is stored
 * value  attribute value or NULL.  A copy is stored; when the value starts
 *        with a double quote, that quote and the last character of the
 *        value are stripped from the stored copy.
 *
 * When tags->scanContents is set, the contents array is grown in step and
 * the new slot is initialised to NULL.
 *
 * Failure handling:
 *  - if growing any of the arrays fails, every stored entry is released
 *    and the list is reset to empty;
 *  - if the name cannot be duplicated the entry is not added and the list
 *    is left as it was;
 *  - if the value cannot be duplicated the entry is kept without a value.
 * (Assumes cli_strdup() returns NULL on failure -- confirm in others.h.)
 */
static void html_tag_arg_add(tag_arguments_t *tags,
		const unsigned char *tag, unsigned char *value)
{
	int len, i;

	tags->count++;
	tags->tag = (unsigned char **) cli_realloc2(tags->tag,
				tags->count * sizeof(char *));
	if (!tags->tag) {
		goto abort;
	}
	tags->value = (unsigned char **) cli_realloc2(tags->value,
				tags->count * sizeof(char *));
	if (!tags->value) {
		goto abort;
	}
	if(tags->scanContents) {
		tags->contents= (blob **) cli_realloc2(tags->contents,
				tags->count*sizeof(*tags->contents));
		if(!tags->contents) {
			goto abort;
		}
		tags->contents[tags->count-1]=NULL;
	}

	tags->tag[tags->count-1] = cli_strdup(tag);
	if (!tags->tag[tags->count-1]) {
		/* Never expose an entry with a NULL name: the lookup and output
		 * helpers pass names straight to strcmp()/strlen(). */
		tags->count--;
		return;
	}

	if (value) {
		if (*value == '"') {
			tags->value[tags->count-1] = cli_strdup(value+1);
			len = strlen(value+1);
			/* Drop the closing quote, but only if the copy exists */
			if (tags->value[tags->count-1] && (len > 0)) {
				tags->value[tags->count-1][len-1] = '\0';
			}
		} else {
			tags->value[tags->count-1] = cli_strdup(value);
		}
	} else {
		tags->value[tags->count-1] = NULL;
	}
	return;

abort:
	/* Bad error - can't do 100% recovery */
	tags->count--;
	for (i=0; i < tags->count; i++) {
		if (tags->tag) {
			free(tags->tag[i]);
		}
		if (tags->value) {
			free(tags->value[i]);
		}
		if(tags->contents) {
			if(tags->contents[i])
				blobDestroy(tags->contents[i]);
		}
	}
	if (tags->tag) {
		free(tags->tag);
	}
	if (tags->value) {
		free(tags->value);
	}
	if (tags->contents)
		free(tags->contents);
	tags->contents=NULL;
	tags->tag = tags->value = NULL;
	tags->count = 0;
	return;
}
|
|
|
|
|
|
|
|
/*
 * Emit a normalised start tag to fbuff: '<', the tag name, then each
 * attribute as " name" or " name="value"" with the value lower-cased,
 * and finally '>'.
 */
static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
{
	int arg, pos, value_len;

	html_output_c(fbuff, NULL, '<');
	html_output_str(fbuff, tag, strlen(tag));

	for (arg = 0; arg < tags->count; arg++) {
		html_output_c(fbuff, NULL, ' ');
		html_output_str(fbuff, tags->tag[arg], strlen(tags->tag[arg]));

		/* Attribute without a value: the name alone is enough */
		if (!tags->value[arg]) {
			continue;
		}

		html_output_str(fbuff, "=\"", 2);
		value_len = strlen(tags->value[arg]);
		pos = 0;
		while (pos < value_len) {
			html_output_c(fbuff, NULL, tolower(tags->value[arg][pos]));
			pos++;
		}
		html_output_c(fbuff, NULL, '"');
	}

	html_output_c(fbuff, NULL, '>');
}
|
|
|
|
|
|
|
|
/*
 * Release everything owned by tags -- each name, value and contents blob,
 * then the arrays themselves -- and reset it to the empty state
 * (count 0, all array pointers NULL).
 */
void html_tag_arg_free(tag_arguments_t *tags)
{
	int idx = 0;

	while (idx < tags->count) {
		free(tags->tag[idx]);
		free(tags->value[idx]);	/* free(NULL) is a no-op */
		if (tags->contents && tags->contents[idx]) {
			blobDestroy(tags->contents[idx]);
		}
		idx++;
	}

	free(tags->tag);
	free(tags->value);
	free(tags->contents);

	tags->tag = NULL;
	tags->value = NULL;
	tags->contents = NULL;
	tags->count = 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* this is used for img, and iframe tags. If they are inside an <a href> tag, then set the contents of the image|iframe to the real URL.
|
|
|
|
*/
|
|
|
|
static inline void html_tag_set_inahref(tag_arguments_t *tags,int idx,int in_ahref)
|
|
|
|
{
|
|
|
|
tags->contents[idx-1]=blobCreate();
|
|
|
|
blobAddData(tags->contents[idx-1],tags->value[in_ahref-1],strlen(tags->value[in_ahref-1]));
|
|
|
|
blobAddData(tags->contents[idx-1], "",1);
|
|
|
|
blobClose(tags->contents[idx-1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* the displayed text for an <a href> tag
|
|
|
|
*/
|
|
|
|
static inline void html_tag_contents_append(tag_arguments_t *tags,int idx,const unsigned char* begin,const unsigned char *end)
|
|
|
|
{
|
|
|
|
if(end && (begin<end)) {
|
|
|
|
blobAddData(tags->contents[idx-1],begin,end-begin);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Finish the contents blob of entry idx (1-based): terminate the collected
 * text with a NUL byte and close the blob.
 */
static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
{
	blob *contents = tags->contents[idx-1];

	blobAddData(contents, "", 1);	/* append NUL byte */
	blobClose(contents);
}
|
|
|
|
|
|
|
|
/*
 * Stop collecting contents once they exceed MAX_TAG_CONTENTS_LENGTH bytes.
 *
 * *idx is the 1-based index of the entry being collected.  When its blob
 * has grown past the limit the blob is finished and *idx is set to 0, which
 * the caller treats as "no longer inside an <a>".
 */
static inline void html_tag_contents_length_check(tag_arguments_t *tags,int* idx)
{
	if (blobGetDataSize(tags->contents[*idx-1]) <= MAX_TAG_CONTENTS_LENGTH) {
		return;
	}
	html_tag_contents_done(tags, *idx);
	*idx = 0;	/* in_ahref = 0 */
}
|
|
|
|
|
|
|
|
|
|
|
|
static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
|
|
|
|
{
|
|
|
|
int fd_tmp, tag_length, tag_arg_length, binary;
|
|
|
|
int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE;
|
|
|
|
FILE *stream_in = NULL;
|
|
|
|
html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
|
|
|
|
char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
|
|
|
|
char tag_val[HTML_STR_LENGTH+1], *tmp_file;
|
|
|
|
unsigned char *line, *ptr, *arg_value;
|
|
|
|
tag_arguments_t tag_args;
|
|
|
|
quoted_state quoted;
|
|
|
|
unsigned long length;
|
|
|
|
file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script;
|
|
|
|
file_buff_t *file_tmp_o1;
|
|
|
|
int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
|
|
|
|
unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/
|
|
|
|
unsigned char* ptrend=NULL;/*end of <a> contents*/
|
|
|
|
unsigned char* in_form_action = NULL;/* the action URL of the current <form> tag, if any*/
|
|
|
|
|
|
|
|
struct entity_conv conv;
|
|
|
|
int rc;
|
|
|
|
unsigned char entity_val[HTML_STR_LENGTH+1];
|
|
|
|
size_t entity_val_length = 0;
|
|
|
|
const int dconf_entconv = dconf && dconf->phishing&PHISHING_CONF_ENTCONV;
|
|
|
|
/* dconf for phishing engine sets scanContents, so no need for a flag here */
|
|
|
|
|
|
|
|
|
|
|
|
tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
|
|
|
|
if (!m_area) {
|
|
|
|
if (fd < 0) {
|
|
|
|
cli_dbgmsg("Invalid HTML fd\n");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
lseek(fd, 0, SEEK_SET);
|
|
|
|
fd_tmp = dup(fd);
|
|
|
|
if (fd_tmp < 0) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
stream_in = fdopen(fd_tmp, "r");
|
|
|
|
if (!stream_in) {
|
|
|
|
close(fd_tmp);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(dconf_entconv && (rc = init_entity_converter(&conv, UNKNOWN, 16384) )) {
|
|
|
|
if (!m_area) {
|
|
|
|
fclose(stream_in);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
tag_args.count = 0;
|
|
|
|
tag_args.tag = NULL;
|
|
|
|
tag_args.value = NULL;
|
|
|
|
tag_args.contents = NULL;
|
|
|
|
|
|
|
|
if (dirname) {
|
|
|
|
snprintf(filename, 1024, "%s/rfc2397", dirname);
|
|
|
|
if (mkdir(filename, 0700) && errno != EEXIST) {
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
file_buff_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
|
|
|
|
if (!file_buff_o1) {
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
|
|
|
|
if (!file_buff_o2) {
|
|
|
|
free(file_buff_o1);
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
file_buff_script = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
|
|
|
|
if (!file_buff_script) {
|
|
|
|
free(file_buff_o1);
|
|
|
|
free(file_buff_o2);
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(filename, 1024, "%s/comment.html", dirname);
|
|
|
|
file_buff_o1->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
|
|
|
|
if (!file_buff_o1->fd) {
|
|
|
|
cli_dbgmsg("open failed: %s\n", filename);
|
|
|
|
free(file_buff_o1);
|
|
|
|
free(file_buff_o2);
|
|
|
|
free(file_buff_script);
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(filename, 1024, "%s/nocomment.html", dirname);
|
|
|
|
file_buff_o2->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
|
|
|
|
if (!file_buff_o2->fd) {
|
|
|
|
cli_dbgmsg("open failed: %s\n", filename);
|
|
|
|
close(file_buff_o1->fd);
|
|
|
|
free(file_buff_o1);
|
|
|
|
free(file_buff_o2);
|
|
|
|
free(file_buff_script);
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(filename, 1024, "%s/script.html", dirname);
|
|
|
|
file_buff_script->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
|
|
|
|
if (!file_buff_script->fd) {
|
|
|
|
cli_dbgmsg("open failed: %s\n", filename);
|
|
|
|
close(file_buff_o1->fd);
|
|
|
|
close(file_buff_o2->fd);
|
|
|
|
free(file_buff_o1);
|
|
|
|
free(file_buff_o2);
|
|
|
|
free(file_buff_script);
|
|
|
|
file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
file_buff_o1->length = 0;
|
|
|
|
file_buff_o2->length = 0;
|
|
|
|
file_buff_script->length = 0;
|
|
|
|
} else {
|
|
|
|
file_buff_o1 = NULL;
|
|
|
|
file_buff_o2 = NULL;
|
|
|
|
file_buff_script = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
binary = FALSE;
|
|
|
|
|
|
|
|
if(dconf_entconv)
|
|
|
|
ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
|
|
|
|
else
|
|
|
|
ptr = line = cli_readline(stream_in, m_area, 8192);
|
|
|
|
|
|
|
|
while (line) {
|
|
|
|
if(href_contents_begin)
|
|
|
|
href_contents_begin=ptr;/*start of a new line, last line already appended to contents see below*/
|
|
|
|
while (*ptr && isspace(*ptr)) {
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
while (*ptr) {
|
|
|
|
if (!binary && *ptr == '\n') {
|
|
|
|
/* Convert it to a space and re-process */
|
|
|
|
*ptr = ' ';
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!binary && *ptr == '\r') {
|
|
|
|
ptr++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
switch (state) {
|
|
|
|
case HTML_SPECIAL_CHAR:
|
|
|
|
cli_dbgmsg("Impossible, special_char can't occur here\n");
|
|
|
|
break;
|
|
|
|
case HTML_BAD_STATE:
|
|
|
|
/* An engine error has occurred */
|
|
|
|
cli_dbgmsg("HTML Engine Error\n");
|
|
|
|
goto abort;
|
|
|
|
case HTML_SKIP_LENGTH:
|
|
|
|
length--;
|
|
|
|
ptr++;
|
|
|
|
if (!length) {
|
|
|
|
state = next_state;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_SKIP_WS:
|
|
|
|
if (isspace(*ptr)) {
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_TRIM_WS:
|
|
|
|
if (isspace(*ptr)) {
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, ' ');
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_NORM:
|
|
|
|
if (*ptr == '<') {
|
|
|
|
ptrend=ptr; /* for use by scanContents */
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '<');
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, '<');
|
|
|
|
}
|
|
|
|
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
|
|
|
|
/*append this text portion to the contents of <a>*/
|
|
|
|
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
|
|
|
|
html_tag_contents_length_check(hrefs,&in_ahref);
|
|
|
|
href_contents_begin=NULL;/*We just encountered another tag inside <a>, so skip it*/
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_length=0;
|
|
|
|
next_state = HTML_TAG;
|
|
|
|
} else if (isspace(*ptr)) {
|
|
|
|
state = HTML_TRIM_WS;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
} else if (*ptr == '&') {
|
|
|
|
state = HTML_CHAR_REF;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, tolower(*ptr));
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_TAG:
|
|
|
|
if ((tag_length == 0) && (*ptr == '!')) {
|
|
|
|
/* Comment */
|
|
|
|
html_output_c(file_buff_o1, NULL, '!');
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, '!');
|
|
|
|
}
|
|
|
|
/* Need to rewind in the no-comment output stream */
|
|
|
|
if (file_buff_o2 && (file_buff_o2->length > 0)) {
|
|
|
|
file_buff_o2->length--;
|
|
|
|
}
|
|
|
|
state = HTML_COMMENT;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
ptr++;
|
|
|
|
} else if (*ptr == '>') {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '>');
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, '>');
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
tag[tag_length] = '\0';
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_PROCESS_TAG;
|
|
|
|
} else if (!isspace(*ptr)) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, tolower(*ptr));
|
|
|
|
}
|
|
|
|
if (tag_length < HTML_STR_LENGTH) {
|
|
|
|
tag[tag_length++] = tolower(*ptr);
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
tag[tag_length] = '\0';
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length = 0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_TAG_ARG:
|
|
|
|
if (*ptr == '=') {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '=');
|
|
|
|
tag_arg[tag_arg_length] = '\0';
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
escape = FALSE;
|
|
|
|
quoted = NOT_QUOTED;
|
|
|
|
tag_val_length = 0;
|
|
|
|
next_state = HTML_TAG_ARG_VAL;
|
|
|
|
} else if (isspace(*ptr)) {
|
|
|
|
ptr++;
|
|
|
|
tag_arg[tag_arg_length] = '\0';
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_TAG_ARG_EQUAL;
|
|
|
|
} else if (*ptr == '>') {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '>');
|
|
|
|
if (tag_arg_length > 0) {
|
|
|
|
tag_arg[tag_arg_length] = '\0';
|
|
|
|
html_tag_arg_add(&tag_args, tag_arg, NULL);
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
state = HTML_PROCESS_TAG;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
} else {
|
|
|
|
if (tag_arg_length == 0) {
|
|
|
|
/* Start of new tag - add space */
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2,' ');
|
|
|
|
}
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
|
|
|
|
if (tag_arg_length < HTML_STR_LENGTH) {
|
|
|
|
tag_arg[tag_arg_length++] = tolower(*ptr);
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_TAG_ARG_EQUAL:
|
|
|
|
if (*ptr == '=') {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '=');
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
escape = FALSE;
|
|
|
|
quoted = NOT_QUOTED;
|
|
|
|
tag_val_length = 0;
|
|
|
|
next_state = HTML_TAG_ARG_VAL;
|
|
|
|
} else {
|
|
|
|
if (tag_arg_length > 0) {
|
|
|
|
tag_arg[tag_arg_length] = '\0';
|
|
|
|
html_tag_arg_add(&tag_args, tag_arg, NULL);
|
|
|
|
}
|
|
|
|
tag_arg_length=0;
|
|
|
|
state = HTML_TAG_ARG;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_TAG_ARG_VAL:
|
|
|
|
if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
|
|
|
|
/* RFC2397 inline data */
|
|
|
|
|
|
|
|
/* Rewind one byte so we don't recurse */
|
|
|
|
if (file_buff_o1 && (file_buff_o1->length > 0)) {
|
|
|
|
file_buff_o1->length--;
|
|
|
|
}
|
|
|
|
if (file_buff_o2 && (file_buff_o2->length > 0)) {
|
|
|
|
file_buff_o2->length--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (quoted != NOT_QUOTED) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
}
|
|
|
|
tag_val_length = 0;
|
|
|
|
state = HTML_RFC2397_TYPE;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
|
|
|
|
/* RFC2397 inline data */
|
|
|
|
|
|
|
|
/* Rewind one byte so we don't recurse */
|
|
|
|
if (file_buff_o1 && (file_buff_o1->length > 0)) {
|
|
|
|
file_buff_o1->length--;
|
|
|
|
}
|
|
|
|
if (file_buff_o2 && (file_buff_o2->length > 0)) {
|
|
|
|
file_buff_o2->length--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (quoted != NOT_QUOTED) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
}
|
|
|
|
|
|
|
|
tag_val_length = 0;
|
|
|
|
state = HTML_RFC2397_TYPE;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else if (*ptr == '&') {
|
|
|
|
state = HTML_CHAR_REF;
|
|
|
|
next_state = HTML_TAG_ARG_VAL;
|
|
|
|
ptr++;
|
|
|
|
} else if (*ptr == '\'') {
|
|
|
|
if (tag_val_length == 0) {
|
|
|
|
quoted = SINGLE_QUOTED;
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
if (!escape && (quoted==SINGLE_QUOTED)) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
tag_val[tag_val_length] = '\0';
|
|
|
|
html_tag_arg_add(&tag_args, tag_arg, tag_val);
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (*ptr == '"') {
|
|
|
|
if (tag_val_length == 0) {
|
|
|
|
quoted = DOUBLE_QUOTED;
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
if (!escape && (quoted==DOUBLE_QUOTED)) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
tag_val[tag_val_length] = '\0';
|
|
|
|
html_tag_arg_add(&tag_args, tag_arg, tag_val);
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '"');
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (isspace(*ptr) || (*ptr == '>')) {
|
|
|
|
if (quoted == NOT_QUOTED) {
|
|
|
|
tag_val[tag_val_length] = '\0';
|
|
|
|
html_tag_arg_add(&tag_args, tag_arg, tag_val);
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, *ptr);
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
if (isspace(*ptr)) {
|
|
|
|
tag_val[tag_val_length++] = ' ';
|
|
|
|
} else {
|
|
|
|
tag_val[tag_val_length++] = '>';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
escape = FALSE;
|
|
|
|
quoted = NOT_QUOTED;
|
|
|
|
next_state = HTML_TAG_ARG_VAL;
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = *ptr;
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*ptr == '\\') {
|
|
|
|
escape = TRUE;
|
|
|
|
} else {
|
|
|
|
escape = FALSE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_COMMENT:
|
|
|
|
html_output_c(file_buff_o1, NULL, tolower(*ptr));
|
|
|
|
if (in_script) {
|
|
|
|
html_output_c(file_buff_script, NULL, tolower(*ptr));
|
|
|
|
}
|
|
|
|
if (*ptr == '>') {
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
break;
|
|
|
|
case HTML_PROCESS_TAG:
|
|
|
|
|
|
|
|
/* Default to no action for this tag */
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
if (tag[0] == '/') {
|
|
|
|
/* End tag */
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
if (strcmp(tag, "/script") == 0) {
|
|
|
|
in_script=FALSE;
|
|
|
|
html_output_c(file_buff_script, NULL, '\n');
|
|
|
|
}
|
|
|
|
if (hrefs && hrefs->scanContents && in_ahref) {
|
|
|
|
if(strcmp(tag,"/a") == 0) {
|
|
|
|
html_tag_contents_done(hrefs,in_ahref);
|
|
|
|
in_ahref=0;/* we are no longer inside an <a href>
|
|
|
|
nesting <a> tags not supported, and shouldn't be supported*/
|
|
|
|
}
|
|
|
|
href_contents_begin=ptr;
|
|
|
|
}
|
|
|
|
if (strcmp(tag, "/form") == 0) {
|
|
|
|
if (in_form_action)
|
|
|
|
free(in_form_action);
|
|
|
|
in_form_action = NULL;
|
|
|
|
}
|
|
|
|
} else if (strcmp(tag, "script") == 0) {
|
|
|
|
arg_value = html_tag_arg_value(&tag_args, "language");
|
|
|
|
if (arg_value && (strcasecmp(arg_value, "jscript.encode") == 0)) {
|
|
|
|
html_tag_arg_set(&tag_args, "language", "javascript");
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_JSDECODE;
|
|
|
|
} else if (arg_value && (strcasecmp(arg_value, "vbscript.encode") == 0)) {
|
|
|
|
html_tag_arg_set(&tag_args, "language", "vbscript");
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
next_state = HTML_JSDECODE;
|
|
|
|
} else {
|
|
|
|
in_script = TRUE;
|
|
|
|
}
|
|
|
|
html_output_tag(file_buff_script, tag, &tag_args);
|
|
|
|
} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
|
|
|
|
const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
|
|
|
|
const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
|
|
|
|
if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) {
|
|
|
|
size_t len = strlen((const char*)http_content);
|
|
|
|
unsigned char* http_content2 = cli_malloc( len + 1);
|
|
|
|
unsigned char* charset;
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
if(!http_content2)
|
|
|
|
return CL_EMEM;
|
|
|
|
for(i = 0; i < len; i++)
|
|
|
|
http_content2[i] = tolower(http_content[i]);
|
|
|
|
http_content2[len] = '\0';
|
|
|
|
charset = (unsigned char*) strstr((char*)http_content2,"charset");
|
|
|
|
if(charset) {
|
|
|
|
while(*charset && *charset != '=')
|
|
|
|
charset++;
|
|
|
|
if(*charset)
|
|
|
|
charset++;/* skip = */
|
|
|
|
len = strcspn((const char*)charset," \"'");
|
|
|
|
charset[len] = '\0';
|
|
|
|
if(len) {
|
|
|
|
process_encoding_set(&conv, charset, META);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
free(http_content2);
|
|
|
|
}
|
|
|
|
} else if (hrefs) {
|
|
|
|
if(in_ahref && !href_contents_begin)
|
|
|
|
href_contents_begin=ptr;
|
|
|
|
if (strcmp(tag, "a") == 0) {
|
|
|
|
arg_value = html_tag_arg_value(&tag_args, "href");
|
|
|
|
if (arg_value && strlen(arg_value) > 0) {
|
|
|
|
if (hrefs->scanContents) {
|
|
|
|
unsigned char* arg_value_title = html_tag_arg_value(&tag_args,"title");
|
|
|
|
/*beginning of an <a> tag*/
|
|
|
|
if (in_ahref)
|
|
|
|
/*we encountered nested <a> tags, pretend previous closed*/
|
|
|
|
if (href_contents_begin) {
|
|
|
|
html_tag_contents_append(hrefs,in_ahref,
|
|
|
|
href_contents_begin,ptrend);
|
|
|
|
/*add pending contents between tags*/
|
|
|
|
html_tag_contents_done(hrefs,in_ahref);
|
|
|
|
in_ahref=0;
|
|
|
|
}
|
|
|
|
if (arg_value_title) {
|
|
|
|
/* title is a 'displayed link'*/
|
|
|
|
html_tag_arg_add(hrefs,"href_title",arg_value_title);
|
|
|
|
hrefs->contents[hrefs->count-1]=blobCreate();
|
|
|
|
html_tag_contents_append(hrefs,hrefs->count,arg_value,
|
|
|
|
arg_value+strlen(arg_value));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
if (in_form_action) {
|
|
|
|
/* form action is the real URL, and href is the 'displayed' */
|
|
|
|
html_tag_arg_add(hrefs,"form",arg_value);
|
|
|
|
hrefs->contents[hrefs->count-1] = blobCreate();
|
|
|
|
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
|
|
|
|
in_form_action + strlen(in_form_action));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
html_tag_arg_add(hrefs, "href", arg_value);
|
|
|
|
if (hrefs->scanContents) {
|
|
|
|
in_ahref=hrefs->count; /* index of this tag (counted from 1) */
|
|
|
|
href_contents_begin=ptr;/* contents begin after <a ..> ends */
|
|
|
|
hrefs->contents[hrefs->count-1]=blobCreate();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
|
|
|
|
const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action");
|
|
|
|
if (arg_action_value) {
|
|
|
|
if(in_form_action)
|
|
|
|
free(in_form_action);
|
|
|
|
in_form_action = cli_strdup(arg_action_value);
|
|
|
|
}
|
|
|
|
} else if (strcmp(tag, "img") == 0) {
|
|
|
|
arg_value = html_tag_arg_value(&tag_args, "src");
|
|
|
|
if (arg_value && strlen(arg_value) > 0) {
|
|
|
|
html_tag_arg_add(hrefs, "src", arg_value);
|
|
|
|
if(hrefs->scanContents && in_ahref)
|
|
|
|
/* "contents" of an img tag, is the URL of its parent <a> tag */
|
|
|
|
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
|
|
|
|
if (in_form_action) {
|
|
|
|
/* form action is the real URL, and href is the 'displayed' */
|
|
|
|
html_tag_arg_add(hrefs,"form",arg_value);
|
|
|
|
hrefs->contents[hrefs->count-1] = blobCreate();
|
|
|
|
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
|
|
|
|
in_form_action + strlen(in_form_action));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
arg_value = html_tag_arg_value(&tag_args, "dynsrc");
|
|
|
|
if (arg_value && strlen(arg_value) > 0) {
|
|
|
|
html_tag_arg_add(hrefs, "dynsrc", arg_value);
|
|
|
|
if(hrefs->scanContents && in_ahref)
|
|
|
|
/* see above */
|
|
|
|
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
|
|
|
|
if (in_form_action) {
|
|
|
|
/* form action is the real URL, and href is the 'displayed' */
|
|
|
|
html_tag_arg_add(hrefs,"form",arg_value);
|
|
|
|
hrefs->contents[hrefs->count-1] = blobCreate();
|
|
|
|
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
|
|
|
|
in_form_action + strlen(in_form_action));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (strcmp(tag, "iframe") == 0) {
|
|
|
|
arg_value = html_tag_arg_value(&tag_args, "src");
|
|
|
|
if (arg_value && strlen(arg_value) > 0) {
|
|
|
|
html_tag_arg_add(hrefs, "iframe", arg_value);
|
|
|
|
if(hrefs->scanContents && in_ahref)
|
|
|
|
/* see above */
|
|
|
|
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
|
|
|
|
if (in_form_action) {
|
|
|
|
/* form action is the real URL, and href is the 'displayed' */
|
|
|
|
html_tag_arg_add(hrefs,"form",arg_value);
|
|
|
|
hrefs->contents[hrefs->count-1] = blobCreate();
|
|
|
|
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
|
|
|
|
in_form_action + strlen(in_form_action));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (strcmp(tag,"area") == 0) {
|
|
|
|
arg_value = html_tag_arg_value(&tag_args,"href");
|
|
|
|
if (arg_value && strlen(arg_value) > 0) {
|
|
|
|
html_tag_arg_add(hrefs, "area", arg_value);
|
|
|
|
if(hrefs->scanContents && in_ahref)
|
|
|
|
/* see above */
|
|
|
|
html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
|
|
|
|
if (in_form_action) {
|
|
|
|
/* form action is the real URL, and href is the 'displayed' */
|
|
|
|
html_tag_arg_add(hrefs,"form",arg_value);
|
|
|
|
hrefs->contents[hrefs->count-1] = blobCreate();
|
|
|
|
html_tag_contents_append(hrefs, hrefs->count, in_form_action,
|
|
|
|
in_form_action + strlen(in_form_action));
|
|
|
|
html_tag_contents_done(hrefs,hrefs->count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* TODO:imagemaps can have urls too */
|
|
|
|
}
|
|
|
|
html_tag_arg_free(&tag_args);
|
|
|
|
break;
|
|
|
|
case HTML_CHAR_REF:
|
|
|
|
if (*ptr == '#') {
|
|
|
|
value = 0;
|
|
|
|
hex = FALSE;
|
|
|
|
state = HTML_CHAR_REF_DECODE;
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
if(dconf_entconv)
|
|
|
|
state = HTML_ENTITY_REF_DECODE;
|
|
|
|
else {
|
|
|
|
if(next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '&';
|
|
|
|
}
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '&');
|
|
|
|
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_ENTITY_REF_DECODE:
|
|
|
|
if(*ptr == ';') {
|
|
|
|
size_t i;
|
|
|
|
unsigned char* normalized;
|
|
|
|
entity_val[entity_val_length] = '\0';
|
|
|
|
normalized = entity_norm(&conv, entity_val);
|
|
|
|
if(normalized) {
|
|
|
|
for(i=0; i < strlen(normalized); i++) {
|
|
|
|
const char c = tolower(normalized[i]);
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, c);
|
|
|
|
if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
free(normalized);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '&');
|
|
|
|
if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '&';
|
|
|
|
}
|
|
|
|
for(i=0; i < entity_val_length; i++) {
|
|
|
|
const char c = tolower(entity_val[i]);
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, c);
|
|
|
|
if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = ';';
|
|
|
|
}
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, ';');
|
|
|
|
}
|
|
|
|
entity_val_length = 0;
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
else if ( (isalnum(*ptr) || *ptr=='_' || *ptr==':' || (*ptr=='-')) && entity_val_length < HTML_STR_LENGTH) {
|
|
|
|
entity_val[entity_val_length++] = *ptr++;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* entity too long, or not valid, dump it */
|
|
|
|
size_t i;
|
|
|
|
if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '&';
|
|
|
|
}
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '&');
|
|
|
|
for(i=0; i < entity_val_length; i++) {
|
|
|
|
const char c = tolower(entity_val[i]);
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, c);
|
|
|
|
if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
entity_val_length = 0;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_CHAR_REF_DECODE:
|
|
|
|
if ((value==0) && ((*ptr == 'x') || (*ptr == 'X'))) {
|
|
|
|
hex=TRUE;
|
|
|
|
ptr++;
|
|
|
|
} else if (*ptr == ';') {
|
|
|
|
if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = value; /* store encoded values too */
|
|
|
|
}
|
|
|
|
if(dconf_entconv) {
|
|
|
|
|
|
|
|
if(value < 0x80)
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(value));
|
|
|
|
else {
|
|
|
|
unsigned char buff[10];
|
|
|
|
snprintf((char*)buff,9,"&#%d;",value);
|
|
|
|
buff[9] = '\0';
|
|
|
|
html_output_str(file_buff_o1, buff, strlen(buff));
|
|
|
|
html_output_str(file_buff_o2, buff, strlen(buff));
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(value&0xff));
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
ptr++;
|
|
|
|
} else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
|
|
|
|
if (hex) {
|
|
|
|
value *= 16;
|
|
|
|
} else {
|
|
|
|
value *= 10;
|
|
|
|
}
|
|
|
|
if (isdigit(*ptr)) {
|
|
|
|
value += (*ptr - '0');
|
|
|
|
} else {
|
|
|
|
value += (tolower(*ptr) - 'a' + 10);
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, value);
|
|
|
|
state = next_state;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_JSDECODE:
|
|
|
|
/* Check for start marker */
|
|
|
|
if (strncmp(ptr, "#@~^", 4) == 0) {
|
|
|
|
ptr += 4;
|
|
|
|
state = HTML_JSDECODE_LENGTH;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
|
|
|
|
html_output_c(file_buff_script, NULL, tolower(*ptr));
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_JSDECODE_LENGTH:
|
|
|
|
if (strlen(ptr) < 8) {
|
|
|
|
state = HTML_NORM;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
length = base64_chars[ptr[0]] << 2;
|
|
|
|
length += base64_chars[ptr[1]] >> 4;
|
|
|
|
length += (base64_chars[ptr[1]] & 0x0f) << 12;
|
|
|
|
length += (base64_chars[ptr[2]] >> 2) << 8;
|
|
|
|
length += (base64_chars[ptr[2]] & 0x03) << 22;
|
|
|
|
length += base64_chars[ptr[3]] << 16;
|
|
|
|
length += (base64_chars[ptr[4]] << 2) << 24;
|
|
|
|
length += (base64_chars[ptr[5]] >> 4) << 24;
|
|
|
|
table_pos = 0;
|
|
|
|
state = HTML_JSDECODE_DECRYPT;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
ptr += 8;
|
|
|
|
break;
|
|
|
|
case HTML_JSDECODE_DECRYPT:
|
|
|
|
if (length == 0) {
|
|
|
|
html_output_str(file_buff_script, "</script>\n", 10);
|
|
|
|
length = 12;
|
|
|
|
state = HTML_SKIP_LENGTH;
|
|
|
|
next_state = HTML_NORM;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (*ptr < 0x80) {
|
|
|
|
value = decrypt_tables[table_order[table_pos]][*ptr];
|
|
|
|
if (value == 0xFF) { /* special character */
|
|
|
|
ptr++;
|
|
|
|
length--;
|
|
|
|
switch (*ptr) {
|
|
|
|
case '\0':
|
|
|
|
/* Fixup for end of line */
|
|
|
|
ptr--;
|
|
|
|
break;
|
|
|
|
case 0x21:
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, 0x3c);
|
|
|
|
html_output_c(file_buff_script, NULL, 0x3c);
|
|
|
|
break;
|
|
|
|
case 0x23:
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, 0x0d);
|
|
|
|
html_output_c(file_buff_script, NULL, 0x0d);
|
|
|
|
break;
|
|
|
|
case 0x24:
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, 0x40);
|
|
|
|
html_output_c(file_buff_script, NULL, 0x40);
|
|
|
|
break;
|
|
|
|
case 0x26:
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, 0x0a);
|
|
|
|
html_output_c(file_buff_script, NULL, 0x0a);
|
|
|
|
break;
|
|
|
|
case 0x2a:
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, 0x3e);
|
|
|
|
html_output_c(file_buff_script, NULL, 0x3e);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, value);
|
|
|
|
html_output_c(file_buff_script, NULL, tolower(value));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
table_pos = (table_pos + 1) % 64;
|
|
|
|
ptr++;
|
|
|
|
length--;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case HTML_RFC2397_TYPE:
|
|
|
|
if (*ptr == '\'') {
|
|
|
|
if (!escape && (quoted==SINGLE_QUOTED)) {
|
|
|
|
/* Early end of data detected. Error */
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else if (*ptr == '"') {
|
|
|
|
if (!escape && (quoted==DOUBLE_QUOTED)) {
|
|
|
|
/* Early end of data detected. Error */
|
|
|
|
ptr++;
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = '"';
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else if (isspace(*ptr) || (*ptr == '>')) {
|
|
|
|
if (quoted == NOT_QUOTED) {
|
|
|
|
/* Early end of data detected. Error */
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
tag_arg_length=0;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
} else {
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
if (isspace(*ptr)) {
|
|
|
|
tag_val[tag_val_length++] = ' ';
|
|
|
|
} else {
|
|
|
|
tag_val[tag_val_length++] = '>';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
escape = FALSE;
|
|
|
|
quoted = NOT_QUOTED;
|
|
|
|
next_state = HTML_RFC2397_TYPE;
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else if (*ptr == ',') {
|
|
|
|
/* Beginning of data */
|
|
|
|
tag_val[tag_val_length] = '\0';
|
|
|
|
state = HTML_RFC2397_INIT;
|
|
|
|
escape = FALSE;
|
|
|
|
next_state = HTML_BAD_STATE;
|
|
|
|
ptr++;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
if (tag_val_length < HTML_STR_LENGTH) {
|
|
|
|
tag_val[tag_val_length++] = tolower(*ptr);
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
if (*ptr == '\\') {
|
|
|
|
escape = TRUE;
|
|
|
|
} else {
|
|
|
|
escape = FALSE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_RFC2397_INIT:
|
|
|
|
file_tmp_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
|
|
|
|
if (!file_tmp_o1) {
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
snprintf(filename, 1024, "%s/rfc2397", dirname);
|
|
|
|
tmp_file = cli_gentemp(filename);
|
|
|
|
cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
|
|
|
|
file_tmp_o1->fd = open(tmp_file, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
|
|
|
|
free(tmp_file);
|
|
|
|
if (!file_tmp_o1->fd) {
|
|
|
|
cli_dbgmsg("open failed: %s\n", filename);
|
|
|
|
free(file_tmp_o1);
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
file_tmp_o1->length = 0;
|
|
|
|
|
|
|
|
html_output_str(file_tmp_o1, "From html-normalise\n", 20);
|
|
|
|
html_output_str(file_tmp_o1, "Content-type: ", 14);
|
|
|
|
if ((tag_val_length == 0) && (*tag_val == ';')) {
|
|
|
|
html_output_str(file_tmp_o1, "text/plain\n", 11);
|
|
|
|
}
|
|
|
|
html_output_str(file_tmp_o1, tag_val, tag_val_length);
|
|
|
|
html_output_c(file_tmp_o1, NULL, '\n');
|
|
|
|
if (strstr(tag_val, ";base64") != NULL) {
|
|
|
|
html_output_str(file_tmp_o1, "Content-transfer-encoding: base64\n", 34);
|
|
|
|
}
|
|
|
|
html_output_c(file_tmp_o1, NULL, '\n');
|
|
|
|
state = HTML_RFC2397_DATA;
|
|
|
|
binary = TRUE;
|
|
|
|
break;
|
|
|
|
case HTML_RFC2397_DATA:
|
|
|
|
if (*ptr == '&') {
|
|
|
|
state = HTML_CHAR_REF;
|
|
|
|
next_state = HTML_RFC2397_DATA;
|
|
|
|
ptr++;
|
|
|
|
} else if (*ptr == '%') {
|
|
|
|
length = 0;
|
|
|
|
value = 0;
|
|
|
|
state = HTML_ESCAPE_CHAR;
|
|
|
|
next_state = HTML_RFC2397_ESC;
|
|
|
|
ptr++;
|
|
|
|
} else if (*ptr == '\'') {
|
|
|
|
if (!escape && (quoted==SINGLE_QUOTED)) {
|
|
|
|
state = HTML_RFC2397_FINISH;
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_tmp_o1, NULL, *ptr);
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else if (*ptr == '\"') {
|
|
|
|
if (!escape && (quoted==DOUBLE_QUOTED)) {
|
|
|
|
state = HTML_RFC2397_FINISH;
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_tmp_o1, NULL, *ptr);
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else if (isspace(*ptr) || (*ptr == '>')) {
|
|
|
|
if (quoted == NOT_QUOTED) {
|
|
|
|
state = HTML_RFC2397_FINISH;
|
|
|
|
ptr++;
|
|
|
|
} else {
|
|
|
|
html_output_c(file_tmp_o1, NULL, *ptr);
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
html_output_c(file_tmp_o1, NULL, *ptr);
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
if (*ptr == '\\') {
|
|
|
|
escape = TRUE;
|
|
|
|
} else {
|
|
|
|
escape = FALSE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_RFC2397_FINISH:
|
|
|
|
html_output_flush(file_tmp_o1);
|
|
|
|
close(file_tmp_o1->fd);
|
|
|
|
free(file_tmp_o1);
|
|
|
|
state = HTML_SKIP_WS;
|
|
|
|
escape = FALSE;
|
|
|
|
quoted = NOT_QUOTED;
|
|
|
|
next_state = HTML_TAG_ARG;
|
|
|
|
binary = FALSE;
|
|
|
|
break;
|
|
|
|
case HTML_RFC2397_ESC:
|
|
|
|
if (length == 2) {
|
|
|
|
html_output_c(file_tmp_o1, NULL, value);
|
|
|
|
} else if (length == 1) {
|
|
|
|
html_output_c(file_tmp_o1, NULL, '%');
|
|
|
|
html_output_c(file_tmp_o1, NULL, value+'0');
|
|
|
|
} else {
|
|
|
|
html_output_c(file_tmp_o1, NULL, '%');
|
|
|
|
}
|
|
|
|
state = HTML_RFC2397_DATA;
|
|
|
|
break;
|
|
|
|
case HTML_ESCAPE_CHAR:
|
|
|
|
value *= 16;
|
|
|
|
length++;
|
|
|
|
if (isxdigit(*ptr)) {
|
|
|
|
if (isdigit(*ptr)) {
|
|
|
|
value += (*ptr - '0');
|
|
|
|
} else {
|
|
|
|
value += (tolower(*ptr) - 'a' + 10);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
state = next_state;
|
|
|
|
}
|
|
|
|
if (length == 2) {
|
|
|
|
state = next_state;
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
|
|
|
|
/* end of line, append contents now, resume on next line */
|
|
|
|
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
|
|
|
|
ptrend = NULL;
|
|
|
|
free(line);
|
|
|
|
if(dconf_entconv)
|
|
|
|
ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
|
|
|
|
else
|
|
|
|
ptr = line = cli_readline(stream_in, m_area, 8192);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(dconf_entconv) {
|
|
|
|
/* handle "unfinished" entities */
|
|
|
|
size_t i;
|
|
|
|
unsigned char* normalized;
|
|
|
|
entity_val[entity_val_length] = '\0';
|
|
|
|
normalized = entity_norm(&conv, entity_val);
|
|
|
|
if(normalized) {
|
|
|
|
for(i=0; i < strlen(normalized); i++)
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(normalized[i]));
|
|
|
|
free(normalized);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if(entity_val_length) {
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, '&');
|
|
|
|
for(i=0; i < entity_val_length; i++)
|
|
|
|
html_output_c(file_buff_o1, file_buff_o2, tolower(entity_val[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
retval = TRUE;
|
|
|
|
abort:
|
|
|
|
if (in_form_action)
|
|
|
|
free(in_form_action);
|
|
|
|
if (in_ahref) /* tag not closed, force closing */
|
|
|
|
html_tag_contents_done(hrefs,in_ahref);
|
|
|
|
|
|
|
|
if(dconf_entconv)
|
|
|
|
entity_norm_done(&conv);
|
|
|
|
html_tag_arg_free(&tag_args);
|
|
|
|
if (!m_area) {
|
|
|
|
fclose(stream_in);
|
|
|
|
}
|
|
|
|
if (file_buff_o1) {
|
|
|
|
html_output_flush(file_buff_o1);
|
|
|
|
close(file_buff_o1->fd);
|
|
|
|
free(file_buff_o1);
|
|
|
|
}
|
|
|
|
if (file_buff_o2) {
|
|
|
|
html_output_flush(file_buff_o2);
|
|
|
|
close(file_buff_o2->fd);
|
|
|
|
free(file_buff_o2);
|
|
|
|
}
|
|
|
|
if (file_buff_script) {
|
|
|
|
html_output_flush(file_buff_script);
|
|
|
|
close(file_buff_script->fd);
|
|
|
|
free(file_buff_script);
|
|
|
|
}
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Normalise an HTML document that is already held in a memory buffer.
 *
 * in_buff / in_size describe the buffer; they are wrapped in an m_area_t
 * whose read offset starts at 0 and passed to cli_html_normalise() with
 * -1 in place of a file descriptor.  dirname, hrefs and dconf are
 * forwarded unchanged.
 *
 * Returns whatever cli_html_normalise() returns.
 */
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
{
	m_area_t mem_view;

	mem_view.offset = 0;
	mem_view.length = in_size;
	mem_view.buffer = in_buff;

	return cli_html_normalise(-1, &mem_view, dirname, hrefs, dconf);
}
|
|
|
|
|
|
|
|
/*
 * Normalise the HTML document readable through file descriptor fd.
 *
 * When built with HAVE_MMAP the file is mapped read-only and handed to
 * cli_html_normalise() as a memory area (with -1 as the descriptor); the
 * mapping is released afterwards.  If fstat() or mmap() fails, or when
 * HAVE_MMAP is not set, cli_html_normalise() is called with fd itself and
 * a NULL memory area instead.
 *
 * dirname, hrefs and dconf are forwarded unchanged.
 * Returns whatever cli_html_normalise() returns.
 */
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
{
#if HAVE_MMAP
	struct stat st;
	m_area_t mapped;
	int rc;

	if (fstat(fd, &st) != 0) {
		/* size unknown: fall back to descriptor-based reading */
		cli_dbgmsg("fstat HTML failed\n");
		return cli_html_normalise(fd, NULL, dirname, hrefs, dconf);
	}

	mapped.length = st.st_size;
	mapped.buffer = (unsigned char *) mmap(NULL, mapped.length, PROT_READ, MAP_PRIVATE, fd, 0);
	mapped.offset = 0;

	if (mapped.buffer == MAP_FAILED) {
		/* mapping refused: fall back to descriptor-based reading */
		cli_dbgmsg("mmap HTML failed\n");
		return cli_html_normalise(fd, NULL, dirname, hrefs, dconf);
	}

	cli_dbgmsg("mmap'ed file\n");
	rc = cli_html_normalise(-1, &mapped, dirname, hrefs, dconf);
	munmap(mapped.buffer, mapped.length);
	return rc;
#else
	return cli_html_normalise(fd, NULL, dirname, hrefs, dconf);
#endif
}
|
|
|
|
|
|
|
|
int html_screnc_decode(int fd, const char *dirname)
|
|
|
|
{
|
|
|
|
int fd_tmp, table_pos=0, result, count, state, retval=FALSE;
|
|
|
|
unsigned char *line, tmpstr[6];
|
|
|
|
unsigned long length;
|
|
|
|
unsigned char *ptr, filename[1024];
|
|
|
|
FILE *stream_in;
|
|
|
|
file_buff_t file_buff;
|
|
|
|
|
|
|
|
lseek(fd, 0, SEEK_SET);
|
|
|
|
fd_tmp = dup(fd);
|
|
|
|
if (fd_tmp < 0) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
stream_in = fdopen(fd_tmp, "r");
|
|
|
|
if (!stream_in) {
|
|
|
|
close(fd_tmp);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(filename, 1024, "%s/screnc.html", dirname);
|
|
|
|
file_buff.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
|
|
|
|
file_buff.length = 0;
|
|
|
|
|
|
|
|
if (!file_buff.fd) {
|
|
|
|
cli_dbgmsg("open failed: %s\n", filename);
|
|
|
|
fclose(stream_in);
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
while ((line = cli_readline(stream_in, NULL, 8192)) != NULL) {
|
|
|
|
ptr = strstr(line, "#@~^");
|
|
|
|
if (ptr) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
free(line);
|
|
|
|
}
|
|
|
|
if (!line) {
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Calculate the length of the encoded string */
|
|
|
|
ptr += 4;
|
|
|
|
count = 0;
|
|
|
|
do {
|
|
|
|
if (! *ptr) {
|
|
|
|
free(line);
|
|
|
|
ptr = line = cli_readline(stream_in, NULL, 8192);
|
|
|
|
if (!line) {
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
tmpstr[count++] = *ptr;
|
|
|
|
ptr++;
|
|
|
|
} while (count < 6);
|
|
|
|
|
|
|
|
length = base64_chars[tmpstr[0]] << 2;
|
|
|
|
length += base64_chars[tmpstr[1]] >> 4;
|
|
|
|
length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
|
|
|
|
length += (base64_chars[tmpstr[2]] >> 2) << 8;
|
|
|
|
length += (base64_chars[tmpstr[2]] & 0x03) << 22;
|
|
|
|
length += base64_chars[tmpstr[3]] << 16;
|
|
|
|
length += (base64_chars[tmpstr[4]] << 2) << 24;
|
|
|
|
length += (base64_chars[tmpstr[5]] >> 4) << 24;
|
|
|
|
|
|
|
|
/* Move forward 2 bytes */
|
|
|
|
count = 2;
|
|
|
|
state = HTML_SKIP_LENGTH;
|
|
|
|
|
|
|
|
while (length && line) {
|
|
|
|
while (length && *ptr) {
|
|
|
|
if ((*ptr == '\n') || (*ptr == '\r')) {
|
|
|
|
ptr++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
switch (state) {
|
|
|
|
case HTML_SKIP_LENGTH:
|
|
|
|
ptr++;
|
|
|
|
count--;
|
|
|
|
if (count == 0) {
|
|
|
|
state = HTML_NORM;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case HTML_SPECIAL_CHAR:
|
|
|
|
switch (*ptr) {
|
|
|
|
case 0x21:
|
|
|
|
html_output_c(&file_buff, NULL, 0x3c);
|
|
|
|
break;
|
|
|
|
case 0x23:
|
|
|
|
html_output_c(&file_buff, NULL, 0x0d);
|
|
|
|
break;
|
|
|
|
case 0x24:
|
|
|
|
html_output_c(&file_buff, NULL, 0x40);
|
|
|
|
break;
|
|
|
|
case 0x26:
|
|
|
|
html_output_c(&file_buff, NULL, 0x0a);
|
|
|
|
break;
|
|
|
|
case 0x2a:
|
|
|
|
html_output_c(&file_buff, NULL, 0x3e);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
length--;
|
|
|
|
state = HTML_NORM;
|
|
|
|
break;
|
|
|
|
case HTML_NORM:
|
|
|
|
if (*ptr < 0x80) {
|
|
|
|
result = decrypt_tables[table_order[table_pos]][*ptr];
|
|
|
|
if (result == 0xFF) { /* special character */
|
|
|
|
state = HTML_SPECIAL_CHAR;
|
|
|
|
} else {
|
|
|
|
html_output_c(&file_buff, NULL, (char)result);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
length--;
|
|
|
|
table_pos = (table_pos + 1) % 64;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
free(line);
|
|
|
|
if (length) {
|
|
|
|
ptr = line = cli_readline(stream_in, NULL, 8192);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
retval = TRUE;
|
|
|
|
|
|
|
|
abort:
|
|
|
|
fclose(stream_in);
|
|
|
|
html_output_flush(&file_buff);
|
|
|
|
close(file_buff.fd);
|
|
|
|
return retval;
|
|
|
|
}
|