use entconv to detect UTF-16BE, and UCS-4 variants

use only cli_readline(); we don't need exact conversion
drop unused functions,
simplify encoding_norm_readline(), and rename to encoding_normalize_toascii()


git-svn: trunk@3571
remotes/push_mirror/metadata
Török Edvin 18 years ago
parent 4addba22e4
commit b3fc7f9747
  1. 9
      ChangeLog
  2. 416
      libclamav/entconv.c
  3. 23
      libclamav/entconv.h
  4. 63
      libclamav/filetypes.c
  5. 59
      libclamav/htmlnorm.c
  6. 2
      libclamav/htmlnorm.h

@ -1,3 +1,12 @@
Fri Feb 1 21:19:58 EET 2008 (edwin)
------------------------------------
* libclamav/filetypes.c: use entconv to detect UTF-16BE, and UCS-4 variants
* libclamav/htmlnorm.c: use only cli_readline(); we don't need exact
conversion
* libclamav/entconv.c:
* drop unused functions,
* simplify encoding_norm_readline(), and rename to encoding_normalize_toascii()
Fri Feb 1 00:58:05 CET 2008 (tk)
---------------------------------
* libclamav: ndb sigs: add new target type (7) for ASCII files; handle

@ -127,67 +127,7 @@ const char* entity_norm(struct entity_conv* conv,const unsigned char* entity)
return NULL;
}
/* sane default; must be larger than the longest possible return string,
* which is
* &#xxx;*/
#define MIN_BUFFER_SIZE 32
#define LINEMODE_LIMIT 16384
/**
 * Prepares an entity converter for use and allocates its working buffers.
 *
 * Three buffers of buffer_size bytes each (tmp_area, out_area, norm_area)
 * and the storage for one iconv_t are allocated. The converter starts in
 * line mode with no encoding selected.
 *
 * @param conv         converter to initialise
 * @param buffer_size  size of each working buffer; must be at least
 *                     MIN_BUFFER_SIZE
 * @return 0 on success, CL_ENULLARG if buffer_size is too small or conv is
 *         NULL, CL_EMEM if an allocation fails. On CL_EMEM nothing stays
 *         allocated and all buffer pointers in conv are NULL.
 */
int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
{
	if(buffer_size < MIN_BUFFER_SIZE) {
		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
		return CL_ENULLARG;
	}
	if(!conv) {
		return CL_ENULLARG;
	}

	conv->encoding = NULL;
	conv->encoding_symbolic = E_UNKNOWN;
	conv->priority = NOPRIO;
	conv->buffer_size = buffer_size;

	/* Same defaults the BOM detector uses when nothing is recognised:
	 * no BOM, one byte per character. Without this, enc_bytes may be read
	 * before BOM detection has ever run. */
	conv->bom_cnt = 0;
	conv->has_bom = 0;
	conv->enc_bytes = 1;

	/* start in linemode */
	conv->linemode = 1;
	conv->linemode_processed = 0;

	/* Clear every owned pointer first so the error path can free them
	 * unconditionally and never leaves a dangling pointer behind. */
	conv->tmp_area.buffer = NULL;
	conv->out_area.buffer = NULL;
	conv->norm_area.buffer = NULL;
	conv->iconv_struct = NULL;

	conv->tmp_area.offset = 0;
	conv->tmp_area.length = 0;
	conv->tmp_area.buffer = cli_malloc(buffer_size);
	if(!conv->tmp_area.buffer) {
		goto out_of_memory;
	}

	conv->out_area.offset = 0;
	conv->out_area.length = buffer_size;
	conv->out_area.buffer = cli_malloc(buffer_size);
	if(!conv->out_area.buffer) {
		goto out_of_memory;
	}

	conv->norm_area.offset = 0;
	conv->norm_area.length = 0;
	conv->norm_area.buffer = cli_malloc(buffer_size);
	if(!conv->norm_area.buffer) {
		goto out_of_memory;
	}

	conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
	if(!conv->iconv_struct) {
		goto out_of_memory;
	}
	return 0;

out_of_memory:
	/* free(NULL) is a no-op, so no per-pointer checks are needed */
	free(conv->norm_area.buffer);
	conv->norm_area.buffer = NULL;
	free(conv->out_area.buffer);
	conv->out_area.buffer = NULL;
	free(conv->tmp_area.buffer);
	conv->tmp_area.buffer = NULL;
	return CL_EMEM;
}
#ifndef HAVE_ICONV
static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
{
/* special case for these unusual byteorders */
@ -217,7 +157,6 @@ static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
}
}
#ifndef HAVE_ICONV
static iconv_t iconv_open(const char *tocode, const char* fromcode)
{
iconv_t iconv = cli_malloc(sizeof(*iconv));
@ -236,7 +175,6 @@ static int iconv_close(iconv_t cd)
return 0;
}
static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
char** outbuf, size_t *outbytesleft)
{
@ -426,14 +364,11 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
#endif
/* new iconv() version */
static inline void process_bom(struct entity_conv* conv)
static inline const char* detect_encoding(const unsigned char* bom, uint8_t* bom_found, uint8_t* enc_width)
{
const unsigned char* bom = conv->bom;
const char* encoding = NULL;
const char* encoding;
int has_bom = 0;
uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
uint8_t enc_bytes = 1; /* default is UTF8, which has a minimum of 1 bytes */
/* undecided 32-bit encodings are treated as ucs4, and
* 16 bit as utf16*/
switch(bom[0]) {
@ -442,23 +377,28 @@ static inline void process_bom(struct entity_conv* conv)
if(bom[2] == 0xFE && bom[3] == 0xFF) {
encoding = UCS4_1234;/* UCS-4 big-endian*/
has_bom = 1;
enc_bytes = 4;
}
else if(bom[2] == 0xFF && bom[3] == 0xFE) {
encoding = UCS4_2143;/* UCS-4 unusual order 2143 */
has_bom = 1;
enc_bytes = 4;
}
else if(bom[2] == 0x00 && bom[3] == 0x3C) {
/* undecided, treat as ucs4 */
encoding = UCS4_1234;
enc_bytes = 4;
}
else if(bom[2] == 0x3C && bom[3] == 0x00) {
encoding = UCS4_2143;
enc_bytes = 4;
}
}/* 0x00 0x00 */
else if(bom[1] == 0x3C) {
if(bom[2] == 0x00) {
if(bom[3] == 0x00) {
encoding = UCS4_3412;
enc_bytes = 4;
}
else if(bom[3] == 0x3F) {
encoding = UTF16_BE;
@ -471,6 +411,7 @@ static inline void process_bom(struct entity_conv* conv)
if(bom[1] == 0xFE) {
if(bom[2] == 0x00 && bom[3] == 0x00) {
encoding = UCS4_4321;
enc_bytes = 4;
has_bom = 1;
}
else {
@ -484,6 +425,7 @@ static inline void process_bom(struct entity_conv* conv)
if(bom[1] == 0xFF) {
if(bom[2] == 0x00 && bom[3] == 0x00) {
encoding = UCS4_3412;
enc_bytes = 4;
has_bom = 1;
}
else {
@ -504,6 +446,7 @@ static inline void process_bom(struct entity_conv* conv)
if(bom[1] == 0x00) {
if(bom[2] == 0x00 && bom[3] == 0x00) {
encoding = UCS4_4321;
enc_bytes = 4;
}
else if(bom[2] == 0x3F && bom[3] == 0x00) {
encoding = UTF16_LE;
@ -523,12 +466,19 @@ static inline void process_bom(struct entity_conv* conv)
}/*4C 6F A7 94*/
break;
}/*switch*/
if(encoding) {
cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
}
conv->enc_bytes = enc_bytes;
conv->has_bom = has_bom;
*enc_width = enc_bytes;
*bom_found = has_bom;
return encoding;
}
/* Detects wide Unicode encodings from the leading bytes of a buffer:
 * UTF-16 (LE/BE) and all four UCS-4 byte orders.
 * Encodings with a character width of one byte (UTF-8, plain ASCII) are
 * reported as NULL, because those can already be processed as text.
 *
 * bom: start of the data; detect_encoding() inspects bom[0]..bom[3], so at
 *      least four bytes must be readable.
 * Returns the detected encoding name, or NULL if no multi-byte encoding
 * was recognised. */
const char* encoding_detect_bom(const unsigned char* bom)
{
	uint8_t width = 0;
	uint8_t bom_seen = 0; /* required by detect_encoding(), not used here */
	const char* name = detect_encoding(bom, &bom_seen, &width);

	if(width <= 1) {
		return NULL;
	}
	return name;
}
/*()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
@ -575,53 +525,6 @@ static char* normalize_encoding(const unsigned char* enc)
return norm;
}
/**
 * Releases everything an entity converter owns: the encoding name, the
 * three working buffers and the storage that holds the iconv_t.
 *
 * Every freed pointer is reset to NULL, so calling this twice on the same
 * converter is harmless. The iconv handle itself is not closed here; it is
 * obtained through iconv_open_cached().
 *
 * @param conv  converter to tear down (must not be NULL)
 * @return always 0
 */
static int encoding_norm_done(struct entity_conv* conv)
{
	/* free(NULL) is a no-op, so no guards are needed */
	free(conv->encoding);
	conv->encoding = NULL;

	conv->buffer_size = 0;

	free(conv->tmp_area.buffer);
	conv->tmp_area.buffer = NULL;

	free(conv->out_area.buffer);
	conv->out_area.buffer = NULL;

	free(conv->norm_area.buffer);
	conv->norm_area.buffer = NULL;

	/* previously left dangling, which made a second call double-free */
	free(conv->iconv_struct);
	conv->iconv_struct = NULL;

	return 0;
}
/* Public counterpart of init_entity_converter(): forwards to
 * encoding_norm_done() and returns its result. */
int entity_norm_done(struct entity_conv* conv)
{
return encoding_norm_done(conv);
}
/* Returns how many bytes the byte order mark occupies at the start of the
 * input, or 0 when no BOM was found.
 * 2- and 4-byte encodings have a BOM as wide as one character; for 1-byte
 * encodings only UTF-8 has one, and it is 3 bytes long. */
static unsigned short bom_length(struct entity_conv* conv)
{
	if(!conv->has_bom) {
		return 0;
	}
	if(conv->enc_bytes == 2 || conv->enc_bytes == 4) {
		return conv->enc_bytes;
	}
	if(conv->enc_bytes == 1 && conv->encoding_symbolic == E_UTF8) {
		return 3;
	}
	return 0;
}
/* sarge leaks on iconv_open/iconv_close, so let's not open/close so many times;
* just keep a separate pool of iconvs for each thread */
@ -774,99 +677,36 @@ static iconv_t iconv_open_cached(const char* fromcode)
cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
if(iconv_struct != (iconv_t)-1) {
idx = cache->last++;
if(idx >= cache->len) {
cache->len += 16;
cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
if(!cache->tab) {
cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
errno = ENOMEM;
return (iconv_t)-1;
idx = cache->last++;
if(idx >= cache->len) {
cache->len += 16;
cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
if(!cache->tab) {
cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
errno = ENOMEM;
return (iconv_t)-1;
}
}
}
hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
cache->tab[idx] = iconv_struct;
cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
return cache->tab[idx];
}
return (iconv_t)-1;
}
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
{
char *tmp_encoding;
enum encodings tmp;
size_t new_size,old_size;
if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
if(conv->linemode) {
cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
conv->linemode = 0;
}
return;
}
cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio);
if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
return;
/* Content-type in header is highest priority, no overrides possible.
* Also no overrides after an encoding has been set.*/
}
/* validate encoding name, and normalize to uppercase */
if(!(tmp_encoding = normalize_encoding(encoding))) {
cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
return;
}
/* don't allow to change between unicode encodings that have different byte-size */
if(prio == META) {
/* need to consider minimum size of an encoding here */
old_size = conv->enc_bytes;
new_size = encoding_bytes(tmp_encoding,&tmp);
if(old_size != new_size) {
/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
free(tmp_encoding);
return;
}
}
conv->encoding = tmp_encoding;
cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
/* message shown only once/file */
/* what can we do? short-circuit iconv */
free(conv->encoding);
conv->encoding = NULL;
/* we will process using whatever we currently have for encoding_symbolic.
* If encoding was already set to iconv, we shouldn't be here.*/
assert(conv->encoding_symbolic != E_ICONV);
} else {
cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
conv->encoding_symbolic = E_ICONV;
conv->priority = prio;
conv->linemode = 0;
cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
return cache->tab[idx];
}
return (iconv_t)-1;
}
static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
static int in_iconv_u16(const m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
{
char tmp4[4];
size_t inleft = in_m_area->length - in_m_area->offset;
size_t rc, alignfix;
char* input = (char*)in_m_area->buffer + in_m_area->offset;
size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;
char* out = (char*)out_m_area->buffer;
out_m_area->offset = 0;
if(!inleft) {
/* EOF */
out_m_area->offset = out_m_area->length = 0;
return 0;
}
/* convert encoding conv->tmp_area. conv->out_area */
@ -886,7 +726,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */
const size_t outleft_last = outleft;
assert(*iconv_struct != (iconv_t)-1);
rc = iconv(*iconv_struct, (char**) &input, &inleft, (char**) &out, &outleft);
rc = iconv(*iconv_struct, &input, &inleft, &out, &outleft);
if(rc == (size_t)-1) {
if(errno == E2BIG) {
/* not enough space in output buffer */
@ -909,9 +749,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
*out++ = *input++;
inleft--;
}
/* length - offset - alignfix is original value of inleft, new value is inleft,
* difference tells how much it moved. */
in_m_area->offset = in_m_area->length - alignfix - inleft;
cli_dbgmsg("in_iconv_u16: unprocessed bytes: %lu\n", (unsigned long)inleft);
if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
out_m_area->length -= (off_t)outleft;
} else {
@ -922,156 +760,36 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
return 0;
}
/* Normalizes one 16-bit character `c` into the output buffer.
 *
 * Must be expanded inside a loop that has a counter named `i` in scope:
 * in line mode a '\n' increments `i` and then `break`s out of that
 * enclosing loop (the macro body is a plain block, not a do/while).
 *
 * Otherwise u16_normalize(c, out, limit) is called; when it returns
 * non-NULL, `limit` is reduced by the distance the output pointer advanced.
 * `out` is always replaced by the returned pointer, so it becomes NULL when
 * u16_normalize() returns NULL - callers test `out` in their loop condition.
 *
 * `out` and `limit` are modified in place and must be lvalues. */
#define NORMALIZE_CHAR(c, out, limit, linemode) \
{\
if (linemode && c == '\n') {\
i++;\
break;\
} else {\
unsigned char* out_new = u16_normalize(c, out, limit);\
if(out_new) {\
limit -= out_new - out;\
}\
out = out_new;\
}\
}
/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
* assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */
#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit))
#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length))
/* EOF marker is m_area->length == 0 */
/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read.
* When we can't read anything due to EOF ->length will be set to 0.
* bounds checks offset and length*/
static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area)
{
if(!m_area) {
size_t iread;
iconv_t iconv_struct;
off_t i, j;
char *encoding;
m_area = &conv->tmp_area;
if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
return m_area;
}
/* offset out of bounds -> all the buffer was processed, fill it again */
iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
m_area->offset = 0;
if(ferror(stream)) {
cli_errmsg("Error while reading HTML stream\n");
}
} else {
if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
cli_dbgmsg(MODULE_NAME "EOF reached\n");
m_area->offset = m_area->length; /* EOF marker */
}
if(!initial_encoding || !in_m_area || !out_m_area) {
return CL_ENULLARG;
}
return m_area;
}
/* Reads the two bytes buf[i] and buf[i+1] as one big-endian 16-bit value
 * (buf[i] is the high byte). The caller guarantees both bytes exist. */
static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
{
	const uint16_t high_byte = buf[i];
	const uint16_t low_byte = buf[i+1];

	return (uint16_t)((high_byte << 8) | low_byte);
}
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
{
unsigned char* out = conv->out_area.buffer;
if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) {
return NULL;
}
if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
/* error encountered */
return NULL;
encoding = normalize_encoding((const unsigned char*)initial_encoding);
if(!encoding) {
cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
return -1;
}
else {
const off_t input_limit = in_m_area->length;
const unsigned char* input = in_m_area->buffer;
off_t input_offset = in_m_area->offset;
off_t limit = conv->out_area.length - 1;
off_t limit_prev = limit;
off_t i = 0;
/* read_raw() ensures this condition */
assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit));
if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
size_t bom_len;
memcpy(conv->bom, input, 4);
process_bom(conv);
bom_len = bom_length(conv);
in_m_area->offset = input_offset = input_offset + bom_len;
conv->bom_cnt = 1;
}
if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
conv->linemode = 0;
}
switch(conv->encoding_symbolic) {
case E_ICONV:/* only in block-mode */
/* normalize already converted characters from a previous pass
* (output buffer was full, and we couldn't normalize more in previous pass) */
for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
const uint16_t c = get_u16(conv->norm_area.buffer, i);
NORMALIZE_CHAR(c, out, limit, 0);
}
conv->norm_area.offset = i;
if(limit > 0) {
conv->norm_area.length = conv->buffer_size;
in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
for(i = 0;i < conv->norm_area.length && limit > 0 && out; i += 2) {
const uint16_t c = get_u16(conv->norm_area.buffer, i);
NORMALIZE_CHAR(c, out, limit, 0);
}
if(i) {
conv->norm_area.offset = i;
}
}
if(limit == limit_prev) {
/* output pointer didn't move => EOF */
return NULL;
}
break;
/* out_area must have enough space to allow all bytes in norm_area normalized,
* if we norm with &x;, then we need 7* space. */
default:
cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
conv->encoding_symbolic = E_OTHER;
case E_UNKNOWN:
case E_OTHER:
if(!input_limit || input_offset == input_limit) {
/* nothing to do, EOF */
return NULL;
}
for(i = input_offset; i < input_limit && limit > 0; i++) {
const unsigned char c = input[i];
if(conv->linemode && c == '\n') {
i++;
break;
}
if(c) {
*out++ = c;
limit--;
}
}
in_m_area->offset = i;
}
if(conv->linemode) {
conv->linemode_processed += i - input_offset;
cli_dbgmsg(MODULE_NAME "Encoding %s\n", encoding);
iconv_struct = iconv_open_cached( encoding );
if(iconv_struct == (iconv_t)-1) {
cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open(): %s\n", encoding);
free(encoding);
return -1;
}
in_iconv_u16(in_m_area, &iconv_struct, out_m_area);
for(i = 0, j = 0; i < out_m_area->length ; i += 2) {
const unsigned char c = (out_m_area->buffer[i] << 4) + out_m_area->buffer[i+1];
if(c) {
out_m_area->buffer[j++] = c;
}
if(limit < 0) limit = 0;
conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
return conv->out_area.buffer;
}
out_m_area->length = j;
return 0;
}

@ -44,37 +44,20 @@
#define UNKNOWN "\0"
#define OTHER "OTHER"
enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};
enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
#define MAX_ENTITY_SIZE 22
/* State of one entity/encoding converter.
 * Set up by init_entity_converter() and released by entity_norm_done(). */
struct entity_conv {
/* normalized name of the selected encoding (heap-allocated), NULL while none is set */
char* encoding;
/* priority of the source that selected the encoding; NOPRIO initially */
enum encoding_priority priority;
/* E_UNKNOWN initially; E_ICONV once an iconv handle was opened for `encoding` */
enum encodings encoding_symbolic;
/* size in bytes of each of the three working buffers below */
size_t buffer_size;
/* heap storage holding a single iconv_t */
void* iconv_struct;
unsigned char entity_buff[MAX_ENTITY_SIZE+2];
/* raw input read from the stream when no memory area is supplied */
m_area_t tmp_area;
/* normalized output handed back to the caller */
m_area_t out_area;
/* iconv output (16-bit characters) still waiting to be normalized */
m_area_t norm_area;
/* non-zero while input is processed line by line; cleared when switching to block mode */
int linemode;/* TODO:set */
/* number of input bytes consumed so far in line mode */
int linemode_processed;
/* first four input bytes, examined for a byte order mark */
unsigned char bom[4];
/* non-zero if a byte order mark was found */
uint8_t has_bom;
/* minimum character width of the detected encoding in bytes (1, 2 or 4) */
uint8_t enc_bytes;
/* set to 1 once BOM detection has been performed */
uint8_t bom_cnt;
};
int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
int entity_norm_done(struct entity_conv* conv);
unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size);
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
const char* entity_norm(struct entity_conv* conv,const unsigned char* entity);
int entitynorm_init(void);
const char* encoding_detect_bom(const unsigned char* bom);
int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area);
#endif

@ -182,37 +182,42 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
cli_ac_freedata(&mdata);
if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
struct entity_conv conv;
const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
/* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
if(init_entity_converter(&conv, conv_size) == 0) {
m_area_t area;
area.buffer = (unsigned char *) smallbuff;
area.length = bread;
area.offset = 0;
/* switch to blockmode, so that we convert all the input buffer at once,
* rather than line-by-line */
process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
return ret;
decoded = encoding_norm_readline(&conv, NULL, &area);
if(decoded) {
sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
if(sret == CL_TYPE_HTML) {
ret = CL_TYPE_HTML;
const char* encoding;
/* check if we can autodetect this encoding.
* If we can't, don't try to detect the HTML sig, since
* we just tried that above, and failed */
if((encoding = encoding_detect_bom(smallbuff))) {
unsigned char decodedbuff[sizeof(smallbuff)*2];
m_area_t in_area, out_area;
in_area.buffer = (unsigned char *) smallbuff;
in_area.length = bread;
in_area.offset = 0;
out_area.buffer = decodedbuff;
out_area.length = sizeof(decodedbuff);
out_area.offset = 0;
/* in htmlnorm we simply skip over \0 chars, which allows parsing HTML in any unicode encoding
* (multibyte characters will not be handled exactly, but that is not a problem).
* However, when detecting whether a file is HTML or not, we need exact conversion
* (just eliminating zeros and matching would introduce false positives). */
if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
out_area.buffer[out_area.length] = '\0';
if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
return ret;
if(out_area.length > 0) {
sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
if(sret == CL_TYPE_HTML) {
cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
/* htmlnorm is able to handle any unicode now, since it skips null chars */
ret = CL_TYPE_HTML;
}
}
}
cli_ac_freedata(&mdata);
entity_norm_done(&conv);
} else {
cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
cli_ac_freedata(&mdata);
}
}
}
}

@ -542,13 +542,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
}
}
if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
if (!m_area) {
fclose(stream_in);
}
return rc;
}
tag_args.count = 0;
tag_args.tag = NULL;
tag_args.value = NULL;
@ -628,10 +621,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
binary = FALSE;
if(dconf_entconv)
ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
else
ptr = line = cli_readchunk(stream_in, m_area, 8192);
ptr = line = cli_readchunk(stream_in, m_area, 8192);
while (line) {
if(href_contents_begin)
@ -989,37 +979,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
in_script = TRUE;
}
html_output_tag(file_buff_script, tag, &tag_args);
} else if (dconf_entconv && strcmp(tag, "body") == 0) {
/* no more charset changes accepted after body encountered */
process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) {
size_t len = strlen((const char*)http_content);
unsigned char* http_content2 = cli_malloc( len + 1);
unsigned char* charset;
size_t i;
if(!http_content2)
return CL_EMEM;
for(i = 0; i < len; i++)
http_content2[i] = tolower(http_content[i]);
http_content2[len] = '\0';
charset = (unsigned char*) strstr((char*)http_content2,"charset");
if(charset) {
while(*charset && *charset != '=')
charset++;
if(*charset)
charset++;/* skip = */
len = strcspn((const char*)charset," \"'");
charset[len] = '\0';
if(len) {
process_encoding_set(&conv, charset, META);
}
}
free(http_content2);
}
} else if (hrefs) {
if(in_ahref && !href_contents_begin)
href_contents_begin=ptr;
@ -1533,12 +1492,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
/* end of line, append contents now, resume on next line */
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
ptrend = NULL;
if(dconf_entconv)
ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
else {
free(line);
ptr = line = cli_readchunk(stream_in, m_area, 8192);
}
free(line);
ptr = line = cli_readchunk(stream_in, m_area, 8192);
}
if(dconf_entconv) {
@ -1566,8 +1521,6 @@ abort:
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs,in_ahref);
if(dconf_entconv)
entity_norm_done(&conv);
html_tag_arg_free(&tag_args);
if (!m_area) {
fclose(stream_in);
@ -1593,11 +1546,11 @@ abort:
/* Normalizes HTML held in memory.
 * Wraps in_buff/in_size in an m_area_t starting at offset 0 and hands it to
 * cli_html_normalise() with fd set to -1 (no file descriptor); dirname,
 * hrefs and dconf are passed through unchanged.
 * Returns whatever cli_html_normalise() returns. */
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
{
m_area_t m_area;
m_area.buffer = in_buff;
m_area.length = in_size;
m_area.offset = 0;
return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
}
@ -1607,7 +1560,7 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const
int retval=FALSE;
m_area_t m_area;
struct stat statbuf;
if (fstat(fd, &statbuf) == 0) {
m_area.length = statbuf.st_size;
m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0);

@ -36,7 +36,7 @@ typedef struct m_area_tag {
} m_area_t;
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
void html_tag_arg_free(tag_arguments_t *tags);
int html_screnc_decode(int fd, const char *dirname);

Loading…
Cancel
Save