use entconv to detect UTF-16BE, and UCS-4 variants

Use only cli_readline(); we don't need exact conversion.
Drop unused functions, simplify encoding_norm_readline(), and rename it to encoding_normalize_toascii().

git-svn: trunk@3571
18 years ago · b3fc7f9747
parent 4addba22e4
commit b3fc7f9747
6 changed files with 120 additions and 452 deletions
--- a/9
+++ b/9
@@ -1,3 +1,12 @@
+Fri Feb  1 21:19:58 EET 2008 (edwin)
+------------------------------------
+  * libclamav/filetypes.c: use entconv to detect UTF-16BE, and UCS-4 variants
+  * libclamav/htmlnorm.c: use only cli_readline() we don't need exact
+  conversion
+  * libclamav/entconv.c:
+	* drop unused functions,
+  	* simplify encoding_norm_readline(), and rename to encoding_normalize_toascii()
+
 Fri Feb  1 00:58:05 CET 2008 (tk)
 ---------------------------------
  * libclamav: ndb sigs: add new target type (7) for ASCII files; handle
--- a/libclamav/entconv.c
+++ b/libclamav/entconv.c
@@ -127,67 +127,7 @@ const char* entity_norm(struct entity_conv* conv,const unsigned char* entity)
 	return NULL;
 }

-/* sane default, must be larger, than the longest possible return string,
- * which is
- * &#xxx;*/
-#define MIN_BUFFER_SIZE 32
-
-#define LINEMODE_LIMIT 16384
-
-int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
-{
-	if(buffer_size < MIN_BUFFER_SIZE) {
-		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
-		return CL_ENULLARG;
-	}
-	if(conv) {
-		conv->encoding = NULL;
-		conv->encoding_symbolic = E_UNKNOWN;
-		conv->bom_cnt = 0;
-		conv->buffer_size = buffer_size;
-		conv->priority = NOPRIO;
-		/* start in linemode */
-		conv->linemode = 1;
-		conv->linemode_processed = 0;
-
-		conv->tmp_area.offset = 0;
-		conv->tmp_area.length = 0;
-		conv->tmp_area.buffer  =  cli_malloc(buffer_size);
-		if(!conv->tmp_area.buffer) {
-			return CL_EMEM;
-		}
-
-		conv->out_area.offset = 0;
-		conv->out_area.length = buffer_size;
-		conv->out_area.buffer = cli_malloc(buffer_size);
-		if(!conv->out_area.buffer) {
-			free(conv->tmp_area.buffer);
-			return CL_EMEM;
-		}
-
-		conv->buffer_size = buffer_size;
-		conv->norm_area.offset = 0;
-		conv->norm_area.length = 0;
-		conv->norm_area.buffer = cli_malloc(buffer_size);
-		if(!conv->norm_area.buffer) {
-			free(conv->tmp_area.buffer);
-			free(conv->out_area.buffer);
-			return CL_EMEM;
-		}
-
-		conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
-		if(!conv->iconv_struct) {
-			free(conv->tmp_area.buffer);
-			free(conv->out_area.buffer);
-			free(conv->norm_area.buffer);
-			return CL_EMEM;
-		}
-		return 0;
-	}
-	else 
-		return CL_ENULLARG;
-}
-
+#ifndef HAVE_ICONV
 static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
 {
 	/* special case for these unusual byteorders */
@@ -217,7 +157,6 @@ static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
 	}
 }

-#ifndef HAVE_ICONV
 static iconv_t iconv_open(const char *tocode, const char* fromcode)
 {
 	iconv_t iconv = cli_malloc(sizeof(*iconv));
@@ -236,7 +175,6 @@ static int iconv_close(iconv_t cd)
 	return 0;
 }

-
 static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
 		char** outbuf, size_t *outbytesleft)
 {
@@ -426,14 +364,11 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,

 #endif

-/* new iconv() version */
-static inline void process_bom(struct entity_conv* conv)
+static inline const char* detect_encoding(const unsigned char* bom, uint8_t* bom_found, uint8_t* enc_width)
 {
-	const unsigned char* bom = conv->bom;
-	const char* encoding = NULL;
+	const char* encoding;
 	int has_bom = 0;
-	uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
-
+	uint8_t enc_bytes = 1; /* default is UTF8, which has a minimum of 1 bytes */
 	/* undecided 32-bit encodings are treated as ucs4, and
 	 * 16 bit as utf16*/
 	switch(bom[0]) {
@@ -442,23 +377,28 @@ static inline void process_bom(struct entity_conv* conv)
 				if(bom[2] == 0xFE && bom[3] == 0xFF) {
 					encoding = UCS4_1234;/* UCS-4 big-endian*/
 					has_bom = 1;
+					enc_bytes = 4;
 				}
 				else if(bom[2] == 0xFF && bom[3] == 0xFE) {
 					encoding = UCS4_2143;/* UCS-4 unusual order 2143 */
 					has_bom = 1;
+					enc_bytes = 4;
 				}
 				else if(bom[2] == 0x00 && bom[3] == 0x3C) {
 					/* undecided, treat as ucs4 */
 					encoding = UCS4_1234;
+					enc_bytes = 4;
 				}
 				else if(bom[2] == 0x3C && bom[3] == 0x00) {
 					encoding = UCS4_2143;
+					enc_bytes = 4;
 				}
 			}/* 0x00 0x00 */
 			else if(bom[1] == 0x3C) {
 				if(bom[2] == 0x00) {
 					if(bom[3] == 0x00) {
 						encoding = UCS4_3412;
+						enc_bytes = 4;
 					}
 					else if(bom[3] == 0x3F) {
 						encoding = UTF16_BE;
@@ -471,6 +411,7 @@ static inline void process_bom(struct entity_conv* conv)
 			if(bom[1] == 0xFE) {
 				if(bom[2] == 0x00 && bom[3] == 0x00) {
 					encoding = UCS4_4321;
+					enc_bytes = 4;
 					has_bom = 1;
 				}
 				else {
@@ -484,6 +425,7 @@ static inline void process_bom(struct entity_conv* conv)
 			if(bom[1] == 0xFF) {
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
 						encoding = UCS4_3412;
+						enc_bytes = 4;
 						has_bom = 1;
 					}
 					else {
@@ -504,6 +446,7 @@ static inline void process_bom(struct entity_conv* conv)
 				if(bom[1] == 0x00) {
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
 						encoding = UCS4_4321;
+						enc_bytes = 4;
 					}
 					else if(bom[2] == 0x3F && bom[3] == 0x00) {
 						encoding = UTF16_LE;
@@ -523,12 +466,19 @@ static inline void process_bom(struct entity_conv* conv)
 				}/*4C 6F A7 94*/
 				break;
 	}/*switch*/
-	if(encoding) {
-		cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
-		process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
-	}
-	conv->enc_bytes = enc_bytes;
-	conv->has_bom = has_bom;
+	*enc_width = enc_bytes;
+	*bom_found = has_bom;
+	return encoding;
+}
+
+/* detects UTF-16(LE/BE), UCS-4(all 4 variants).
+ * UTF-8 and simple ASCII are ignored, because we can process those as text */
+const char* encoding_detect_bom(const unsigned char* bom)
+{
+	uint8_t has_bom;
+	uint8_t enc_width;
+	const char* encoding = detect_encoding(bom, &has_bom, &enc_width);
+	return enc_width > 1 ? encoding : NULL;
 }

 /*()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
@@ -575,53 +525,6 @@ static char* normalize_encoding(const unsigned char* enc)
 	return norm;
 }

-static int encoding_norm_done(struct entity_conv* conv)
-{
-	if(conv->encoding) {
-		free(conv->encoding);
-		conv->encoding = NULL;
-	}
-	conv->buffer_size = 0;
-	if(conv->tmp_area.buffer) {
-		free(conv->tmp_area.buffer);
-		conv->tmp_area.buffer = NULL;
-	}
-	if(conv->out_area.buffer) {
-		free(conv->out_area.buffer);
-		conv->out_area.buffer = NULL;
-	}
-	if(conv->norm_area.buffer) {
-		free(conv->norm_area.buffer);
-		conv->norm_area.buffer = NULL;
-	}
-	if(conv->iconv_struct) {
-		free(conv->iconv_struct);
-	}
-	return 0;
-}
-
-int entity_norm_done(struct entity_conv* conv)
-{
-	return encoding_norm_done(conv);
-}
-
-static unsigned short bom_length(struct entity_conv* conv)
-{
-	if(conv->has_bom) {
-		switch(conv->enc_bytes) {
-			case 1:
-				if(conv->encoding_symbolic == E_UTF8) {
-					return 3;
-				}
-				break;
-			case 2:
-				return 2;
-			case 4:
-				return 4;
-		}
-	}
-	return 0;
-}
 /* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times,
 * just keep on each thread its own pool of iconvs*/

@@ -774,99 +677,36 @@ static iconv_t iconv_open_cached(const char* fromcode)
 	cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
 	iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
 	if(iconv_struct != (iconv_t)-1) {
-	idx = cache->last++;
-	if(idx >= cache->len) {
-		cache->len += 16;
-		cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
-		if(!cache->tab) {
-			cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
-			errno = ENOMEM;
-			return (iconv_t)-1;
+		idx = cache->last++;
+		if(idx >= cache->len) {
+			cache->len += 16;
+			cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
+			if(!cache->tab) {
+				cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
+				errno = ENOMEM;
+				return (iconv_t)-1;
+			}
 		}
-	}

-	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
+		hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
 		cache->tab[idx] = iconv_struct;
-	cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
-	return cache->tab[idx];
-}
-	return (iconv_t)-1;
-}
-
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
-{
-	char *tmp_encoding;
-	enum encodings tmp;
-	size_t new_size,old_size;
-
-	if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
-		if(conv->linemode) {
-			cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
-			conv->linemode = 0;
-		}
-		return;
-	}
-
-	cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio);
-
-	if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
-		cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
-		return;
-		/* Content-type in header is highest priority, no overrides possible.
-		 * Also no overrides after an encoding has been set.*/
-	}
-
-	/* validate encoding name, and normalize to uppercase */
-	if(!(tmp_encoding = normalize_encoding(encoding))) {
-		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
-		return;
-	}
-
-	/* don't allow to change between unicode encodings that have different byte-size */
-	if(prio == META) {
-		/* need to consider minimum size of an encoding here */
-		old_size =  conv->enc_bytes;
-		new_size = encoding_bytes(tmp_encoding,&tmp);
-		if(old_size != new_size)  {
-			/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
-			cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
-			free(tmp_encoding);
-			return;
-		}
-	}
-
-	conv->encoding = tmp_encoding;
-	cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
-	*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
-	if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
-		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
-		/* message shown only once/file */
-		/* what can we do? short-circuit iconv */
-		free(conv->encoding);
-		conv->encoding = NULL;
-		/* we will process using whatever we currently have for encoding_symbolic.
-		 * If encoding was already set to iconv, we shouldn't be here.*/
-		assert(conv->encoding_symbolic != E_ICONV);
-	} else {
-		cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
-		conv->encoding_symbolic = E_ICONV;
-		conv->priority = prio;
-		conv->linemode = 0;
+		cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
+		return cache->tab[idx];
 	}
+	return (iconv_t)-1;
 }

-static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
+static int in_iconv_u16(const m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
 {
 	char   tmp4[4];
 	size_t inleft = in_m_area->length - in_m_area->offset;
 	size_t rc, alignfix;
 	char*  input   = (char*)in_m_area->buffer + in_m_area->offset;
-	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
+	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;
 	char* out      = (char*)out_m_area->buffer;

+	out_m_area->offset = 0;
 	if(!inleft) {
-		/* EOF */
-		out_m_area->offset = out_m_area->length = 0;
 		return 0;
 	}
 	/* convert encoding conv->tmp_area. conv->out_area */
@@ -886,7 +726,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
 	while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */
 		const size_t outleft_last = outleft;
 		assert(*iconv_struct != (iconv_t)-1);
-		rc = iconv(*iconv_struct, (char**) &input,  &inleft, (char**) &out, &outleft);
+		rc = iconv(*iconv_struct, &input,  &inleft, &out, &outleft);
 		if(rc == (size_t)-1) {
 			if(errno == E2BIG) {
 				/* not enough space in output buffer */
@@ -909,9 +749,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
 		*out++ = *input++;
 		inleft--;
 	}
-	/* length - offset - alignfix is original value of inleft, new value is inleft, 
-	 * difference tells how much it moved. */
-	in_m_area->offset = in_m_area->length - alignfix - inleft;
+	cli_dbgmsg("in_iconv_u16: unprocessed bytes: %lu\n", (unsigned long)inleft);
 	if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
 		out_m_area->length -= (off_t)outleft;
 	} else {
@@ -922,156 +760,36 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
 	return 0;
 }

-
-#define NORMALIZE_CHAR(c, out, limit, linemode) \
-{\
-	        if (linemode && c == '\n') {\
-			i++;\
-			break;\
-		} else {\
-			unsigned char* out_new = u16_normalize(c, out, limit);\
-			if(out_new) {\
-				limit -= out_new - out;\
-			}\
-			out = out_new;\
-		}\
-}
-
-/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
- * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */
-#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit))
-#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length))
-
-/* EOF marker is m_area->length == 0 */
-
-/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read.
- * When we can't read anything due to EOF ->length will be set to 0.
- * bounds checks offset and length*/
-static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area)
 {
-	if(!m_area) {
-		size_t iread;
+	iconv_t iconv_struct;
+	off_t i, j;
+	char *encoding;

-		m_area = &conv->tmp_area;
-		if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
-			return m_area;
-		}
-		/* offset out of bounds -> all the buffer was processed, fill it again */
-		iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
-		m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
-		m_area->offset = 0;
-		if(ferror(stream)) {
-			cli_errmsg("Error while reading HTML stream\n");
-		}
-	} else {
-		if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
-			cli_dbgmsg(MODULE_NAME "EOF reached\n");
-			m_area->offset = m_area->length; /* EOF marker */
-		}
+	if(!initial_encoding || !in_m_area || !out_m_area) {
+		return CL_ENULLARG;
 	}
-	return m_area;
-}
-
-static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
-{
-	return ((uint16_t)buf[i] << 8) | buf[i+1];
-}

-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
-{
-	unsigned char* out = conv->out_area.buffer;
-	if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) {
-		return NULL;
-	}
-	if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
-		/* error encountered */
-		return NULL;
+	encoding = normalize_encoding((const unsigned char*)initial_encoding);
+	if(!encoding) {
+		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
+		return -1;
 	}
-	else {
-		const off_t input_limit  = in_m_area->length;
-		const unsigned char* input = in_m_area->buffer;
-		off_t input_offset = in_m_area->offset;
-		off_t limit = conv->out_area.length - 1;
-		off_t limit_prev = limit;
-		off_t i = 0;
-
-		/* read_raw() ensures this condition */
-		assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit));
-
-		if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
-			size_t bom_len;
-			memcpy(conv->bom, input, 4);
-			process_bom(conv);
-			bom_len = bom_length(conv);
-			in_m_area->offset = input_offset = input_offset + bom_len;
-			conv->bom_cnt = 1;
-		}
-
-		if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
-			cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
-			conv->linemode = 0;
-		}
-
-		switch(conv->encoding_symbolic) {
-			case E_ICONV:/* only in block-mode */
-				/* normalize already converted characters from a previous pass
-				 * (output buffer was full, and we couldn't normalize more in previous pass) */
-				for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
-					const uint16_t c = get_u16(conv->norm_area.buffer, i);
-					NORMALIZE_CHAR(c, out, limit, 0);
-				}
-				conv->norm_area.offset = i;
-			        if(limit > 0) {
-					conv->norm_area.length = conv->buffer_size;
-					in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
-
-					/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
-					for(i = 0;i < conv->norm_area.length && limit >  0 && out; i += 2) {
-						const uint16_t c = get_u16(conv->norm_area.buffer, i);
-						NORMALIZE_CHAR(c, out, limit, 0);
-					}
-					if(i) {
-						conv->norm_area.offset = i;
-					}
-				}
-				if(limit == limit_prev) {
-					/* output pointer didn't move => EOF */
-					return NULL;
-				}
-				break;
-				/* out_area must have enough space to allow all bytes in norm_area normalized,
-				 * if we norm with &x;, then we need 7* space. */
-			default:
-				cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
-				conv->encoding_symbolic = E_OTHER;
-			case E_UNKNOWN:
-			case E_OTHER:
-				if(!input_limit || input_offset == input_limit) {
-					/* nothing to do, EOF */
-					return NULL;
-				}
-				for(i = input_offset; i < input_limit && limit > 0; i++) {
-					const unsigned char c = input[i];
-					if(conv->linemode && c == '\n') {
-						i++;
-						break;
-					}
-					if(c) {
-						*out++ = c;
-						limit--;
-					}
-				}
-				in_m_area->offset = i;
-		}

-
-		if(conv->linemode) {
-			conv->linemode_processed += i - input_offset;
+	cli_dbgmsg(MODULE_NAME "Encoding %s\n", encoding);
+	iconv_struct = iconv_open_cached( encoding );
+	if(iconv_struct == (iconv_t)-1) {
+		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open(): %s\n", encoding);
+		free(encoding);
+		return -1;
+	}
+	in_iconv_u16(in_m_area, &iconv_struct, out_m_area);
+	for(i = 0, j = 0; i < out_m_area->length ; i += 2) {
+		const unsigned char c = (out_m_area->buffer[i] << 4) + out_m_area->buffer[i+1];
+		if(c) {
+			out_m_area->buffer[j++] = c;
 		}
-
-		if(limit < 0) limit = 0;
-		conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
-		return conv->out_area.buffer;
 	}
+	out_m_area->length = j;
+	return 0;
 }
-
--- a/libclamav/entconv.h
+++ b/libclamav/entconv.h
@@ -44,37 +44,20 @@
 #define UNKNOWN "\0"
 #define OTHER   "OTHER"

+
 enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};

 enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
 #define MAX_ENTITY_SIZE 22

 struct entity_conv {
-	char* encoding;
-	enum encoding_priority priority;
-	enum encodings encoding_symbolic;
-	size_t buffer_size;
-	void* iconv_struct;
 	unsigned char entity_buff[MAX_ENTITY_SIZE+2];
-	m_area_t tmp_area;
-	m_area_t out_area;
-	m_area_t norm_area;
-	int      linemode;/* TODO:set */
-	int      linemode_processed;
-	unsigned char bom[4];
-	uint8_t has_bom;
-	uint8_t enc_bytes;
-	uint8_t  bom_cnt;
 };

-int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
-int entity_norm_done(struct entity_conv* conv);
-
 unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size);
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
 const char* entity_norm(struct entity_conv* conv,const unsigned char* entity);
-int entitynorm_init(void);
+const char* encoding_detect_bom(const unsigned char* bom);
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area);

 #endif

--- a/libclamav/filetypes.c
+++ b/libclamav/filetypes.c
@@ -182,37 +182,42 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
 	    cli_ac_freedata(&mdata);

 	    if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
-		    struct entity_conv conv;
-		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
-
-		    /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
-		    if(init_entity_converter(&conv, conv_size) == 0) {
-			    m_area_t area;
-			    area.buffer = (unsigned char *) smallbuff;
-			    area.length = bread;
-			    area.offset = 0;
-
-			    /* switch to blockmode, so that we convert all the input buffer at once,
-			     * rather than line-by-line */
-			    process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
-
-			    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
-				    return ret;
-
-			    decoded =  encoding_norm_readline(&conv, NULL, &area);
-
-			    if(decoded) {
-				    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
-				    if(sret == CL_TYPE_HTML) {
-					    ret = CL_TYPE_HTML;
+		    const char* encoding;
+
+		    /* check if we can autodetect this encoding.
+		     * If we can't don't try to detect HTML sig, since
+		     * we just tried that above, and failed */
+		    if((encoding = encoding_detect_bom(smallbuff))) {
+			    unsigned char decodedbuff[sizeof(smallbuff)*2];
+			    m_area_t in_area, out_area;
+
+			    in_area.buffer = (unsigned char *) smallbuff;
+			    in_area.length = bread;
+			    in_area.offset = 0;
+			    out_area.buffer = decodedbuff;
+			    out_area.length = sizeof(decodedbuff);
+			    out_area.offset = 0;
+
+			    /* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode 
+			     * (multibyte characters will not be exactly handled, but that is not a problem).
+			     * However when detecting whether a file is HTML or not, we need exact conversion.
+			     * (just eliminating zeros and matching would introduce false positives */
+			    if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
+				    out_area.buffer[out_area.length] = '\0';
+				    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
+					    return ret;
+
+				    if(out_area.length > 0) {
+					    sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
+					    if(sret == CL_TYPE_HTML) {
+						    cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
+						    /* htmlnorm is able to handle any unicode now, since it skips null chars */
+						    ret = CL_TYPE_HTML;
+					    }
 				    }
-			    }

-			    cli_ac_freedata(&mdata);
-
-			    entity_norm_done(&conv);
-		    } else {
-			    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
+				    cli_ac_freedata(&mdata);
+			    }
 		    }
 	    }
 	}
--- a/libclamav/htmlnorm.c
+++ b/libclamav/htmlnorm.c
@@ -542,13 +542,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 		}
 	}

-	if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
-		if (!m_area) {
-			fclose(stream_in);
-		}
-		return rc;
-	}
-
 	tag_args.count = 0;
 	tag_args.tag = NULL;
 	tag_args.value = NULL;
@@ -628,10 +621,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag

 	binary = FALSE;

-	if(dconf_entconv)
-		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
-	else
-		ptr = line = cli_readchunk(stream_in, m_area, 8192);
+	ptr = line = cli_readchunk(stream_in, m_area, 8192);

 	while (line) {
 		if(href_contents_begin)
@@ -989,37 +979,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 						in_script = TRUE;
 					}
 					html_output_tag(file_buff_script, tag, &tag_args);
-				} else if (dconf_entconv && strcmp(tag, "body") == 0) {
-					/* no more charset changes accepted after body encountered */
-					process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
-				} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
-					const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
-					const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
-					if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) {
-						size_t len = strlen((const char*)http_content);
-						unsigned char* http_content2 = cli_malloc( len + 1);
-						unsigned char* charset;
-						size_t i;
-
-						if(!http_content2)
-							return CL_EMEM;
-						for(i = 0; i < len; i++)
-							http_content2[i] = tolower(http_content[i]);
-						http_content2[len] = '\0';
-						charset = (unsigned char*) strstr((char*)http_content2,"charset");
-						if(charset) {
-							while(*charset && *charset != '=')
-								charset++;
-							if(*charset)
-								charset++;/* skip = */
-							len = strcspn((const char*)charset," \"'");
-							charset[len] = '\0';
-							if(len) {
-								process_encoding_set(&conv, charset, META);
-							}
-						}
-						free(http_content2);
-					}
 				} else if (hrefs) {
 					if(in_ahref && !href_contents_begin)
 						href_contents_begin=ptr;
@@ -1533,12 +1492,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 			/* end of line, append contents now, resume on next line */
 			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
 		ptrend = NULL;
-		if(dconf_entconv)
-			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
-		else {
-			free(line);
-			ptr = line = cli_readchunk(stream_in, m_area, 8192);
-		}
+		free(line);
+		ptr = line = cli_readchunk(stream_in, m_area, 8192);
 	}

 	if(dconf_entconv) {
@@ -1566,8 +1521,6 @@ abort:
 	if (in_ahref) /* tag not closed, force closing */
 		html_tag_contents_done(hrefs,in_ahref);

-	if(dconf_entconv)
-		entity_norm_done(&conv);
 	html_tag_arg_free(&tag_args);
 	if (!m_area) {
 		fclose(stream_in);
@@ -1593,11 +1546,11 @@ abort:
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
 {
 	m_area_t m_area;
-	
+
 	m_area.buffer = in_buff;
 	m_area.length = in_size;
 	m_area.offset = 0;
-	
+
 	return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
 }

@@ -1607,7 +1560,7 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const
 	int retval=FALSE;
 	m_area_t m_area;
 	struct stat statbuf;
-	
+
 	if (fstat(fd, &statbuf) == 0) {
 		m_area.length = statbuf.st_size;
 		m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0);
--- a/libclamav/htmlnorm.h
+++ b/libclamav/htmlnorm.h
@@ -36,7 +36,7 @@ typedef struct m_area_tag {
 } m_area_t;

 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
-int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
+int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
 void html_tag_arg_free(tag_arguments_t *tags);
 int html_screnc_decode(int fd, const char *dirname);