bb #4097

Normalize Big5 dot in html.
14 years ago · 13bfb27361
parent f5092717cd
commit 13bfb27361
2 changed files with 71 additions and 3 deletions
--- a/libclamav/entconv.c
+++ b/libclamav/entconv.c
@ -78,8 +78,11 @@ static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, con
 	if(u16 < 0xff) {
 		assert((uint8_t)u16 != 0);
 		*out++ = (uint8_t)u16;
-	} else {
-		size_t i;
+	} else if (u16 == 0x3002 || u16 == 0xFF0E || u16 == 0xFE52) {
+            /* bb #4097 */
+                *out++ = '.';
+        } else {
+                size_t i;
 		/* normalize only >255 to speed up */
 		if(limit <=  8) {
 			/* not enough space available */
--- a/libclamav/htmlnorm.c
+++ b/libclamav/htmlnorm.c
@ -53,6 +53,7 @@
 typedef enum {
    HTML_BAD_STATE,
    HTML_NORM,
+    HTML_8BIT,
    HTML_COMMENT,
    HTML_CHAR_REF,
    HTML_ENTITY_REF_DECODE,
@ -475,10 +476,36 @@ void html_tag_arg_free(tag_arguments_t *tags)
 static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end)
 {
 	size_t i;
+        uint32_t mbchar = 0;
 	if(!begin || !end)
 		return;
 	for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) {
-		cont->contents[i] = *begin++;
+            uint8_t c = *begin++;
+            if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
+                if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
+                    || mbchar == 0xEFB992 ||
+                    mbchar == 0xA143 || mbchar == 0xA144 ||
+                    mbchar == 0xA14F) {
+                    cont->contents[i++] = '.';
+                } else {
+                    uint8_t c0 = mbchar >> 16;
+                    uint8_t c1 = (mbchar >> 8)&0xff;
+                    uint8_t c2 = (mbchar & 0xff);
+                    if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH)
+                        cont->contents[i++] = c0;
+                    if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH)
+                        cont->contents[i++] = c1;
+                    if (i+1 < MAX_TAG_CONTENTS_LENGTH)
+                        cont->contents[i++] = c2;
+                }
+                mbchar = 0;
+            }
+            if (c >= 0x80) {
+                mbchar = (mbchar << 8) | c;
+                --i;
+            }
+            else
+		cont->contents[i] = c;
 	}
 	cont->pos = i;
 }
@ -636,6 +663,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 	struct parser_state *js_state = NULL;
 	const unsigned char *js_begin = NULL, *js_end = NULL;
 	struct tag_contents contents;
+        uint32_t mbchar = 0;

 	tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
 	contents.pos = 0;
@ -754,6 +782,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 					next_state = HTML_BAD_STATE;
 				}
 				break;
+                        case HTML_8BIT: /* HTML_NORM enters this state with the first byte >= 0x80 already in mbchar */
+                                if (*ptr < 0x80 || mbchar >= 0x10000) { /* run ended, or three bytes collected: emit it */
+                                    if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
+                                        || mbchar == 0xEFB992 ||
+                                        mbchar == 0xA143 || mbchar == 0xA144 ||
+                                        mbchar == 0xA14F) { /* NOTE(review): bytes < 0x80 are never collected, so the 0xA1xx values look unreachable - confirm */
+                                        /* bb #4097: UTF-8 U+3002, U+FF0E, U+FE52 (and the two-byte dot values) are emitted as '.' */
+                                        html_output_c(file_buff_o2, '.');
+                                        html_output_c(file_buff_text, '.');
+                                    } else {
+                                        uint8_t c0 = mbchar >> 16; /* zero unless three bytes were collected */
+                                        uint8_t c1 = (mbchar >> 8)&0xff;
+                                        uint8_t c2 = (mbchar & 0xff);
+                                        if (c0) {
+                                            html_output_c(file_buff_o2, c0);
+                                            html_output_c(file_buff_text, c0);
+                                        }
+                                        if (c0 || c1) {
+                                            html_output_c(file_buff_o2, c1);
+                                            html_output_c(file_buff_text, c1);
+                                        }
+                                        html_output_c(file_buff_o2, c2);
+                                        html_output_c(file_buff_text, c2); /* was c1: the last byte must match what file_buff_o2 receives */
+                                    }
+                                    mbchar = 0;
+                                    state = next_state; /* *ptr is not consumed here; the next state processes it */
+                                    next_state = HTML_NORM;
+                                } else {
+                                    mbchar = (mbchar << 8) | *ptr; /* append this byte below the ones already collected */
+                                    ptr++;
+                                }
+                                break;
 			case HTML_NORM:
 				if (*ptr == '<') {
 					ptrend=ptr; /* for use by scanContents */
@ -786,6 +846,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
 					state = HTML_CHAR_REF;
 					next_state = HTML_NORM;
 					ptr++;
+                                } else if (*ptr >= 0x80) {
+                                        state = HTML_8BIT;
+                                        next_state = HTML_NORM;
+                                        mbchar = *ptr;
+                                        ptr++;
 				} else {
 					unsigned char c = tolower(*ptr);
 					/* normalize ' to " for scripts */