Normalize Big5 dot in html.
pull/25/head
Török Edvin 14 years ago
parent f5092717cd
commit 13bfb27361
  1. 7
      libclamav/entconv.c
  2. 67
      libclamav/htmlnorm.c

@ -78,8 +78,11 @@ static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, con
if(u16 < 0xff) {
assert((uint8_t)u16 != 0);
*out++ = (uint8_t)u16;
} else {
size_t i;
} else if (u16 == 0x3002 || u16 == 0xFF0E || u16 == 0xFE52) {
/* bb #4097 */
*out++ = '.';
} else {
size_t i;
/* normalize only >255 to speed up */
if(limit <= 8) {
/* not enough space available */

@ -53,6 +53,7 @@
typedef enum {
HTML_BAD_STATE,
HTML_NORM,
HTML_8BIT,
HTML_COMMENT,
HTML_CHAR_REF,
HTML_ENTITY_REF_DECODE,
@ -475,10 +476,36 @@ void html_tag_arg_free(tag_arguments_t *tags)
/*
 * Store a buffered run of high (>= 0x80) bytes into cont->contents at `pos`.
 *
 * mbchar holds up to three such bytes, most significant first. The UTF-8
 * encodings of U+3002, U+FF0E and U+FE52 are collapsed to a plain '.'
 * (bb #4097); any other run is copied through byte for byte. Nothing is
 * written at or beyond index MAX_TAG_CONTENTS_LENGTH.
 *
 * Returns the updated write position.
 */
static inline size_t html_tag_contents_flush_mb(struct tag_contents *cont, size_t pos, uint32_t mbchar)
{
    const uint8_t c0 = (uint8_t)(mbchar >> 16);
    const uint8_t c1 = (uint8_t)((mbchar >> 8) & 0xff);
    const uint8_t c2 = (uint8_t)(mbchar & 0xff);

    if (!mbchar)
        return pos;

    if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992) {
        if (pos < MAX_TAG_CONTENTS_LENGTH)
            cont->contents[pos++] = '.';
        return pos;
    }

    /* Leading zero bytes mean the run was shorter than three bytes. */
    if (c0 && pos < MAX_TAG_CONTENTS_LENGTH)
        cont->contents[pos++] = c0;
    if ((c0 || c1) && pos < MAX_TAG_CONTENTS_LENGTH)
        cont->contents[pos++] = c1;
    if (pos < MAX_TAG_CONTENTS_LENGTH)
        cont->contents[pos++] = c2;
    return pos;
}

/*
 * Append the bytes in [begin, end) to cont->contents, starting at cont->pos,
 * and update cont->pos. Copying stops once MAX_TAG_CONTENTS_LENGTH bytes are
 * stored; excess input is discarded. A NULL begin or end is a no-op.
 *
 * While copying, "dot" characters are normalized to ASCII '.' (bb #4097):
 *   - UTF-8:  E3 80 82 (U+3002), EF BC 8E (U+FF0E), EF B9 92 (U+FE52)
 *   - Big5:   A1 43, A1 44, A1 4F
 *
 * Assumes cont->contents has room for at least MAX_TAG_CONTENTS_LENGTH
 * bytes, which is the same bound the copy loop has always relied on.
 */
static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end)
{
    size_t pos;
    uint32_t mbchar = 0; /* pending run of bytes >= 0x80, at most three */

    if(!begin || !end)
        return;

    pos = cont->pos;
    while (begin < end && pos < MAX_TAG_CONTENTS_LENGTH) {
        const uint8_t c = *begin++;

        /* A run ends at the first ASCII byte or once three bytes are held. */
        if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
            if (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F)) {
                /*
                 * Big5 dot: the trail byte is ASCII, so it never ends up in
                 * mbchar; match on lead byte + current byte and swallow both.
                 */
                cont->contents[pos++] = '.';
                mbchar = 0;
                continue;
            }
            pos = html_tag_contents_flush_mb(cont, pos, mbchar);
            mbchar = 0;
        }

        if (c >= 0x80)
            mbchar = (mbchar << 8) | c;
        else if (pos < MAX_TAG_CONTENTS_LENGTH)
            cont->contents[pos++] = c;
    }

    /* Do not lose a multibyte character that ends exactly at `end`. */
    pos = html_tag_contents_flush_mb(cont, pos, mbchar);
    cont->pos = pos;
}
@ -636,6 +663,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
struct parser_state *js_state = NULL;
const unsigned char *js_begin = NULL, *js_end = NULL;
struct tag_contents contents;
uint32_t mbchar = 0;
tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
contents.pos = 0;
@ -754,6 +782,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
next_state = HTML_BAD_STATE;
}
break;
case HTML_8BIT:
    /*
     * mbchar accumulates a run of bytes >= 0x80 (seeded by HTML_NORM).
     * The run is emitted once an ASCII byte shows up or three bytes are
     * buffered; "dot" characters are normalized to '.' (bb #4097).
     */
    if (*ptr < 0x80 || mbchar >= 0x10000) {
        if (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F)) {
            /*
             * Big5 dot (A1 43 / A1 44 / A1 4F): the trail byte is ASCII and
             * therefore never part of mbchar, so match it here and consume it.
             */
            html_output_c(file_buff_o2, '.');
            html_output_c(file_buff_text, '.');
            ptr++;
        } else if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992) {
            /* UTF-8 encoded U+3002, U+FF0E, U+FE52 */
            html_output_c(file_buff_o2, '.');
            html_output_c(file_buff_text, '.');
        } else {
            /* Not a dot: pass the buffered bytes through unchanged. */
            uint8_t c0 = mbchar >> 16;
            uint8_t c1 = (mbchar >> 8)&0xff;
            uint8_t c2 = (mbchar & 0xff);
            if (c0) {
                html_output_c(file_buff_o2, c0);
                html_output_c(file_buff_text, c0);
            }
            if (c0 || c1) {
                html_output_c(file_buff_o2, c1);
                html_output_c(file_buff_text, c1);
            }
            /* Both outputs must receive the last byte (c2), not c1 again. */
            html_output_c(file_buff_o2, c2);
            html_output_c(file_buff_text, c2);
        }
        mbchar = 0;
        state = next_state;
        next_state = HTML_NORM;
    } else {
        mbchar = (mbchar << 8) | *ptr;
        ptr++;
    }
    break;
case HTML_NORM:
if (*ptr == '<') {
ptrend=ptr; /* for use by scanContents */
@ -786,6 +846,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
state = HTML_CHAR_REF;
next_state = HTML_NORM;
ptr++;
} else if (*ptr >= 0x80) {
state = HTML_8BIT;
next_state = HTML_NORM;
mbchar = *ptr;
ptr++;
} else {
unsigned char c = tolower(*ptr);
/* normalize ' to " for scripts */

Loading…
Cancel
Save