|
|
|
@ -53,6 +53,7 @@ |
|
|
|
|
typedef enum { |
|
|
|
|
HTML_BAD_STATE, |
|
|
|
|
HTML_NORM, |
|
|
|
|
HTML_8BIT, |
|
|
|
|
HTML_COMMENT, |
|
|
|
|
HTML_CHAR_REF, |
|
|
|
|
HTML_ENTITY_REF_DECODE, |
|
|
|
@ -475,10 +476,36 @@ void html_tag_arg_free(tag_arguments_t *tags) |
|
|
|
|
static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end) |
|
|
|
|
{ |
|
|
|
|
size_t i; |
|
|
|
|
uint32_t mbchar = 0; |
|
|
|
|
if(!begin || !end) |
|
|
|
|
return; |
|
|
|
|
for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) { |
|
|
|
|
cont->contents[i] = *begin++; |
|
|
|
|
uint8_t c = *begin++; |
|
|
|
|
if (mbchar && (c < 0x80 || mbchar >= 0x10000)) { |
|
|
|
|
if (mbchar == 0xE38082 || mbchar == 0xEFBC8E |
|
|
|
|
|| mbchar == 0xEFB992 || |
|
|
|
|
mbchar == 0xA143 || mbchar == 0xA144 || |
|
|
|
|
mbchar == 0xA14F) { |
|
|
|
|
cont->contents[i++] = '.'; |
|
|
|
|
} else { |
|
|
|
|
uint8_t c0 = mbchar >> 16; |
|
|
|
|
uint8_t c1 = (mbchar >> 8)&0xff; |
|
|
|
|
uint8_t c2 = (mbchar & 0xff); |
|
|
|
|
if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
|
|
|
cont->contents[i++] = c0; |
|
|
|
|
if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
|
|
|
cont->contents[i++] = c1; |
|
|
|
|
if (i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
|
|
|
cont->contents[i++] = c2; |
|
|
|
|
} |
|
|
|
|
mbchar = 0; |
|
|
|
|
} |
|
|
|
|
if (c >= 0x80) { |
|
|
|
|
mbchar = (mbchar << 8) | c; |
|
|
|
|
--i; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
cont->contents[i] = c; |
|
|
|
|
} |
|
|
|
|
cont->pos = i; |
|
|
|
|
} |
|
|
|
@ -636,6 +663,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
|
|
|
|
struct parser_state *js_state = NULL; |
|
|
|
|
const unsigned char *js_begin = NULL, *js_end = NULL; |
|
|
|
|
struct tag_contents contents; |
|
|
|
|
uint32_t mbchar = 0; |
|
|
|
|
|
|
|
|
|
tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/ |
|
|
|
|
contents.pos = 0; |
|
|
|
@ -754,6 +782,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
|
|
|
|
next_state = HTML_BAD_STATE; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case HTML_8BIT: |
|
|
|
|
if (*ptr < 0x80 || mbchar >= 0x10000) { |
|
|
|
|
if (mbchar == 0xE38082 || mbchar == 0xEFBC8E |
|
|
|
|
|| mbchar == 0xEFB992 || |
|
|
|
|
mbchar == 0xA143 || mbchar == 0xA144 || |
|
|
|
|
mbchar == 0xA14F) { |
|
|
|
|
/* bb #4097 */ |
|
|
|
|
html_output_c(file_buff_o2, '.'); |
|
|
|
|
html_output_c(file_buff_text, '.'); |
|
|
|
|
} else { |
|
|
|
|
uint8_t c0 = mbchar >> 16; |
|
|
|
|
uint8_t c1 = (mbchar >> 8)&0xff; |
|
|
|
|
uint8_t c2 = (mbchar & 0xff); |
|
|
|
|
if (c0) { |
|
|
|
|
html_output_c(file_buff_o2, c0); |
|
|
|
|
html_output_c(file_buff_text, c0); |
|
|
|
|
} |
|
|
|
|
if (c0 || c1) { |
|
|
|
|
html_output_c(file_buff_o2, c1); |
|
|
|
|
html_output_c(file_buff_text, c1); |
|
|
|
|
} |
|
|
|
|
html_output_c(file_buff_o2, c2); |
|
|
|
|
html_output_c(file_buff_text, c1); |
|
|
|
|
} |
|
|
|
|
mbchar = 0; |
|
|
|
|
state = next_state; |
|
|
|
|
next_state = HTML_NORM; |
|
|
|
|
} else { |
|
|
|
|
mbchar = (mbchar << 8) | *ptr; |
|
|
|
|
ptr++; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case HTML_NORM: |
|
|
|
|
if (*ptr == '<') { |
|
|
|
|
ptrend=ptr; /* for use by scanContents */ |
|
|
|
@ -786,6 +846,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
|
|
|
|
state = HTML_CHAR_REF; |
|
|
|
|
next_state = HTML_NORM; |
|
|
|
|
ptr++; |
|
|
|
|
} else if (*ptr >= 0x80) { |
|
|
|
|
state = HTML_8BIT; |
|
|
|
|
next_state = HTML_NORM; |
|
|
|
|
mbchar = *ptr; |
|
|
|
|
ptr++; |
|
|
|
|
} else { |
|
|
|
|
unsigned char c = tolower(*ptr); |
|
|
|
|
/* normalize ' to " for scripts */ |
|
|
|
|