apply entconv patch from Edwin

git-svn: trunk@2675
remotes/push_mirror/metadata
Tomasz Kojm 19 years ago
parent bda5598b7e
commit c8184020c4
  1. 8
      clamav-devel/ChangeLog
  2. 2
      clamav-devel/libclamav/encoding_aliases.h
  3. 303
      clamav-devel/libclamav/entconv.c
  4. 1
      clamav-devel/libclamav/entconv.h
  5. 2
      clamav-devel/libclamav/htmlnorm.c

@ -1,3 +1,11 @@
Wed Feb 7 17:20:12 CET 2007 (tk)
---------------------------------
* libclamav: apply entconv patch from Edwin:
- workaround sarge libc leak using a per-thread cache
- normalize <0x20 chars too
- fix utf-16 double-decoding
- fix performance issue with some encodings
Sun Feb 4 17:58:16 CET 2007 (tk) Sun Feb 4 17:58:16 CET 2007 (tk)
--------------------------------- ---------------------------------
* libclamav: remove some warnings from gcc * libclamav: remove some warnings from gcc

@ -23,7 +23,6 @@
#define _ENCODING_ALIASES_H #define _ENCODING_ALIASES_H
#include "clamav-config.h" #include "clamav-config.h"
#ifndef HAVE_ICONV_H
#include <stdio.h> #include <stdio.h>
#include "hashtab.h" #include "hashtab.h"
@ -90,4 +89,3 @@ const struct hashtable aliases_htable = {
}; };
#endif #endif
#endif

@ -32,6 +32,11 @@
#include <dirent.h> #include <dirent.h>
#include <errno.h> #include <errno.h>
#ifdef CL_THREAD_SAFE
#include <pthread.h>
#endif
#include "clamav.h" #include "clamav.h"
#include "others.h" #include "others.h"
#include "htmlnorm.h" #include "htmlnorm.h"
@ -41,9 +46,9 @@
#ifdef HAVE_ICONV_H #ifdef HAVE_ICONV_H
#include <iconv.h> #include <iconv.h>
#else
#include "encoding_aliases.h"
#endif #endif
#include "encoding_aliases.h"
#define MAX_LINE 1024 #define MAX_LINE 1024
@ -136,6 +141,7 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
} }
conv->ht = &entities_htable; conv->ht = &entities_htable;
conv->msg_zero_shown = 0;
return 0; return 0;
} }
@ -143,55 +149,57 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
return CL_ENULLARG; return CL_ENULLARG;
} }
#ifndef HAVE_ICONV_H static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* encoding)
typedef struct {
enum encodings encoding;
size_t size;
} * iconv_t;
static iconv_t iconv_open(const char *tocode, const char *fromcode)
{ {
const unsigned char* from = (const unsigned char*) fromcode; const unsigned char* from = (const unsigned char*) fromcode;
iconv_t iconv = cli_malloc(sizeof(*iconv));
if(!iconv)
return NULL;
iconv->encoding = E_OTHER;
iconv->size = 1;
/*TODO: check that tocode is UTF16BE */
/* special case for these unusual byteorders */ /* special case for these unusual byteorders */
*encoding=E_OTHER;
if(from == UCS4_2143) if(from == UCS4_2143)
iconv->encoding = E_UCS4_2134; *encoding = E_UCS4_2134;
else if (from == UCS4_3412) else if (from == UCS4_3412)
iconv->encoding = E_UCS4_3412; *encoding = E_UCS4_3412;
else { else {
struct element * e = hashtab_find(&aliases_htable,from,strlen(fromcode)); struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
if(e && e->key) { if(e && e->key) {
iconv->encoding = e->data; *encoding = e->data;
} }
} }
switch(iconv->encoding) { switch(*encoding) {
case E_UCS4: case E_UCS4:
case E_UCS4_1234: case E_UCS4_1234:
case E_UCS4_4321: case E_UCS4_4321:
case E_UCS4_2134: case E_UCS4_2134:
case E_UCS4_3412: case E_UCS4_3412:
iconv->size = 4; return 4;
break;
case E_UTF16: case E_UTF16:
case E_UTF16_BE: case E_UTF16_BE:
case E_UTF16_LE: case E_UTF16_LE:
iconv->size = 2; return 2;
break;
case E_UTF8: case E_UTF8:
case E_UNKNOWN: case E_UNKNOWN:
case E_OTHER: case E_OTHER:
default: default:
iconv->size = 1; return 1;
} }
}
#ifndef HAVE_ICONV_H
typedef struct {
enum encodings encoding;
size_t size;
} * iconv_t;
static iconv_t iconv_open(const char *tocode, const char* fromcode)
{
iconv_t iconv = cli_malloc(sizeof(*iconv));
if(!iconv)
return NULL;
/* TODO: check that tocode is UTF16BE */
iconv->size = encoding_bytes(fromcode,&iconv->encoding);
return iconv; return iconv;
} }
}
static int iconv_close(iconv_t cd) static int iconv_close(iconv_t cd)
{ {
@ -379,6 +387,10 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
return 0; return 0;
} }
#else
#endif #endif
/* new iconv() version */ /* new iconv() version */
@ -495,18 +507,20 @@ static unsigned char* normalize_encoding(const unsigned char* enc)
return norm; return norm;
} }
static const char* encoding_name(unsigned char* encoding) static const unsigned char* encoding_name(unsigned char* encoding)
{ {
if(!encoding) if(!encoding)
return "ISO-8859-1"; return (const unsigned char*)"ISO-8859-1";
else else
return (char*)encoding; return encoding;
} }
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
{ {
unsigned char *tmp_encoding;
enum encodings tmp;
size_t new_size,old_size;
cli_dbgmsg("Setting encoding for %x to %s, priority: %d\n",conv, encoding, prio); cli_dbgmsg("Setting encoding for %x to %s, priority: %d\n",conv, encoding, prio);
if(encoding == OTHER) if(encoding == OTHER)
return; return;
@ -514,8 +528,17 @@ void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding
return;/* Content-type in header is highest priority, no overrides possible*/ return;/* Content-type in header is highest priority, no overrides possible*/
if(conv->priority == BOM && prio == NOBOM_AUTODETECT) if(conv->priority == BOM && prio == NOBOM_AUTODETECT)
return; return;
tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
old_size = encoding_bytes(conv->encoding,&tmp);
new_size = encoding_bytes(tmp_encoding,&tmp);
if(old_size != new_size) {
cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%ld) != %s(%ld)\n",conv->encoding,old_size,tmp_encoding,new_size);
free(tmp_encoding);
return;
}
free(conv->encoding); free(conv->encoding);
conv->encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/ conv->encoding = tmp_encoding;
cli_dbgmsg("New encoding for %x:%s\n",conv,conv->encoding); cli_dbgmsg("New encoding for %x:%s\n",conv,conv->encoding);
/* reset stream */ /* reset stream */
} }
@ -595,24 +618,192 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, unsigned int max_len, uns
} }
} }
static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in) static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft)
{ {
if(conv->has_bom) { if(conv->has_bom) {
switch(conv->enc_bytes) { switch(conv->enc_bytes) {
case 1: case 1:
if(conv->autodetected == UTF8) if(conv->autodetected == UTF8) {
*in += 3; *in += 3;
*inleft -= 3;
}
break; break;
case 2: case 2:
*in += 2; *in += 2;
*inleft -= 2;
break; break;
case 4: case 4:
*in += 4; *in += 4;
*inleft -= 4;
break; break;
} }
} }
} }
/* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times,
* just keep on each thread its own pool of iconvs*/
struct iconv_cache {
iconv_t* tab;
size_t len;
size_t last;
struct hashtable hashtab;
};
static void iconv_cache_init(struct iconv_cache* cache)
{
/* cache->tab = NULL;
cache->len = 0;
cache->used = 0; - already done by memset*/
cli_dbgmsg("Initializing iconv pool:%p\n",cache);
hashtab_init(&cache->hashtab, 32);
}
static void iconv_cache_destroy(struct iconv_cache* cache)
{
size_t i;
cli_dbgmsg("Destroying iconv pool:%p\n",cache);
for(i=0;i < cache->last;i++) {
cli_dbgmsg("closing iconv:%p\n",cache->tab[i]);
iconv_close(cache->tab[i]);
}
hashtab_clear(&cache->hashtab);
free(cache->hashtab.htable);
free(cache->tab);
free(cache);
}
#ifdef CL_THREAD_SAFE
static pthread_key_t iconv_pool_tls_key;
static pthread_once_t iconv_pool_tls_key_once = PTHREAD_ONCE_INIT;
/* destructor called for all threads that exit via pthread_exit, or cancellation. Unfortunately that doesn't include
* the main thread, so we have to call this manually for the main thread.*/
static int cache_atexit_registered = 0;
static void iconv_pool_tls_instance_destroy(void* ptr)
{
if(ptr) {
iconv_cache_destroy(ptr);
}
}
static void iconv_cache_cleanup_main(void)
{
struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key);
if(cache) {
iconv_pool_tls_instance_destroy(cache);
pthread_setspecific(iconv_pool_tls_key,NULL);
}
pthread_key_delete(iconv_pool_tls_key);
}
static void iconv_pool_tls_key_alloc(void)
{
pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy);
if(!cache_atexit_registered) {
cli_dbgmsg("iconv:registering atexit\n");
if(atexit(iconv_cache_cleanup_main)) {
cli_dbgmsg("failed to register atexit\n");
}
cache_atexit_registered = 1;
}
}
static void init_iconv_pool_ifneeded(void)
{
pthread_once(&iconv_pool_tls_key_once, iconv_pool_tls_key_alloc);
}
static inline struct iconv_cache* cache_get_tls_instance(void)
{
struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key);
if(!cache) {
cache = cli_calloc(1,sizeof(*cache));
if(!cache) {
cli_dbgmsg("!Out of memory allocating TLS iconv instance\n");
return NULL;
}
iconv_cache_init(cache);
pthread_setspecific(iconv_pool_tls_key, cache);
}
return cache;
}
#else
static struct iconv_cache* global_iconv_cache = NULL;
static int iconv_global_inited = 0;
static void iconv_cache_cleanup_main(void)
{
iconv_cache_destroy(global_iconv_cache);
}
static inline void init_iconv_pool_ifneeded()
{
if(!iconv_global_inited) {
global_iconv_cache = cli_calloc(1,sizeof(*global_iconv_cache));
if(global_iconv_cache) {
iconv_cache_init(global_iconv_cache);
atexit(iconv_cache_cleanup_main);
iconv_global_inited = 1;
}
}
}
static inline struct iconv_cache* cache_get_tls_instance(void)
{
return global_iconv_cache;
}
#endif
static iconv_t iconv_open_cached(const unsigned char* fromcode)
{
struct iconv_cache * cache;
size_t idx;
const size_t fromcode_len = strlen((const char*)fromcode);
struct element * e;
init_iconv_pool_ifneeded();
cache = cache_get_tls_instance();/* gets TLS iconv pool */
if(!cache) {
cli_dbgmsg("!Unable to get TLS iconv cache!\n");
errno = EINVAL;
return (iconv_t)-1;
}
e = hashtab_find(&cache->hashtab, fromcode, fromcode_len);
if(e && (e->data < 0 || (size_t)e->data > cache->len)) {
e = NULL;
}
if(e) {
return cache->tab[e->data];
}
cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode);
idx = cache->last++;
if(idx >= cache->len) {
cache->len += 16;
cache->tab = cli_realloc(cache->tab, cache->len*sizeof(cache->tab[0]));
if(!cache->tab) {
cli_dbgmsg("!Out of mem in iconv-pool\n");
errno = ENOMEM;
return (iconv_t)-1;
}
}
hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
cache->tab[idx] = iconv_open("UTF-16BE",(const char*)fromcode);
cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
return cache->tab[idx];
}
/* tmp_m_area and conv->out_area are of size maxlen */ /* tmp_m_area and conv->out_area are of size maxlen */
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen) unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
{ {
@ -638,7 +829,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
size_t rc, inleft; size_t rc, inleft;
ssize_t i; ssize_t i;
char alignfix; signed char alignfix;
/* move whatever left in conv->tmp_area to beginning */ /* move whatever left in conv->tmp_area to beginning */
if(tmp_move) if(tmp_move)
@ -654,16 +845,16 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
conv->out_area.offset = 0; conv->out_area.offset = 0;
tmpbuff = conv->tmp_area.buffer; tmpbuff = conv->tmp_area.buffer;
inleft = conv->tmp_area.length;
if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */ if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */
memcpy( conv->bom, tmpbuff, 4); memcpy( conv->bom, tmpbuff, 4);
process_bom(conv); process_bom(conv);
process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT); process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT);
output_first(conv,&out,&tmpbuff); output_first(conv,&out,&tmpbuff,&inleft);
conv->bom_cnt++; conv->bom_cnt++;
} }
/* convert encoding conv->tmp_area. conv->out_area */ /* convert encoding conv->tmp_area. conv->out_area */
inleft = conv->tmp_area.length;
alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert,
and we are using ucs4, ditto for utf16, and 1 byte*/ and we are using ucs4, ditto for utf16, and 1 byte*/
inleft -= alignfix; inleft -= alignfix;
@ -676,24 +867,30 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
alignfix = -inleft; alignfix = -inleft;
} }
iconv_struct = iconv_open("UTF-16BE",encoding_name(conv->encoding)); iconv_struct = iconv_open_cached(encoding_name(conv->encoding));
if(iconv_struct == (iconv_t)-1) { if(iconv_struct == (iconv_t)-1) {
cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding)); cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding));
/* message shown only once/file */
/* what can we do? just fall back for it being an ISO-8859-1 */ /* what can we do? just fall back for it being an ISO-8859-1 */
iconv_struct = iconv_open("UTF-16BE","ISO-8859-1"); free(conv->encoding);
conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
iconv_struct = iconv_open_cached(conv->encoding);
if(iconv_struct == (iconv_t)-1) { if(iconv_struct == (iconv_t)-1) {
cli_dbgmsg("fallback failed... bail out\n"); cli_dbgmsg("fallback failed... bail out\n");
return cli_readline(NULL,&conv->tmp_area,maxlen); return cli_readline(NULL,&conv->tmp_area,maxlen);
} }
} }
if(inleft) /* iconv doesn't like inleft to be 0 */ if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ {
rc = iconv(iconv_struct, (char**) &tmpbuff, &inleft, (char**) &out, &outleft); rc = iconv(iconv_struct, (char**) &tmpbuff, &inleft, (char**) &out, &outleft);
}
else else
rc = 0; rc = 0;
iconv_close(iconv_struct); #if 0
iconv_close(iconv_struct);/* - don't close, we are using a cached instance */
#endif
if(rc==(size_t)-1 && errno != E2BIG) { if(rc==(size_t)-1 && errno != E2BIG) {
cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft); cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft);
@ -705,20 +902,31 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
} }
conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0); conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0);
conv->out_area.length = out - conv->out_area.buffer; conv->out_area.length = out - conv->out_area.buffer - out_move;
conv->tmp_area.offset = tmpbuff - conv->tmp_area.buffer; conv->tmp_area.offset = tmpbuff - conv->tmp_area.buffer;
conv->tmp_area.length += conv->tmp_area.offset; conv->tmp_area.length += conv->tmp_area.offset;
/* move whatever left in conv->norm_area to beginning */ /* move whatever left in conv->norm_area to beginning */
if(norm_move) if(norm_move) {
if(norm_move < conv->buffer_size/2) {
memmove(conv->norm_area.buffer, conv->norm_area.buffer + conv->norm_area.offset, norm_move); memmove(conv->norm_area.buffer, conv->norm_area.buffer + conv->norm_area.offset, norm_move);
conv->norm_area.offset = 0; conv->norm_area.offset = 0;
norm = conv->norm_area.buffer + norm_move;
}
else {
/* don't modify offset here */
norm = conv->norm_area.buffer + conv->norm_area.length;
}
}
else {
conv->norm_area.offset = 0;
norm = conv->norm_area.buffer;
}
/* now do the real normalization */ /* now do the real normalization */
out = conv->out_area.buffer;/* skip over utf16 bom, FIXME: check if iconv really outputted a BOM */ out = conv->out_area.buffer;/* skip over utf16 bom, FIXME: check if iconv really outputted a BOM */
norm = conv->norm_area.buffer + norm_move;
norm_end = conv->norm_area.buffer + conv->buffer_size; norm_end = conv->norm_area.buffer + conv->buffer_size;
if(conv->out_area.length>0 && out[0] == 0xFF && out[1] == 0xFE) if(conv->out_area.length>0 && out[0] == 0xFF && out[1] == 0xFE)
i = 2; i = 2;
@ -727,10 +935,12 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
for(; i < conv->out_area.length; i += 2) { for(; i < conv->out_area.length; i += 2) {
uint16_t u16 = ( ((uint16_t)out[i]) << 8 ) | out[i+1]; uint16_t u16 = ( ((uint16_t)out[i]) << 8 ) | out[i+1];
if(!u16) { if(!u16) {
if(alignfix >= 0) /* if alignfix is negative, this 0 byte is on-purpose, its padding */ if(alignfix >= 0 && !conv->msg_zero_shown) /* if alignfix is negative, this 0 byte is on-purpose, its padding */ {
conv->msg_zero_shown = 1;
cli_dbgmsg("Skipping null character in html stream\n"); cli_dbgmsg("Skipping null character in html stream\n");
} }
else if(u16 < 0x80) { }
else if((u16 < 0x80 && u16 >= 0x20) || u16 == 0x0d || u16 == 0x0a) {
if(norm >= norm_end) if(norm >= norm_end)
break; break;
if((unsigned char)u16 ==0) if((unsigned char)u16 ==0)
@ -753,6 +963,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
} }
} }
conv->out_area.offset = i; /* so that we can resume next time from here */ conv->out_area.offset = i; /* so that we can resume next time from here */
conv->norm_area.length = norm - conv->norm_area.buffer; conv->norm_area.length = norm - conv->norm_area.buffer;
/* /*
conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS

@ -72,6 +72,7 @@ struct entity_conv {
m_area_t tmp_area; m_area_t tmp_area;
m_area_t out_area; m_area_t out_area;
m_area_t norm_area; m_area_t norm_area;
int msg_zero_shown;
}; };

@ -1199,7 +1199,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
tag_val[tag_val_length++] = value; /* store encoded values too */ tag_val[tag_val_length++] = value; /* store encoded values too */
} }
if(value < 0x80) if((value < 0x80 && value >= 0x20) || value == 0x0d || value == 0x0a)
html_output_c(file_buff_o1, file_buff_o2, tolower(value)); html_output_c(file_buff_o1, file_buff_o2, tolower(value));
else { else {
unsigned char buff[10]; unsigned char buff[10];

Loading…
Cancel
Save