mirror of https://github.com/Cisco-Talos/clamav
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
4.9 KiB
199 lines
4.9 KiB
/*
 *  Copyright (C) 2006 Török Edvin <edwin@clamav.net>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 *
 */
|
|
|
|
|
|
#include <clamav-config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <dirent.h>
#include <regex.h>

#include <others.h>
#include <htmlnorm.h>
#include <hashtab.h>
#include <entconv.h>
|
|
|
|
/* Size in bytes of the buffers used for one input line, one entity name,
 * and the regerror() message text. */
#define MAX_LINE 1024
|
|
/* ------------ generating entity tables from .ent files ---------------- */
/* TODO: move this into contrib/entitynorm/ ------------*/
|
|
|
|
|
|
/*
 * Copy the part of `l` described by one regexec() match slot into a newly
 * allocated, NUL-terminated string.
 *
 * l      - the string that was matched.
 * pmatch - a single match slot; rm_so/rm_eo are byte offsets into `l`.
 *
 * Returns a malloc()ed string that the caller must free(), or NULL when the
 * slot is unset (rm_so == -1), its offsets are inconsistent, or the
 * allocation fails.
 */
static char* extract_str(const char* l,const regmatch_t* pmatch)
{
	size_t len;
	char *s;

	/* Validate the slot before doing any arithmetic on its offsets. */
	if (pmatch->rm_so == -1 || pmatch->rm_eo < pmatch->rm_so)
		return NULL;

	len = (size_t)(pmatch->rm_eo - pmatch->rm_so);
	s = malloc(len + 1);
	if (!s)
		return NULL;

	memcpy(s, l + pmatch->rm_so, len);
	s[len] = '\0';
	return s;
}
|
|
|
|
static regex_t entity_regex;
|
|
static const char ent_head[]="<!ENTITY";
|
|
static const size_t ent_head_size = sizeof(ent_head);
|
|
static int entity_extract(const char* line,char* entity_name,int* entity_value)
|
|
{
|
|
regmatch_t pmatch[5];
|
|
|
|
if(regexec(&entity_regex,line,5,pmatch,0)==0) {
|
|
const char* entity_val;
|
|
strncpy(entity_name,extract_str(line,&pmatch[1]),MAX_LINE);
|
|
entity_val = extract_str(line,&pmatch[3]);
|
|
if(entity_val[0] == 'x') {
|
|
if(sscanf(entity_val+1,"%x",entity_value)!=1)
|
|
return -2;
|
|
}
|
|
else {
|
|
if(sscanf( entity_val,"%d",entity_value)!=1)
|
|
return -2;
|
|
}
|
|
if(*entity_value > 65535) {
|
|
fprintf(stderr,"Entity value outside of utf-16 range:%u; %s\n",*entity_value,line);
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
else {
|
|
if(strncmp(line,ent_head,ent_head_size-1)==0) {
|
|
fprintf(stderr,"Unable to interpret entity decl:%s\n",line);
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
static void loadEntitiesFromFile(const char* file,struct hashtable* s,char* xt[])
|
|
{
|
|
char line[MAX_LINE];
|
|
FILE* f = fopen(file,"rt");
|
|
if(!f) {
|
|
fprintf(stderr,"Unable to open file:%s",file);
|
|
exit(3);
|
|
}
|
|
|
|
while( fgets(line,MAX_LINE,f) ) {
|
|
unsigned char name[MAX_LINE];
|
|
int val;
|
|
int rc = entity_extract(line,name,&val);
|
|
if(rc<0) {
|
|
printf("error during extraction:%s!",line);
|
|
exit(3);
|
|
}
|
|
else if(rc) {
|
|
struct element* elem;
|
|
if(elem = hashtab_find(s,name,strlen(name))) {
|
|
if(elem->data != val)
|
|
cli_dbgmsg("Overriding entity value for %s: %d -> %d\n", name, elem->data, val);
|
|
else {
|
|
cli_dbgmsg("Duplicate entity value for %s:%d\n",name, elem->data);
|
|
continue;
|
|
}
|
|
}
|
|
if(xt[val] && strcmp(xt[val],name))
|
|
cli_dbgmsg("Duplicate entity reference to same code:%s->%d<-%s\n",name,val,xt[val]);
|
|
else if(xt[val]) {
|
|
fprintf(stderr,"Impossible: element not found in hashtable, but we did add it!! %s:%d:%s:%p\n",xt[val],val,name,elem);
|
|
abort();
|
|
}
|
|
xt[val] = strdup(name);
|
|
hashtab_insert(s,name,strlen(name),val);
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
|
|
|
|
static void init_entity_parser(void)
|
|
{
|
|
int rc;
|
|
char errbuff[MAX_LINE];
|
|
if(( rc = regcomp(&entity_regex,".*<!ENTITY +([^ \t]+) +\" *&(#38;)?#(([0-9]+)|x([0-9a-fA-F])+); *\" *>.*",REG_EXTENDED) )) {
|
|
regerror(rc,&entity_regex,errbuff,MAX_LINE);
|
|
fprintf(stderr,"Error compiling regex:%s\n",errbuff);
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
struct entity_conv conv;
|
|
const char* ent_dir;
|
|
struct dirent* entry;
|
|
struct hashtable ht;
|
|
char* xt[65536];
|
|
|
|
memset(xt,0,65536*sizeof(xt[0]));
|
|
cl_debug();
|
|
init_entity_parser();
|
|
hashtab_init(&ht,512);
|
|
|
|
if(argc<2) {
|
|
fprintf(stderr,"Usage: %s <entity directory>\n",argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
ent_dir = argv[1];
|
|
DIR* dir = opendir(ent_dir);
|
|
if(!dir) {
|
|
cli_errmsg("Can't open directory\n");
|
|
return 2;
|
|
}
|
|
do {
|
|
entry = readdir(dir);
|
|
if(entry) {
|
|
char buffer[4096];
|
|
snprintf(buffer,4095,"%s/%s",ent_dir,entry->d_name);
|
|
buffer[4095] = '\0';
|
|
cli_dbgmsg("Loading entities from:%s\n", entry->d_name);
|
|
loadEntitiesFromFile(buffer,&ht,xt);
|
|
}
|
|
} while(entry);
|
|
closedir(dir);
|
|
#if 0
|
|
FILE* f1=fopen("/tmp/test.out","w");
|
|
hashtab_store(&ht,f1);
|
|
fclose(f1);
|
|
init_entity_converter(&conv,UNKNOWN,8192);
|
|
FILE* f = fopen(argv[1],"rb");
|
|
if(!f) {
|
|
perror("FIle not found!\n");
|
|
exit(1);
|
|
}
|
|
/*
|
|
int c;
|
|
while((c=fgetc(f))!=EOF) {
|
|
const char* x = process_byte(&conv,c);
|
|
if(x)
|
|
printf("%s",x);
|
|
}
|
|
*/
|
|
unsigned char* s;
|
|
do{
|
|
s = encoding_norm_readline(&conv, f, NULL, 8192);
|
|
if(s)
|
|
printf("%s",s);
|
|
free(s);
|
|
} while(s);
|
|
encoding_norm_done(&conv);
|
|
fflush(stdout);
|
|
#endif
|
|
hashtab_generate_c(&ht,"entities_htable");
|
|
return 0;
|
|
}
|
|
|
|
|