new implementation of the Aho-Corasick pattern matcher

git-svn: trunk@3038
remotes/push_mirror/metadata
Tomasz Kojm 18 years ago
parent a7a2e2d48c
commit fbcef1b0b5
  1. 9
      ChangeLog
  2. 7
      libclamav/filetypes.c
  3. 861
      libclamav/matcher-ac.c
  4. 19
      libclamav/matcher-ac.h
  5. 5
      libclamav/matcher.h
  6. 236
      libclamav/readdb.c
  7. 2
      libclamav/str.c

@ -1,3 +1,12 @@
Sat Apr 28 19:51:22 CEST 2007 (tk)
----------------------------------
* libclamav: new implementation of the Aho-Corasick pattern matcher:
- remove static depth limitation
- optimize memory usage
- min/max depth can be set on per-tree basis
- use higher max-depth by default (3)
- much better detection of wildcarded sigs
Tue Apr 24 13:48:04 BST 2007 (njh)
----------------------------------
* libclamav/mbox.c: Bug 366

@ -381,13 +381,12 @@ int cli_addtypesigs(struct cl_engine *engine)
return CL_EMEM;
}
root->ac_root = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
if(!root->ac_root) {
cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
if((ret = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
/* No need to free previously allocated memory here - all engine
* elements will be properly freed by cl_free()
*/
return CL_EMEM;
cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
return ret;
}
} else {
root = engine->root[0];

File diff suppressed because it is too large Load Diff

@ -24,24 +24,20 @@
#include "filetypes.h"
#include "cltypes.h"
#define AC_DEFAULT_DEPTH 2
#define AC_DEFAULT_MIN_DEPTH 2
#define AC_DEFAULT_MAX_DEPTH 3
#define AC_DEFAULT_TRACKLEN 8
struct cli_ac_data {
uint32_t partsigs;
off_t *inioff;
uint16_t *partcnt;
uint32_t **partoff;
uint8_t *offcnt;
uint8_t *offidx;
int32_t *maxshift;
int32_t ***offmatrix;
};
struct cli_ac_patt {
uint16_t *pattern, *prefix, length, prefix_length;
uint8_t depth;
uint32_t mindist, maxdist;
char *virname, *offset;
const char *viralias;
uint32_t sigid;
uint16_t parts, partno, alt, *altn, alt_pattern;
uint8_t target;
@ -51,9 +47,9 @@ struct cli_ac_patt {
};
struct cli_ac_node {
uint8_t islast;
uint8_t leaf, final;
struct cli_ac_patt *list;
struct cli_ac_node *trans[256], *fail;
struct cli_ac_node **trans, *fail;
};
#include "matcher.h"
@ -63,6 +59,9 @@ int cli_ac_initdata(struct cli_ac_data *data, uint32_t partsigs, uint8_t trackle
void cli_ac_freedata(struct cli_ac_data *data);
int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **virname, const struct cli_matcher *root, struct cli_ac_data *mdata, uint8_t otfrec, uint32_t offset, cli_file_t ftype, int fd, struct cli_matched_type **ftoffset);
int cli_ac_buildtrie(struct cli_matcher *root);
int cli_ac_init(struct cli_matcher *root, uint8_t mindepth, uint8_t maxdepth);
void cli_ac_free(struct cli_matcher *root);
int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, uint32_t sigid, uint16_t parts, uint16_t partno, uint16_t type, uint32_t mindist, uint32_t maxdist, const char *offset, uint8_t target);
#endif

@ -46,9 +46,10 @@ struct cli_matcher {
struct cli_bm_patt **bm_suffix;
/* Extended Aho-Corasick */
uint8_t ac_depth;
uint8_t ac_mindepth, ac_maxdepth;
struct cli_ac_node *ac_root, **ac_nodetable;
uint32_t ac_partsigs, ac_nodes;
struct cli_ac_patt **ac_pattable;
uint32_t ac_partsigs, ac_nodes, ac_patterns;
};
#define CL_TARGET_TABLE_SIZE 7

@ -82,237 +82,6 @@ static pthread_mutex_t cli_ref_mutex = PTHREAD_MUTEX_INITIALIZER;
int cl_loaddb(const char *filename, struct cl_engine **engine, unsigned int *signo);
int cl_loaddbdir(const char *dirname, struct cl_engine **engine, unsigned int *signo);
/* TODO: clean up the code */
static int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, int sigid, int parts, int partno, unsigned short type, unsigned int mindist, unsigned int maxdist, const char *offset, unsigned short target)
{
struct cli_ac_patt *new;
char *pt, *hex = NULL;
int virlen, ret, error = 0;
unsigned int i, j, wprefix = 0;
#define FREE_ALT \
if(new->alt) { \
free(new->altn); \
for(i = 0; i < new->alt; i++) \
free(new->altc[i]); \
free(new->altc); \
free(hex); \
}
if(strlen(hexsig) / 2 < AC_DEFAULT_DEPTH)
return CL_EPATSHORT;
if((new = (struct cli_ac_patt *) cli_calloc(1, sizeof(struct cli_ac_patt))) == NULL)
return CL_EMEM;
new->type = type;
new->sigid = sigid;
new->parts = parts;
new->partno = partno;
new->mindist = mindist;
new->maxdist = maxdist;
new->target = target;
if(offset) {
new->offset = cli_strdup(offset);
if(!new->offset)
return CL_EMEM;
}
if(strchr(hexsig, '(')) {
char *hexcpy, *hexnew, *start, *h, *c;
if(!(hexcpy = cli_strdup(hexsig))) {
if(new->offset)
free(new->offset);
free(new);
return CL_EMEM;
}
if(!(hexnew = (char *) cli_calloc(strlen(hexsig) + 1, 1))) {
free(hexcpy);
if(new->offset)
free(new->offset);
free(new);
return CL_EMEM;
}
start = pt = hexcpy;
while((pt = strchr(start, '('))) {
*pt++ = 0;
if(!start) {
error = 1;
break;
}
strcat(hexnew, start);
strcat(hexnew, "@@");
if(!(start = strchr(pt, ')'))) {
error = 1;
break;
}
*start++ = 0;
new->alt++;
new->altn = (unsigned short int *) cli_realloc(new->altn, new->alt * sizeof(unsigned short int));
new->altn[new->alt - 1] = 0;
new->altc = (unsigned char **) cli_realloc(new->altc, new->alt * sizeof(char *));
new->altc[new->alt - 1] = NULL;
for(i = 0; i < strlen(pt); i++)
if(pt[i] == '|')
new->altn[new->alt - 1]++;
if(!new->altn[new->alt - 1]) {
error = 1;
break;
} else
new->altn[new->alt - 1]++;
if(!(new->altc[new->alt - 1] = (unsigned char *) cli_calloc(new->altn[new->alt - 1], 1))) {
error = 1;
break;
}
for(i = 0; i < new->altn[new->alt - 1]; i++) {
if((h = cli_strtok(pt, i, "|")) == NULL) {
error = 1;
break;
}
if((c = cli_hex2str(h)) == NULL) {
free(h);
error = 1;
break;
}
new->altc[new->alt - 1][i] = *c;
free(c);
free(h);
}
if(error)
break;
}
if(start)
strcat(hexnew, start);
hex = hexnew;
free(hexcpy);
if(error) {
FREE_ALT;
if(new->offset)
free(new->offset);
free(new);
return CL_EMALFDB;
}
}
if((new->pattern = cli_hex2ui(new->alt ? hex : hexsig)) == NULL) {
FREE_ALT;
if(new->offset)
free(new->offset);
free(new);
return CL_EMALFDB;
}
new->length = strlen(new->alt ? hex : hexsig) / 2;
for(i = 0; i < AC_DEFAULT_DEPTH; i++) {
if(new->pattern[i] & CLI_MATCH_WILDCARD) {
wprefix = 1;
break;
}
}
if(wprefix) {
for(; i < (uint16_t) (new->length - AC_DEFAULT_DEPTH + 1); i++) {
wprefix = 0;
for(j = i; j < i + AC_DEFAULT_DEPTH; j++) {
if(new->pattern[j] & CLI_MATCH_WILDCARD) {
wprefix = 1;
break;
}
}
if(!wprefix)
break;
}
if(wprefix) {
FREE_ALT;
if(new->offset)
free(new->offset);
free(new->pattern);
free(new);
return CL_EMALFDB;
}
new->prefix = new->pattern;
new->prefix_length = i;
new->pattern = &new->prefix[i];
new->length -= i;
for(i = 0; i < new->prefix_length; i++)
if((new->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_ALTERNATIVE)
new->alt_pattern++;
}
if(new->length > root->maxpatlen)
root->maxpatlen = new->length;
if((pt = strstr(virname, "(Clam)")))
virlen = strlen(virname) - strlen(pt) - 1;
else
virlen = strlen(virname);
if(virlen <= 0) {
if(new->prefix)
free(new->prefix);
else
free(new->pattern);
FREE_ALT;
if(new->offset)
free(new->offset);
free(new);
return CL_EMALFDB;
}
if((new->virname = cli_calloc(virlen + 1, sizeof(char))) == NULL) {
if(new->prefix)
free(new->prefix);
else
free(new->pattern);
FREE_ALT;
if(new->offset)
free(new->offset);
free(new);
return CL_EMEM;
}
strncpy(new->virname, virname, virlen);
if((ret = cli_ac_addpatt(root, new))) {
if(new->prefix)
free(new->prefix);
else
free(new->pattern);
free(new->virname);
FREE_ALT;
if(new->offset)
free(new->offset);
free(new);
return ret;
}
if(new->alt)
free(hex);
return CL_SUCCESS;
}
int cli_parse_add(struct cli_matcher *root, const char *virname, const char *hexsig, unsigned short type, const char *offset, unsigned short target)
{
@ -570,11 +339,10 @@ static int cli_initroots(struct cl_engine *engine, unsigned int options)
}
cli_dbgmsg("Initialising AC pattern matcher of root[%d]\n", i);
root->ac_root = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
if(!root->ac_root) {
if((ret = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
/* no need to free previously allocated memory here */
cli_errmsg("Can't initialise AC pattern matcher\n");
return CL_EMEM;
return ret;
}
if(!root->ac_only) {

@ -97,7 +97,7 @@ uint16_t *cli_hex2ui(const char *hex)
}
val |= CLI_MATCH_NIBBLE_LOW;
} else if(hex[i] == '@') {
} else if(hex[i] == '(') {
val |= CLI_MATCH_ALTERNATIVE;
} else {

Loading…
Cancel
Save