continue draft of new regex_list.c

git-svn: trunk@3098
remotes/push_mirror/metadata
Török Edvin 18 years ago
parent b9a533ece2
commit dbf3c4d9ba
  1. 4
      ChangeLog
  2. 183
      libclamav/regex_list.c

@ -1,3 +1,7 @@
Sat Jun 09 23:16:00 EEST 2007 (edwin)
------------------------------------
* libclamav/regex_list.c: draft of new regex_list.c
Sat Jun 09 18:37:00 EEST 2007 (edwin)
------------------------------------
* libclamav/regex_list.c: first draft of new implementation for regex_list.c

@ -1527,11 +1527,14 @@ void dump_tree(struct tree_node* root)
*
* "go to parent" below actually means, return from recursive call.
*
* fail_action: we need to return to the closest failure point (recursive call point),
* and switch the current node to the node pointed to by fail_action
*
* Node types:
* OP_ROOT: contains information that applies to the entire trie.
* it can only appear as root node, and not as child node.
* On child fail: match has failed
* This is a recursive call point
* This is NOT a recursive call point
* OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character;
* using binary-search
* On fail: go to node indicated by fail_action, or if
@ -1548,6 +1551,8 @@ void dump_tree(struct tree_node* root)
* On fail: fail_action, or parent if NULL
* On child fail: reduce match repeat count, try again on child, if
* repeat count<min_range, execute fail of current node
* Also has a bitmap of the characters that are accepted beyond it,
* as an optimization for the case when a maximum match isn't possible
* Not recommended to use this when min_range=max_range=1
* This is a recursive call point
* OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
@ -1577,6 +1582,7 @@ void dump_tree(struct tree_node* root)
*
*/
#include <string.h>
#include "cltypes.h"
#include "others.h"
@ -1624,7 +1630,8 @@ struct trie_node_binary_search
struct trie_node node;
uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/
struct trie_node* fail_action;
struct trie_node** children;
unsigned char* char_choices;/* children_count elements */
struct trie_node** children;/*children_count elements */
};
struct trie_node_alternatives
@ -1642,6 +1649,9 @@ struct trie_node_char_repeat
struct trie_node node;
unsigned char character;
uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this,
to optimize repeat < max_range case; if it is NULL
there is no optimization*/
struct trie_node* child;
struct trie_node* fail_action;
};
@ -1650,6 +1660,9 @@ struct trie_node_dot_repeat
{
struct trie_node node;
uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this,
to optimize repeat < max_range case; if it is NULL
there is no optimization*/
struct trie_node* child;
struct trie_node* fail_action;
};
@ -1674,22 +1687,65 @@ struct trie_node_strcmp
uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
unsigned char* string;
struct trie_node* child;
struct trie_node** fail_actions;/* this has string_length elements */
struct trie_node** fail_actions;/* this has string_length elements, or NULL if no fail_actions are computed */
};
/*
 * Trie node that matches a run of characters taken from a character class.
 * match_node() handles it in the OP_CHAR_CLASS_REPEAT case: it counts how many
 * leading characters of the text are accepted by `bitmap`, then hands the
 * remaining work to match_repeat().
 */
struct trie_node_char_class_repeat
{
	/* embedded generic node; match_node() gets back to this struct from it via container_of_const */
	struct trie_node node;
	/* the character class itself: each text character is tested against it with bitmap_accepts() */
	struct char_bitmap* bitmap;
	/* passed to match_repeat() as its bitmap_accept_after argument; match_repeat() treats NULL as "no filter" */
	struct char_bitmap* bitmap_accept_after;
	/* lower bound on the repetition count; match_node() fails the node when less text than this is available */
	uint8_t range_min;
	/* upper bound on the repetition count, used by match_node() to cap how far it scans */
	uint8_t range_max;
	/* node matched against the text that follows the repeated run */
	struct trie_node* child;
	/* node reported through FAIL_ACTION() when this node cannot match */
	struct trie_node* fail_action;
};
/*
 * Intended to report whether character `c` is accepted by `bitmap`.
 *
 * This is still a placeholder: the lookup is not implemented, neither argument
 * is inspected, and every call returns 0 ("not accepted").
 */
static inline int bitmap_accepts(const struct char_bitmap* bitmap, const char c)
{
	/* TODO: check if c is accepted by bitmap */
	(void)bitmap;
	(void)c;
	return 0;
}
/* Result codes returned by match_node() and match_repeat(). */
#define MATCH_FAILED 0
#define MATCH_OK 1
/*
 * Stores fail_node into *fail_action and evaluates to MATCH_FAILED, so a
 * failing branch can be written as `return FAIL_ACTION(x);`.
 * The macro relies on a variable named `fail_action` (a pointer to a node
 * pointer) being in scope at the point of expansion.
 */
#define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
#ifndef MIN
/* Smaller of a and b. Each argument may be evaluated twice: do not pass expressions with side effects. */
#define MIN(a,b) ((a)<(b) ? (a) : (b))
#endif
/* Forward declaration: match_repeat() calls match_node() before its definition appears. */
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action);
/*
 * Backtracking step shared by the repeat node types.
 *
 * The caller has already determined that up to `repeat_start` leading
 * characters of `text` can be consumed by the repeated item. This function
 * tries the longest run first and then shorter ones, down to `range_min`
 * repetitions, matching `child` against whatever text follows the run.
 *
 * text, text_end       - current position and end of the text
 * range_min            - smallest repetition count that is allowed
 * repeat_start         - largest repetition count to try
 * bitmap_accept_after  - optional look-ahead filter; when non-NULL, a run of
 *                        one or more repetitions is only tried if the character
 *                        following the run is accepted by this bitmap
 * child                - node to match after the repeated run
 * fail_action          - out parameter, receives the node to continue with on failure
 * this_fail_action     - value stored in *fail_action when every attempt fails
 *
 * Returns MATCH_OK as soon as one repetition count lets `child` match,
 * otherwise MATCH_FAILED with *fail_action set to this_fail_action.
 */
static int match_repeat(const unsigned char* text, const unsigned char* text_end, const size_t range_min, const size_t repeat_start,
const struct char_bitmap* bitmap_accept_after, const struct trie_node* child, const struct trie_node** fail_action,
const struct trie_node* this_fail_action)
{
	size_t i;

	/* fewer repetitions are available than the minimum requires */
	if(repeat_start < range_min) {
		return FAIL_ACTION(this_fail_action);
	}

	/* repeat_start >= range_min here, so the loop stops at range_min before i can wrap around */
	for(i = repeat_start; ; i--) {
		const unsigned char* rest = text + i;
		int skip = 0;

		/* The filter looks at the character the child would consume next.
		 * The zero-repetition attempt (i == 0) is never filtered, and there
		 * is nothing to look at once the run reaches the end of the text. */
		if(i > 0 && bitmap_accept_after && rest < text_end) {
			skip = !bitmap_accepts( bitmap_accept_after, *rest);
		}

		if(!skip) {
			/* fail_action written by the child is ignored for now; it is overwritten below if all attempts fail */
			int rc = match_node(child, rest, text_end, fail_action);
			if(rc) {
				return MATCH_OK;
			}
		}

		if(i == range_min) {
			break;
		}
	}
	return FAIL_ACTION(this_fail_action);
}
/* text_end points to \0 in text */
static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action)
{
while(node && text < text_end) {
switch(node->type) {
case OP_ROOT:
{
@ -1698,56 +1754,151 @@ static int match_node(const struct trie_node* node)
break;
}
case OP_CHAR_BINARY_SEARCH:
{
{
const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
/* TODO: binary search */
const unsigned char csearch = *text;
size_t mid, left = 0, right = bin_node->children_count-1;
while(left<=right) {
mid = left+(right-left)/2;
if(bin_node->char_choices[mid] == csearch)
break;
else if(bin_node->char_choices[mid] < csearch)
left = mid+1;
else
right = mid-1;
}
if(left <= right) {
/* match successful */
node = bin_node->children[mid];
++text;
}
else {
return FAIL_ACTION( bin_node->fail_action );
}
break;
}
case OP_ALTERNATIVES:
{
const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
/* TODO: op_alt */
size_t i;
*fail_action = NULL;
for(i=0;i < alt_node->alternatives_count;i++) {
int rc = match_node(alt_node->children[i], text, text_end, fail_action);
if(rc) {
return MATCH_OK;
}
/* supporting fail_actions is tricky,
* if we just go to the node specified, what happens if the match fails, and no
* further fail_action is specified? We should know where to continue the search.
* For now fail_action isn't supported for OP_ALTERNATIVES*/
}
break;
}
case OP_CHAR_REPEAT:
{
const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
break;
const size_t max_len = MIN( text_end - text, char_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
const char caccept = char_rep_node->character;
size_t rep;
if(max_len < char_rep_node->range_min)
return FAIL_ACTION(char_rep_node->fail_action);
for(rep=0;rep < max_len;rep++) {
if(text[rep] != caccept) {
break;
}
}
return match_repeat(text, text_end, char_rep_node->range_min, rep,
char_rep_node->bitmap_accept_after, char_rep_node->child, fail_action,
char_rep_node->fail_action);
}
case OP_DOT_REPEAT:
{
const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
break;
const size_t max_len = MIN( text_end - text, dot_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
if(max_len < dot_rep_node->range_min)
return FAIL_ACTION(dot_rep_node->fail_action);
return match_repeat(text, text_end, dot_rep_node->range_min, max_len,
dot_rep_node->bitmap_accept_after, dot_rep_node->child, fail_action,
dot_rep_node->fail_action);
}
case OP_CHAR_CLASS_REPEAT:
{
const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
const size_t max_len = MIN( text_end - text, class_rep_node->range_max-1);
/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
size_t rep;
if(max_len < class_rep_node->range_min)
return FAIL_ACTION(class_rep_node->fail_action);
for(rep=0;rep < max_len;rep++) {
if(!bitmap_accepts( class_rep_node->bitmap, text[rep])) {
break;
}
}
return match_repeat(text, text_end, class_rep_node->range_min, rep,
class_rep_node->bitmap_accept_after, class_rep_node->child, fail_action,
class_rep_node->fail_action);
break;
}
case OP_STRCMP:
{
const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
size_t i;
if(strcmp_node->fail_actions) {
const size_t max_len = MIN(strcmp_node->string_length, text_end-text);
/* we don't use strncmp, because we need the exact match-fail point */
for(i=0;i < max_len;i++) {
if(text[i] != strcmp_node->string[i]) {
return FAIL_ACTION( strcmp_node->fail_actions[i] );
}
}
if(max_len < strcmp_node->string_length) {
/* failed, because text was shorter */
return FAIL_ACTION( strcmp_node->fail_actions[max_len] );
}
}
else {
/* no fail_actions computed, some shortcuts possible on compare */
if((text_end - text < strcmp_node->string_length) ||
strncmp((const char*)text, (const char*)strcmp_node->string, strcmp_node->string_length)) {
return FAIL_ACTION( NULL );
}
}
/* match successful */
node = strcmp_node->child;
text += strcmp_node->string_length;
break;
}
case OP_GROUP_START:
{
const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
/* TODO: implement */
break;
}
case OP_GROUP_END:
{
{
const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
/* TODO: implement */
break;
}
default:
case OP_MATCH_OK:
{
cli_warnmsg("Unimplemented node type:%d", node->type);
return 0;
break;
return MATCH_OK;
}
}
}
return 1;/* match */
/* if fail_action was NULL, or text ended*/
return MATCH_FAILED;
}
#endif

Loading…
Cancel
Save