Make some of the new PDF code a little more efficient

pull/6/head
Shawn Webb 11 years ago
parent dd101bee5b
commit 5b734eb4aa
  1. 79
      libclamav/pdf.c

@ -83,6 +83,7 @@ static const char *pdf_nextobject(const char *ptr, size_t len);
static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
static int is_object_reference(char *begin, char **endchar, uint32_t *id);
static void pdf_free_dict(struct pdf_dict *dict);
static void pdf_free_array(struct pdf_array *array);
static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags);
@ -2901,12 +2902,23 @@ static char *pdf_convert_utf(char *begin, size_t sz)
#endif
}
static int is_object_reference(char *begin, char **endchar)
static int is_object_reference(char *begin, char **endchar, uint32_t *id)
{
char *end = *endchar;
char *p1=begin, *p2;
unsigned long n;
uint32_t t=0;
/*
* Object references always have this format:
* XXXX YYYY R
* Where XXXX is the object ID and YYYY is the revision (generation) number of the object.
* The letter R signifies that this is a reference.
*
* Between each pair of items there can be an arbitrary amount of whitespace.
*/
/* Skip whitespace */
while (p1 < end && isspace(p1[0]))
p1++;
@ -2916,6 +2928,7 @@ static int is_object_reference(char *begin, char **endchar)
if (!isnumber(p1[0]))
return 0;
/* Ensure strtoul() isn't going to go past our buffer */
p2 = p1+1;
while (p2 < end && !isspace(p2[0]))
p2++;
@ -2927,6 +2940,9 @@ static int is_object_reference(char *begin, char **endchar)
if (n == ULONG_MAX && errno)
return 0;
t = n<<8;
/* Skip more whitespace */
p1 = p2;
while (p1 < end && isspace(p1[0]))
p1++;
@ -2937,6 +2953,7 @@ static int is_object_reference(char *begin, char **endchar)
if (!isnumber(p1[0]))
return 0;
/* Ensure strtoul() isn't going to go past our buffer */
p2 = p1+1;
while (p2 < end && !isspace(p2[0]))
p2++;
@ -2948,6 +2965,8 @@ static int is_object_reference(char *begin, char **endchar)
if (n == ULONG_MAX && errno)
return 0;
t |= (n&0xff);
p1 = p2;
while (p1 < end && isspace(p1[0]))
p1++;
@ -2957,6 +2976,9 @@ static int is_object_reference(char *begin, char **endchar)
if (p1[0] == 'R') {
*endchar = p1+1;
if (id)
*id = t;
return 1;
}
@ -2971,6 +2993,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
char *buf, *outbuf, *res;
int likelyutf = 0;
unsigned int i;
uint32_t objid;
/*
* Yes, all of this is required to find the start and end of a potentially UTF-* string
@ -3011,45 +3034,14 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
/* We should be at the start of the string, minus 1 */
p2 = q + objsize;
if (is_object_reference(p1, &p2)) {
unsigned long objnum, revnum;
if (is_object_reference(p1, &p2, &objid)) {
struct pdf_obj *newobj;
char *end, *begin;
STATBUF sb;
uint32_t objflags;
int fd;
/*
* This is kind of sketchy... This string says it points to another object.
* Try to get/parse the object and return the decoded value as an ASCII/UTF-8 string.
*/
/* Get the object number */
objnum = strtoul(p1, &end, 10);
if ((end - p1) == 0)
return NULL;
/* Skip whitespace and get the revision number */
p1 = end+1;
while (p1 - q < objsize && isspace(p1[0]))
p1++;
if (p1 - q == objsize)
return NULL;
revnum = strtoul(p1, &end, 10);
p1 = end+1;
while (p1 - q < objsize && isspace(p1[0]))
p1++;
if (p1 - q == objsize)
return NULL;
if (p1[0] != 'R')
return NULL;
newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff));
newobj = find_obj(pdf, obj, objid);
if (!(newobj))
return NULL;
@ -3131,7 +3123,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
newobj->path = NULL;
if (endchar)
*endchar = p1;
*endchar = p2;
return res;
}
@ -3368,7 +3360,7 @@ static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *o
p1++;
}
is_object_reference(begin, &p1);
is_object_reference(begin, &p1, NULL);
val = cli_calloc((p1 - begin) + 2, 1);
if (!(val))
@ -3517,7 +3509,7 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
default:
/* This should just be a number or the letter R */
p1 = end;
if (!is_object_reference(begin, &p1)) {
if (!is_object_reference(begin, &p1, NULL)) {
p1 = begin+1;
while (p1 < end && !isspace(p1[0]))
p1++;
@ -3639,12 +3631,14 @@ static void pdf_print_dict(struct pdf_dict *dict, unsigned long depth)
struct pdf_dict_node *node;
for (node = dict->nodes; node != NULL; node = node->next) {
if (node->type == PDF_DICT_STRING)
if (node->type == PDF_DICT_STRING) {
cli_errmsg("dict[%lu][%s]: %s\n", depth, node->key, (char *)(node->value));
else if (node->type == PDF_DICT_ARRAY)
} else if (node->type == PDF_DICT_ARRAY) {
cli_errmsg("dict[%lu][%s]: Array =>\n", depth, node->key);
pdf_print_array((struct pdf_array *)(node->value), depth);
else if (node->type == PDF_DICT_DICT)
} else if (node->type == PDF_DICT_DICT) {
pdf_print_dict((struct pdf_dict *)(node->value), depth+1);
}
}
}
@ -3939,8 +3933,11 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act
return;
dict = pdf_parse_dict(pdf, obj, objsz, begin, NULL);
if (dict)
if (dict) {
cli_errmsg("==== ==== ==== ====\n");
pdf_print_dict(dict, 0);
pdf_free_dict(dict);
}
begin = cli_memstr(objstart, objsz, "/Kids", 5);
if (!(begin))

Loading…
Cancel
Save