From 5b734eb4aac45180ae0ab329cf5099a326621e86 Mon Sep 17 00:00:00 2001 From: Shawn Webb Date: Wed, 25 Jun 2014 10:58:55 -0400 Subject: [PATCH] Make some of the new PDF code a little more efficient --- libclamav/pdf.c | 79 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/libclamav/pdf.c b/libclamav/pdf.c index 850b85c66..3ff4c4248 100644 --- a/libclamav/pdf.c +++ b/libclamav/pdf.c @@ -83,6 +83,7 @@ static const char *pdf_nextobject(const char *ptr, size_t len); static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar); static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar); static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar); +static int is_object_reference(char *begin, char **endchar, uint32_t *id); static void pdf_free_dict(struct pdf_dict *dict); static void pdf_free_array(struct pdf_array *array); static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags); @@ -2901,12 +2902,23 @@ static char *pdf_convert_utf(char *begin, size_t sz) #endif } -static int is_object_reference(char *begin, char **endchar) +static int is_object_reference(char *begin, char **endchar, uint32_t *id) { char *end = *endchar; char *p1=begin, *p2; unsigned long n; + uint32_t t=0; + /* + * Object references are always this format: + * XXXX YYYY R + * Where XXXX is the object ID and YYYY is the revision ID of the object. + * The letter R signifies that this is a reference. + * + * In between each item can be an arbitrary amount of whitespace. + */ + + /* Skip whitespace */ while (p1 < end && isspace(p1[0])) p1++; @@ -2916,6 +2928,7 @@ static int is_object_reference(char *begin, char **endchar) if (!isnumber(p1[0])) return 0; + /* Ensure strtoul() isn't going to go past our buffer */ p2 = p1+1; while (p2 < end && !isspace(p2[0])) p2++; @@ -2927,6 +2940,9 @@ static int is_object_reference(char *begin, char **endchar) if (n == ULONG_MAX && errno) return 0; + t = n<<8; + + /* Skip more whitespace */ p1 = p2; while (p1 < end && isspace(p1[0])) p1++; @@ -2937,6 +2953,7 @@ static int is_object_reference(char *begin, char **endchar) if (!isnumber(p1[0])) return 0; + /* Ensure strtoul() is going to go past our buffer */ p2 = p1+1; while (p2 < end && !isspace(p2[0])) p2++; @@ -2948,6 +2965,8 @@ static int is_object_reference(char *begin, char **endchar) if (n == ULONG_MAX && errno) return 0; + t |= (n&0xff); + p1 = p2; while (p1 < end && isspace(p1[0])) p1++; @@ -2957,6 +2976,9 @@ static int is_object_reference(char *begin, char **endchar) if (p1[0] == 'R') { *endchar = p1+1; + if (id) + *id = t; + return 1; } @@ -2971,6 +2993,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *buf, *outbuf, *res; int likelyutf = 0; unsigned int i; + uint32_t objid; /* * Yes, all of this is required to find the start and end of a potentially UTF-* string @@ -3011,45 +3034,14 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const /* We should be at the start of the string, minus 1 */ p2 = q + objsize; - if (is_object_reference(p1, &p2)) { - unsigned long objnum, revnum; + if (is_object_reference(p1, &p2, &objid)) { struct pdf_obj *newobj; char *end, *begin; STATBUF sb; uint32_t objflags; int fd; - /* - * This is kind of sketchy... This string says it points to another object. - * Try to get/parse the object and return the decoded value as an ASCII/UTF-8 string. - */ - - /* Get the object number */ - objnum = strtoul(p1, &end, 10); - if ((end - p1) == 0) - return NULL; - - /* Skip whitespace and get the revision number */ - p1 = end+1; - while (p1 - q < objsize && isspace(p1[0])) - p1++; - - if (p1 - q == objsize) - return NULL; - - revnum = strtoul(p1, &end, 10); - - p1 = end+1; - while (p1 - q < objsize && isspace(p1[0])) - p1++; - - if (p1 - q == objsize) - return NULL; - - if (p1[0] != 'R') - return NULL; - - newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff)); + newobj = find_obj(pdf, obj, objid); if (!(newobj)) return NULL; @@ -3131,7 +3123,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const newobj->path = NULL; if (endchar) - *endchar = p1; + *endchar = p2; return res; } @@ -3368,7 +3360,7 @@ static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *o p1++; } - is_object_reference(begin, &p1); + is_object_reference(begin, &p1, NULL); val = cli_calloc((p1 - begin) + 2, 1); if (!(val)) @@ -3517,7 +3509,7 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj default: /* This should just be a number or the letter R */ p1 = end; - if (!is_object_reference(begin, &p1)) { + if (!is_object_reference(begin, &p1, NULL)) { p1 = begin+1; while (p1 < end && !isspace(p1[0])) p1++; @@ -3639,12 +3631,14 @@ static void pdf_print_dict(struct pdf_dict *dict, unsigned long depth) struct pdf_dict_node *node; for (node = dict->nodes; node != NULL; node = node->next) { - if (node->type == PDF_DICT_STRING) + if (node->type == PDF_DICT_STRING) { cli_errmsg("dict[%lu][%s]: %s\n", depth, node->key, (char *)(node->value)); - else if (node->type == PDF_DICT_ARRAY) + } else if (node->type == PDF_DICT_ARRAY) { + cli_errmsg("dict[%lu][%s]: Array =>\n", depth, node->key); pdf_print_array((struct pdf_array *)(node->value), depth); - else if (node->type == PDF_DICT_DICT) + } else if (node->type == PDF_DICT_DICT) { pdf_print_dict((struct pdf_dict *)(node->value), depth+1); + } } } @@ -3939,8 +3933,11 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act return; dict = pdf_parse_dict(pdf, obj, objsz, begin, NULL); - if (dict) + if (dict) { + cli_errmsg("==== ==== ==== ====\n"); + pdf_print_dict(dict, 0); pdf_free_dict(dict); + } begin = cli_memstr(objstart, objsz, "/Kids", 5); if (!(begin))