Add explanitory comments to the new pdf_parse_string function

11 years ago · 76aa0fc55a
parent 6606d05000
commit 76aa0fc55a
1 changed files with 34 additions and 2 deletions
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@ -2832,7 +2832,14 @@ static char *pdf_parse_string(struct pdf_struct *pdf, const char *objstart, size

    checklen = strlen(str);

-    /* Yes, all of this is required to find the start and end of a potentially UTF-* string */
+    /* Yes, all of this is required to find the start and end of a potentially UTF-* string
+     *
+     * First, find the key of the key/value pair we're looking for in this object.
+     * Second, determine whether the value points to another object (NOTE: this is sketchy behavior)
+     * Third, attempt to determine if we're ASCII or UTF-*
+     * If we're ASCII, just copy the ASCII string into a new heap-allocated string and return that
+     * Fourth, Attempt to decode from UTF-* to ASCII
+     */

    for (p1=(char *)q; (p1 - q) < objsize-checklen; p1++)
        if (!strncmp(p1, str, checklen))
@ -2853,21 +2860,44 @@ static char *pdf_parse_string(struct pdf_struct *pdf, const char *objstart, size
    if ((p1 - q) == objsize)
        return NULL;

+    /* We should be at the start of the string, minus 1 */
+
    if (isdigit(p1[0])) {
-        unsigned long objnum;
+        unsigned long objnum, revnum;
        char *end;

+        /*
+         * This is kind of sketchy... This string says it points to another object.
+         * Try to get/parse the object and return the decoded value as an ASCII string.
+         */
+
+        /* Get the object number */
        objnum = strtoul(p1, &end, 10);
        if ((end - p1) == 0)
            return NULL;

+        /* Skip whitespaces */
+        p1 = end;
+        while ((p1 - q) < objsize && isspace(p1[0]))
+            p1++;
+
+        if (p1 - q == objsize)
+            return NULL;
+
+        /* Get revision number */
+        revnum = strtoul(p1, &end, 10);
+        if ((end - p1) == 0)
+            return NULL;
+
        if (objnum > pdf->nobjs)
            return NULL;

+        /* TODO: Parse the object, decode it if necessary, and return the string */
        res = NULL;
        return res;
    }

+    /* Make a best effort to find the end of the string and determine if UTF-* */
    p2 = ++p1;
    while (1) {
        int shouldbreak=1;
@ -2905,6 +2935,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, const char *objstart, size
    len = inlen = outlen = (size_t)(p2 - p1) + 1;

    if (likelyutf == 0) {
+        /* We're not UTF-*, so just make a copy of the string and return that */
        res = cli_calloc(1, len);
        if (!(res))
            return NULL;
@ -2914,6 +2945,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, const char *objstart, size
    }

 #if HAVE_ICONV
+    /* We should be UTF-* from this point on, try to decode */
    buf = cli_calloc(1, inlen);
    if (!(buf))
        return NULL;