From 063f0d256263dbf201c42a39a1897b3b5611cb3c Mon Sep 17 00:00:00 2001
From: Shawn Webb
Date: Fri, 23 May 2014 14:06:35 -0400
Subject: [PATCH] Parse the Author, Producer, Creator, CreationDate, and ModDate PDF flags

---
Review notes (text in this position is ignored by git am):
- pdf_parse_string: the key search and both parenthesis scans are bounded so
  that no byte at or past objstart + objsize is read.
- pdf_parse_string: results are always NUL-terminated (the plain copy had no
  terminator; the iconv output could be terminated one byte past its buffer).
- pdf_parse_string: iconv lengths are reset for every encoding tried,
  one-character strings are accepted, and a backslash in the first position
  of the string is honoured.
- *_cb: obj is checked, and an already stored string is no longer overwritten
  (which leaked it).
- Hunk offsets and the diffstat were updated for the added lines.
- This copy was rebuilt from text with collapsed whitespace: indentation and
  blank context lines are reconstructed, and the header names on the context
  #include lines and the author's address are still missing.

 libclamav/pdf.c | 297 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 274 insertions(+), 23 deletions(-)

diff --git a/libclamav/pdf.c b/libclamav/pdf.c
index 86bdbe448..2ba35e727 100644
--- a/libclamav/pdf.c
+++ b/libclamav/pdf.c
@@ -45,6 +45,8 @@ static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
 #endif
 #include
 
+#include <iconv.h>
+
 #include
 #include
 #include "libclamav/crypto.h"
@@ -71,6 +73,7 @@ static int asciihexdecode(const char *buf, off_t len, char *output);
 static int ascii85decode(const char *buf, off_t len, unsigned char *output);
 static const char *pdf_nextlinestart(const char *ptr, size_t len);
 static const char *pdf_nextobject(const char *ptr, size_t len);
+static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str);
 
 /* PDF statistics callbacks and related */
 struct pdf_struct;
@@ -97,6 +100,11 @@ static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_acti
 static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
 static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
 static void print_pdf_stats(struct pdf_struct *);
+static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
+static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
+static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
+static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
+static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
 /* End PDF statistics callbacks and related */
 
 static int xrefCheck(const char *xref, const char *eof)
@@ -157,6 +165,11 @@ struct pdf_stats {
     int32_t nopenaction; /* Number of OpenAction objects */
     int32_t nlaunch; /* Number of Launch objects */
     int32_t npage; /* Number of Page objects */
+    char *author; /* Author of the PDF */
+    char *creator; /* Application used to create the PDF */
+    char *producer; /* Application used to produce the PDF */
+    char *creationdate; /* Date the PDF was created */
+    char *modificationdate; /* Date the PDF was modified */
 };
 
 struct pdf_struct {
@@ -1371,7 +1384,12 @@ static struct pdfname_action pdfname_actions[] = {
     {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, OpenAction_cb},
     {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, Launch_cb},
     {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, Page_cb},
-    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL}
+    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL},
+    {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, Author_cb},
+    {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, Producer_cb},
+    {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, CreationDate_cb},
+    {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, ModificationDate_cb},
+    {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, Creator_cb}
 };
 
 #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
@@ -2783,6 +2801,149 @@ pdf_nextobject(const char *ptr, size_t len)
     return NULL;
 }
 
+/**
+ * Extract the literal string value that follows a key in a PDF object.
+ *
+ * Searches the objsize bytes at objstart for the key str (e.g. "/Author"),
+ * then for the next '(' and the ')' that closes it.  A ')' preceded by a
+ * backslash is treated as escaped and skipped.  A value containing NUL or
+ * non-ASCII bytes is assumed to be UTF-8 or UTF-16 and is converted to ASCII
+ * with iconv; any other value is copied verbatim.
+ *
+ * NOTE(review): the first '(' after the key is taken as the start of the
+ * value, so a key whose value is not a literal string picks up the next
+ * literal string in the object instead.
+ *
+ * Returns a newly allocated NUL-terminated string that the caller must
+ * free(), or NULL when the key or a complete, non-empty string is not found,
+ * when allocation fails, or when no conversion produced any output.
+ */
+static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str)
+{
+    size_t keylen = strlen(str);
+    size_t pos, start;
+    size_t inlen, outlen, len;
+    char *inbuf, *outbuf, *in, *out, *res = NULL;
+    iconv_t cd;
+    int likelyutf = 0;
+    unsigned int i;
+    const char *encodings[] = {
+        "UTF-8",
+        "UTF-16",
+        NULL
+    };
+
+    if (objsize < keylen + 3)
+        return NULL;
+
+    /* Locate the key without ever comparing past the end of the object */
+    for (pos = 0; pos + keylen <= objsize; pos++)
+        if (!memcmp(objstart + pos, str, keylen))
+            break;
+
+    if (pos + keylen > objsize)
+        return NULL;
+
+    /* Find the '(' that opens the literal string */
+    while (pos < objsize && objstart[pos] != '(')
+        pos++;
+
+    if (pos >= objsize)
+        return NULL;
+
+    start = ++pos;
+    while (1) {
+        int shouldbreak = 1;
+        unsigned int upperlimit = 1;
+
+        while (pos < objsize && objstart[pos] != ')') {
+            if (!likelyutf && ((unsigned char)objstart[pos] > 0x7f || objstart[pos] == '\0'))
+                likelyutf = 1;
+
+            pos++;
+        }
+
+        /* Unterminated string: objstart[objsize] must never be read */
+        if (pos >= objsize)
+            return NULL;
+
+        if (likelyutf)
+            upperlimit = 3;
+
+        /* A backslash just before the ')' marks it as escaped: step past it */
+        for (i = 0; i <= upperlimit && pos - i >= start; i++) {
+            if (objstart[pos - i] == '\\') {
+                shouldbreak = 0;
+                pos++;
+            }
+        }
+
+        if (shouldbreak)
+            break;
+    }
+
+    /* pos indexes the closing ')'; only the empty string "()" is rejected */
+    if (pos == start)
+        return NULL;
+
+    len = pos - start;
+
+    if (likelyutf == 0) {
+        /* The extra zeroed byte keeps the copy NUL-terminated */
+        res = cli_calloc(1, len + 1);
+        if (!(res))
+            return NULL;
+
+        memcpy(res, objstart + start, len);
+        return res;
+    }
+
+    inbuf = cli_calloc(1, len);
+    if (!(inbuf))
+        return NULL;
+
+    memcpy(inbuf, objstart + start, len);
+
+    /* One spare byte so the terminator fits even if iconv fills the buffer */
+    outbuf = cli_calloc(1, len + 1);
+    if (!(outbuf)) {
+        free(inbuf);
+        return NULL;
+    }
+
+    for (i = 0; encodings[i] != NULL; i++) {
+        /* iconv advances its arguments, so every attempt starts afresh */
+        in = inbuf;
+        out = outbuf;
+        inlen = outlen = len;
+
+        cd = iconv_open("ASCII", encodings[i]);
+        if (cd == (iconv_t)(-1)) {
+            cli_errmsg("Could not initialize iconv\n");
+            continue;
+        }
+
+        /* The result is not checked: partial output is kept as well */
+        iconv(cd, &in, &inlen, &out, &outlen);
+        iconv_close(cd);
+
+        if (outlen == len) {
+            /* Decoding unsuccessful right from the start */
+            continue;
+        }
+
+        outbuf[len - outlen] = '\0';
+
+        res = strdup(outbuf);
+        break;
+    }
+
+    free(inbuf);
+    free(outbuf);
+
+    return res;
+}
+
 /* PDF statistics */
 static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
 {
@@ -2928,34 +3089,89 @@ static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_acti
     pdf->stats.npage++;
 }
 
+/* Store the string that follows "/Author" in obj as the author statistic. */
+static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
+{
+    if (!(pdf) || !(obj))
+        return;
+
+    /* Keep the first value: overwriting would leak the earlier string */
+    if (!(pdf->stats.author))
+        pdf->stats.author = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author");
+}
+
+/* Store the string that follows "/Creator" in obj as the creator statistic. */
+static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
+{
+    if (!(pdf) || !(obj))
+        return;
+
+    /* Keep the first value: overwriting would leak the earlier string */
+    if (!(pdf->stats.creator))
+        pdf->stats.creator = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator");
+}
+
+/* Store the string that follows "/ModDate" in obj as the modification date statistic. */
+static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
+{
+    if (!(pdf) || !(obj))
+        return;
+
+    /* Keep the first value: overwriting would leak the earlier string */
+    if (!(pdf->stats.modificationdate))
+        pdf->stats.modificationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate");
+}
+
+/* Store the string that follows "/CreationDate" in obj as the creation date statistic. */
+static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
+{
+    if (!(pdf) || !(obj))
+        return;
+
+    /* Keep the first value: overwriting would leak the earlier string */
+    if (!(pdf->stats.creationdate))
+        pdf->stats.creationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate");
+}
+
+/* Store the string that follows "/Producer" in obj as the producer statistic. */
+static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
+{
+    if (!(pdf) || !(obj))
+        return;
+
+    /* Keep the first value: overwriting would leak the earlier string */
+    if (!(pdf->stats.producer))
+        pdf->stats.producer = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer");
+}
+
 static void print_pdf_stats(struct pdf_struct *pdf)
 {
     if (!(pdf))
         return;
 
     cli_dbgmsg("Statistics collected from PDF:\n");
-    cli_dbgmsg(" Invalid Objects:\t\t\t\t%lu\n", pdf->stats.ninvalidobjs);
-    cli_dbgmsg(" Number of JavaScript Objects:\t\t%lu\n", pdf->stats.njs);
-    cli_dbgmsg(" Number of Inflate-Encoded Objects:\t\t%lu\n", pdf->stats.nflate);
-    cli_dbgmsg(" Number of ActiveX Objects:\t\t\t%lu\n", pdf->stats.nactivex);
-    cli_dbgmsg(" Number of Flash Objects:\t\t\t%lu\n", pdf->stats.nflash);
-    cli_dbgmsg(" Number of Declared Colors:\t\t\t%lu\n", pdf->stats.ncolors);
-    cli_dbgmsg(" Number of ASCIIHexEncoded Objects:\t\t%lu\n", pdf->stats.nasciihexdecode);
-    cli_dbgmsg(" Number of ASCII85Encoded Objects:\t\t%lu\n", pdf->stats.nascii85decode);
-    cli_dbgmsg(" Number of Embedded Files:\t\t\t%lu\n", pdf->stats.nembeddedfile);
-    cli_dbgmsg(" Number of Image Objects:\t\t\t%lu\n", pdf->stats.nimage);
-    cli_dbgmsg(" Number of LZW-Encoded Objects:\t\t%lu\n", pdf->stats.nlzw);
-    cli_dbgmsg(" Number of RunLengthEncoded Objects:\t%lu\n", pdf->stats.nrunlengthdecode);
-    cli_dbgmsg(" Number of Fax-Encoded Objects:\t\t%lu\n", pdf->stats.nfaxdecode);
-    cli_dbgmsg(" Number of JBIG2-Encoded Objects:\t\t%lu\n", pdf->stats.njbig2decode);
-    cli_dbgmsg(" Number of DCT-Encoded Objects:\t\t%lu\n", pdf->stats.ndctdecode);
-    cli_dbgmsg(" Number of JPX-Encoded Objects:\t\t%lu\n", pdf->stats.njpxdecode);
-    cli_dbgmsg(" Number of Crypt-Encoded Objects:\t\t%lu\n", pdf->stats.ncrypt);
-    cli_dbgmsg(" Number of Standard-Filtered Objects:\t%lu\n", pdf->stats.nstandard);
-    cli_dbgmsg(" Number of Signed Objects:\t\t\t%lu\n", pdf->stats.nsigned);
-    cli_dbgmsg(" Number of Open Actions:\t\t\t%lu\n", pdf->stats.nopenaction);
-    cli_dbgmsg(" Number of Launch Objects:\t\t\t%lu\n", pdf->stats.nlaunch);
-    cli_dbgmsg(" Number of Objects with /Pages:\t\t%lu\n", pdf->stats.npage);
+    cli_dbgmsg(" Invalid Objects:\t\t\t\t%u\n", pdf->stats.ninvalidobjs);
+    cli_dbgmsg(" Number of JavaScript Objects:\t\t%u\n", pdf->stats.njs);
+    cli_dbgmsg(" Number of Inflate-Encoded Objects:\t\t%u\n", pdf->stats.nflate);
+    cli_dbgmsg(" Number of ActiveX Objects:\t\t\t%u\n", pdf->stats.nactivex);
+    cli_dbgmsg(" Number of Flash Objects:\t\t\t%u\n", pdf->stats.nflash);
+    cli_dbgmsg(" Number of Declared Colors:\t\t\t%u\n", pdf->stats.ncolors);
+    cli_dbgmsg(" Number of ASCIIHexEncoded Objects:\t\t%u\n", pdf->stats.nasciihexdecode);
+    cli_dbgmsg(" Number of ASCII85Encoded Objects:\t\t%u\n", pdf->stats.nascii85decode);
+    cli_dbgmsg(" Number of Embedded Files:\t\t\t%u\n", pdf->stats.nembeddedfile);
+    cli_dbgmsg(" Number of Image Objects:\t\t\t%u\n", pdf->stats.nimage);
+    cli_dbgmsg(" Number of LZW-Encoded Objects:\t\t%u\n", pdf->stats.nlzw);
+    cli_dbgmsg(" Number of RunLengthEncoded Objects:\t%u\n", pdf->stats.nrunlengthdecode);
+    cli_dbgmsg(" Number of Fax-Encoded Objects:\t\t%u\n", pdf->stats.nfaxdecode);
+    cli_dbgmsg(" Number of JBIG2-Encoded Objects:\t\t%u\n", pdf->stats.njbig2decode);
+    cli_dbgmsg(" Number of DCT-Encoded Objects:\t\t%u\n", pdf->stats.ndctdecode);
+    cli_dbgmsg(" Number of JPX-Encoded Objects:\t\t%u\n", pdf->stats.njpxdecode);
+    cli_dbgmsg(" Number of Crypt-Encoded Objects:\t\t%u\n", pdf->stats.ncrypt);
+    cli_dbgmsg(" Number of Standard-Filtered Objects:\t%u\n", pdf->stats.nstandard);
+    cli_dbgmsg(" Number of Signed Objects:\t\t\t%u\n", pdf->stats.nsigned);
+    cli_dbgmsg(" Number of Open Actions:\t\t\t%u\n", pdf->stats.nopenaction);
+    cli_dbgmsg(" Number of Launch Objects:\t\t\t%u\n", pdf->stats.nlaunch);
+    cli_dbgmsg(" Number of Objects with /Pages:\t\t%u\n", pdf->stats.npage);
 }
 
 static void pdf_export_json(struct pdf_struct *pdf)
@@ -2977,6 +3193,41 @@ static void pdf_export_json(struct pdf_struct *pdf)
         return;
 
     json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj);
+    if (pdf->stats.author) {
+        cli_jsonstr(pdfobj, "Author", pdf->stats.author);
+
+        free(pdf->stats.author);
+        pdf->stats.author = NULL;
+    }
+
+    if (pdf->stats.creator) {
+        cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
+
+        free(pdf->stats.creator);
+        pdf->stats.creator = NULL;
+    }
+
+    if (pdf->stats.producer) {
+        cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
+
+        free(pdf->stats.producer);
+        pdf->stats.producer = NULL;
+    }
+
+    if (pdf->stats.modificationdate) {
+        cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
+
+        free(pdf->stats.modificationdate);
+        pdf->stats.modificationdate = NULL;
+    }
+
+    if (pdf->stats.creationdate) {
+        cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
+
+        free(pdf->stats.creationdate);
+        pdf->stats.creationdate = NULL;
+    }
+
     if (pdf->stats.ninvalidobjs)
         cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
     if (pdf->stats.njs)