Parse the Author, Producer, Creator, CreationDate, and ModDate PDF flags

pull/6/head
Shawn Webb 11 years ago
parent 75c525ed7d
commit 063f0d2562
  1. 263
      libclamav/pdf.c

@ -45,6 +45,8 @@ static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
#endif
#include <zlib.h>
#include <iconv.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#include "libclamav/crypto.h"
@ -71,6 +73,7 @@ static int asciihexdecode(const char *buf, off_t len, char *output);
static int ascii85decode(const char *buf, off_t len, unsigned char *output);
static const char *pdf_nextlinestart(const char *ptr, size_t len);
static const char *pdf_nextobject(const char *ptr, size_t len);
static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str);
/* PDF statistics callbacks and related */
struct pdf_struct;
@ -97,6 +100,11 @@ static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_acti
static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void print_pdf_stats(struct pdf_struct *);
static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
/* End PDF statistics callbacks and related */
static int xrefCheck(const char *xref, const char *eof)
@ -157,6 +165,11 @@ struct pdf_stats {
int32_t nopenaction; /* Number of OpenAction objects */
int32_t nlaunch; /* Number of Launch objects */
int32_t npage; /* Number of Page objects */
char *author; /* Author of the PDF */
char *creator; /* Application used to create the PDF */
char *producer; /* Application used to produce the PDF */
char *creationdate; /* Date the PDF was created */
char *modificationdate; /* Date the PDF was modified */
};
struct pdf_struct {
@ -1371,7 +1384,12 @@ static struct pdfname_action pdfname_actions[] = {
{"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, OpenAction_cb},
{"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, Launch_cb},
{"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, Page_cb},
{"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL}
{"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL},
{"Author", OBJ_DICT, STATE_NONE, STATE_NONE, Author_cb},
{"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, Producer_cb},
{"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, CreationDate_cb},
{"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, ModificationDate_cb},
{"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, Creator_cb}
};
#define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
@ -2783,6 +2801,130 @@ pdf_nextobject(const char *ptr, size_t len)
return NULL;
}
/*
 * Extract the literal string value that follows a dictionary key.
 *
 * objstart  start of the raw object data to search
 * objsize   number of readable bytes at objstart
 * str       key to look for, including the leading slash (e.g. "/Author")
 *
 * The first occurrence of str is located, then the next '(' after it, then
 * the first ')' that is not preceded by a backslash escape.  The bytes in
 * between are returned raw: escape sequences are NOT decoded and nested,
 * unescaped parentheses are not tracked.
 *
 * If the value contains a NUL byte or a byte above 0x7f it is assumed to be
 * UTF-8 or UTF-16 and is converted to ASCII with iconv on a best-effort
 * basis (whatever prefix converts cleanly is kept).
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(), or NULL if the key or a well-formed non-empty string was not
 * found, or on allocation/conversion failure.
 */
static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str)
{
    static const char *const encodings[] = {
        "UTF-8",
        "UTF-16",
        NULL
    };
    const char *end, *key, *start, *stop, *p;
    size_t keylen, len, i;
    int likelyutf = 0;
    char *outbuf, *res = NULL;

    if (!objstart || !str)
        return NULL;

    keylen = strlen(str);
    /* Need room for the key plus at least "(x)". */
    if (objsize < keylen + 3)
        return NULL;

    end = objstart + objsize;

    /* Locate the key; memcmp keeps every comparison inside the object and
     * is not confused by NUL bytes in binary object data. */
    key = NULL;
    for (p = objstart; (size_t)(end - p) >= keylen; p++) {
        if (!memcmp(p, str, keylen)) {
            key = p;
            break;
        }
    }
    if (!key)
        return NULL;

    /* Find the opening parenthesis of the value. */
    start = key + keylen;
    while (start < end && *start != '(')
        start++;
    if (start >= end)
        return NULL;
    start++;

    /* Find the closing parenthesis.  A backslash escapes the byte that
     * follows it, so "\)" does not end the string while "\\)" does. */
    for (stop = start; stop < end; stop++) {
        if (*stop == ')')
            break;
        if (*stop == '\\') {
            stop++;
            if (stop >= end)
                break;
        }
    }
    if (stop >= end)
        return NULL;

    len = (size_t)(stop - start);
    if (len == 0)
        return NULL;

    for (i = 0; i < len; i++) {
        unsigned char c = (unsigned char)start[i];

        if (c > 0x7f || c == '\0') {
            likelyutf = 1;
            break;
        }
    }

    if (!likelyutf) {
        /* One extra zeroed byte so the result is a valid C string. */
        res = cli_calloc(1, len + 1);
        if (!res)
            return NULL;

        memcpy(res, start, len);
        return res;
    }

    /* ASCII output never needs more bytes than the input; the extra byte
     * leaves room for the terminator even when the output is full. */
    outbuf = cli_calloc(1, len + 1);
    if (!outbuf)
        return NULL;

    for (i = 0; encodings[i] != NULL; i++) {
        /* iconv() takes a non-const input pointer but does not write
         * through it. */
        char *in = (char *)start;
        char *out = outbuf;
        size_t inleft = len;
        size_t outleft = len;
        iconv_t cd;

        cd = iconv_open("ASCII", encodings[i]);
        if (cd == (iconv_t)(-1)) {
            cli_errmsg("Could not initialize iconv\n");
            continue;
        }

        /* The return value is deliberately ignored: a partial conversion
         * is still useful, only "no output at all" counts as failure. */
        iconv(cd, &in, &inleft, &out, &outleft);
        iconv_close(cd);

        if (outleft == len) {
            /* Decoding unsuccessful right from the start */
            continue;
        }

        outbuf[len - outleft] = '\0';
        res = strdup(outbuf);
        break;
    }

    free(outbuf);

    return res;
}
/* PDF statistics */
static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
@ -2928,34 +3070,74 @@ static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_acti
pdf->stats.npage++;
}
/*
 * Name callback for /Author: parse the key's string value out of the object
 * and store it in pdf->stats.author.
 *
 * The stored string is heap-allocated and owned by pdf->stats.  A value left
 * by an earlier object is freed before being replaced, so repeated keys do
 * not leak.  act is unused.
 */
static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
    char *parsed;

    (void)act;

    if (!pdf || !obj)
        return;

    parsed = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author");

    free(pdf->stats.author);
    pdf->stats.author = parsed;
}
/*
 * Name callback for /Creator: parse the key's string value out of the object
 * and store it in pdf->stats.creator.
 *
 * The stored string is heap-allocated and owned by pdf->stats.  A value left
 * by an earlier object is freed before being replaced, so repeated keys do
 * not leak.  act is unused.
 */
static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
    char *parsed;

    (void)act;

    if (!pdf || !obj)
        return;

    parsed = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator");

    free(pdf->stats.creator);
    pdf->stats.creator = parsed;
}
/*
 * Name callback for /ModDate: parse the key's string value out of the object
 * and store it in pdf->stats.modificationdate.
 *
 * The stored string is heap-allocated and owned by pdf->stats.  A value left
 * by an earlier object is freed before being replaced, so repeated keys do
 * not leak.  act is unused.
 */
static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
    char *parsed;

    (void)act;

    if (!pdf || !obj)
        return;

    parsed = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate");

    free(pdf->stats.modificationdate);
    pdf->stats.modificationdate = parsed;
}
/*
 * Name callback for /CreationDate: parse the key's string value out of the
 * object and store it in pdf->stats.creationdate.
 *
 * The stored string is heap-allocated and owned by pdf->stats.  A value left
 * by an earlier object is freed before being replaced, so repeated keys do
 * not leak.  act is unused.
 */
static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
    char *parsed;

    (void)act;

    if (!pdf || !obj)
        return;

    parsed = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate");

    free(pdf->stats.creationdate);
    pdf->stats.creationdate = parsed;
}
/*
 * Name callback for /Producer: parse the key's string value out of the
 * object and store it in pdf->stats.producer.
 *
 * The stored string is heap-allocated and owned by pdf->stats.  A value left
 * by an earlier object is freed before being replaced, so repeated keys do
 * not leak.  act is unused.
 */
static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
{
    char *parsed;

    (void)act;

    if (!pdf || !obj)
        return;

    parsed = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer");

    free(pdf->stats.producer);
    pdf->stats.producer = parsed;
}
/*
 * Dump the per-document counters collected in pdf->stats to the debug log,
 * one line per statistic.  Does nothing when pdf is NULL.
 *
 * Each counter is printed exactly once with a 32-bit conversion; a long
 * conversion (%lu) must not be used here because the counters are not long.
 * NOTE(review): the visible counters are declared int32_t while the
 * conversion is %u; output is identical for non-negative counts, but confirm
 * whether %d / PRId32 is preferred project-wide.
 */
static void print_pdf_stats(struct pdf_struct *pdf)
{
    if (!(pdf))
        return;

    cli_dbgmsg("Statistics collected from PDF:\n");
    cli_dbgmsg(" Invalid Objects:\t\t\t\t%u\n", pdf->stats.ninvalidobjs);
    cli_dbgmsg(" Number of JavaScript Objects:\t\t%u\n", pdf->stats.njs);
    cli_dbgmsg(" Number of Inflate-Encoded Objects:\t\t%u\n", pdf->stats.nflate);
    cli_dbgmsg(" Number of ActiveX Objects:\t\t\t%u\n", pdf->stats.nactivex);
    cli_dbgmsg(" Number of Flash Objects:\t\t\t%u\n", pdf->stats.nflash);
    cli_dbgmsg(" Number of Declared Colors:\t\t\t%u\n", pdf->stats.ncolors);
    cli_dbgmsg(" Number of ASCIIHexEncoded Objects:\t\t%u\n", pdf->stats.nasciihexdecode);
    cli_dbgmsg(" Number of ASCII85Encoded Objects:\t\t%u\n", pdf->stats.nascii85decode);
    cli_dbgmsg(" Number of Embedded Files:\t\t\t%u\n", pdf->stats.nembeddedfile);
    cli_dbgmsg(" Number of Image Objects:\t\t\t%u\n", pdf->stats.nimage);
    cli_dbgmsg(" Number of LZW-Encoded Objects:\t\t%u\n", pdf->stats.nlzw);
    cli_dbgmsg(" Number of RunLengthEncoded Objects:\t%u\n", pdf->stats.nrunlengthdecode);
    cli_dbgmsg(" Number of Fax-Encoded Objects:\t\t%u\n", pdf->stats.nfaxdecode);
    cli_dbgmsg(" Number of JBIG2-Encoded Objects:\t\t%u\n", pdf->stats.njbig2decode);
    cli_dbgmsg(" Number of DCT-Encoded Objects:\t\t%u\n", pdf->stats.ndctdecode);
    cli_dbgmsg(" Number of JPX-Encoded Objects:\t\t%u\n", pdf->stats.njpxdecode);
    cli_dbgmsg(" Number of Crypt-Encoded Objects:\t\t%u\n", pdf->stats.ncrypt);
    cli_dbgmsg(" Number of Standard-Filtered Objects:\t%u\n", pdf->stats.nstandard);
    cli_dbgmsg(" Number of Signed Objects:\t\t\t%u\n", pdf->stats.nsigned);
    cli_dbgmsg(" Number of Open Actions:\t\t\t%u\n", pdf->stats.nopenaction);
    cli_dbgmsg(" Number of Launch Objects:\t\t\t%u\n", pdf->stats.nlaunch);
    cli_dbgmsg(" Number of Objects with /Pages:\t\t%u\n", pdf->stats.npage);
}
static void pdf_export_json(struct pdf_struct *pdf)
@ -2977,6 +3159,41 @@ static void pdf_export_json(struct pdf_struct *pdf)
return;
json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj);
if (pdf->stats.author) {
cli_jsonstr(pdfobj, "Author", pdf->stats.author);
free(pdf->stats.author);
pdf->stats.author = NULL;
}
if (pdf->stats.creator) {
cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
free(pdf->stats.creator);
pdf->stats.creator = NULL;
}
if (pdf->stats.producer) {
cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
free(pdf->stats.producer);
pdf->stats.producer = NULL;
}
if (pdf->stats.modificationdate) {
cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
free(pdf->stats.modificationdate);
pdf->stats.modificationdate = NULL;
}
if (pdf->stats.creationdate) {
cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
free(pdf->stats.creationdate);
pdf->stats.creationdate = NULL;
}
if (pdf->stats.ninvalidobjs)
cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
if (pdf->stats.njs)

Loading…
Cancel
Save