Add the new pdfng.c file

pull/6/head
Shawn Webb 11 years ago
parent 224d1c4de0
commit 125067829b
  1. 888
      libclamav/pdfng.c

@ -0,0 +1,888 @@
/*
* Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved.
*
* Author: Shawn Webb
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* In addition, as a special exception, the copyright holders give
* permission to link the code of portions of this program with the
* OpenSSL library under certain conditions as described in each
* individual source file, and distribute linked combinations
* including the two.
*
* You must obey the GNU General Public License in all respects
* for all of the code used other than OpenSSL. If you modify
* file(s) with this exception, you may extend this exception to your
* version of the file(s), but you are not obligated to do so. If you
* do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source
* files in the program, then also delete it here.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <zlib.h>
#if HAVE_ICONV
#include <iconv.h>
#endif
#include <openssl/ssl.h>
#include <openssl/err.h>
#include "libclamav/crypto.h"
#include "clamav.h"
#include "others.h"
#include "pdf.h"
#include "scanners.h"
#include "fmap.h"
#include "str.h"
#include "bytecode.h"
#include "bytecode_api.h"
#include "arc4.h"
#include "rijndael.h"
#include "textnorm.h"
#include "json_api.h"
/*
 * Convert a buffer that is presumed to hold UTF-16 text into a newly
 * allocated, NUL-terminated string.
 *
 * With iconv available the input is copied and run through an
 * "UTF-16" -> "UTF-8" conversion; whatever iconv managed to convert is
 * returned (the iconv() return value is deliberately not checked, so a
 * partial conversion still yields a result). Without iconv the input bytes
 * are copied verbatim and NUL-terminated.
 *
 * begin: start of the input bytes (need not be NUL-terminated)
 * sz:    number of input bytes
 *
 * Returns a heap string the caller must free(), or NULL on allocation
 * failure or when nothing at all could be converted.
 */
char *pdf_convert_utf(char *begin, size_t sz)
{
    char *res = NULL;
#if HAVE_ICONV
    char *buf, *outbuf, *p1, *p2;
    size_t inlen, outlen, outcap, i;
    char *encodings[] = {
        "UTF-16",
        NULL
    };
    iconv_t cd;

    /* Keep the output-capacity arithmetic below from wrapping around. */
    if (sz > ((size_t)-1) / 2 - 1)
        return NULL;

    buf = cli_calloc(1, sz);
    if (!(buf))
        return NULL;
    memcpy(buf, begin, sz);

    /*
     * A 2-byte UTF-16 code unit can expand to 3 bytes of UTF-8, so the
     * output needs up to 1.5x the input size; a buffer of only sz bytes
     * would truncate such text.
     */
    outcap = sz + sz / 2;
    outbuf = cli_calloc(1, outcap + 1);
    if (!(outbuf)) {
        free(buf);
        return NULL;
    }

    for (i = 0; encodings[i] != NULL; i++) {
        p1     = buf;
        p2     = outbuf;
        inlen  = sz;
        outlen = outcap;

        cd = iconv_open("UTF-8", encodings[i]);
        if (cd == (iconv_t)(-1)) {
            cli_errmsg("Could not initialize iconv\n");
            continue;
        }

        iconv(cd, &p1, &inlen, &p2, &outlen);
        iconv_close(cd);

        /* No output produced: decoding failed right from the start */
        if (outlen == outcap)
            continue;

        outbuf[outcap - outlen] = '\0';
        res = strdup(outbuf);
        break;
    }

    free(buf);
    free(outbuf);
    return res;
#else
    /* One zeroed block of sz+1 bytes (the count is 1, not the pointer). */
    res = cli_calloc(1, sz + 1);
    if ((res)) {
        memcpy(res, begin, sz);
        res[sz] = '\0';
    }
    return res;
#endif
}
/*
 * Helper for is_object_reference(): starting at *cursor, skip whitespace and
 * parse one decimal number that is followed by whitespace before `end`.
 * On success stores the value in *out, advances *cursor past the digits and
 * returns 1; otherwise returns 0 and leaves *cursor untouched.
 */
static int pdf_ref_parse_number(char **cursor, char *end, unsigned long *out)
{
    char *p1 = *cursor, *p2;
    unsigned long n;

    /* Skip whitespace */
    while (p1 < end && isspace((unsigned char)p1[0]))
        p1++;

    if (p1 >= end)
        return 0;

    if (!isdigit((unsigned char)p1[0]))
        return 0;

    /*
     * Ensure strtoul() isn't going to go past our buffer: there must be a
     * whitespace byte between the number and the end of the buffer.
     */
    p2 = p1 + 1;
    while (p2 < end && !isspace((unsigned char)p2[0]))
        p2++;

    if (p2 >= end)
        return 0;

    /* Clear errno so that a stale value cannot be mistaken for overflow. */
    errno = 0;
    n = strtoul(p1, &p2, 10);
    if (n == ULONG_MAX && errno)
        return 0;

    *out    = n;
    *cursor = p2;
    return 1;
}

/*
 * Test whether the text at `begin` is an indirect object reference.
 *
 * Object references are always this format:
 *     XXXX YYYY R
 * where XXXX is the object ID and YYYY is the revision ID of the object.
 * The letter R signifies that this is a reference. In between each item
 * can be an arbitrary amount of whitespace.
 *
 * begin:   where to start looking
 * endchar: in: one past the last byte that may be examined;
 *          out (only when a reference is found): one past the 'R'
 * id:      optional out; receives (object ID << 8) | (revision ID & 0xff)
 *
 * Returns 1 if a reference was found, 0 otherwise.
 */
int is_object_reference(char *begin, char **endchar, uint32_t *id)
{
    char *end = *endchar;
    char *p1  = begin;
    unsigned long n;
    uint32_t t;

    /* Object ID */
    if (!pdf_ref_parse_number(&p1, end, &n))
        return 0;
    t = (uint32_t)(n << 8);

    /* Revision ID */
    if (!pdf_ref_parse_number(&p1, end, &n))
        return 0;
    t |= (uint32_t)(n & 0xff);

    /* Skip whitespace before the 'R' marker */
    while (p1 < end && isspace((unsigned char)p1[0]))
        p1++;

    if (p1 >= end)
        return 0;

    if (p1[0] == 'R') {
        *endchar = p1 + 1;
        if (id)
            *id = t;
        return 1;
    }

    return 0;
}
/*
 * Extract a string value from a PDF object buffer and return it as a newly
 * allocated, NUL-terminated heap string.
 *
 * Steps:
 *  1. If `str` is given, find that key inside the buffer and continue right
 *     after it; otherwise start at the beginning of the buffer.
 *  2. If the value is an indirect reference ("N G R"), extract the referenced
 *     object to a temporary file and read the string from that file
 *     (NOTE: this is sketchy behavior).
 *  3. If the value is a hex string "<...>", return it verbatim including the
 *     angle brackets.
 *  4. Otherwise expect a literal string "(...)". If it contains NUL or
 *     non-ASCII bytes it is passed through pdf_convert_utf(); otherwise the
 *     bytes between the parentheses are copied as-is.
 *
 * pdf, obj: parser state and the object being examined
 * objstart: start of the buffer to search
 * objsize:  number of valid bytes at objstart
 * str:      key to look for, or NULL to parse from objstart directly
 * endchar:  optional out; for a literal string it is set to the last content
 *           byte (the one before ')'), for a hex string to the closing '>',
 *           for a reference to one past the 'R'
 *
 * Returns the string (caller frees) or NULL on any failure.
 */
char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
{
    const char *q = objstart;
    char *p1, *p2;
    size_t len, checklen;
    char *res     = NULL;
    int likelyutf = 0;
    unsigned int i;
    uint32_t objid;

    if (str) {
        checklen = strlen(str);

        if (objsize < checklen + 3)
            return NULL;

        for (p1 = (char *)q; (size_t)(p1 - q) < objsize - checklen; p1++)
            if (!strncmp(p1, str, checklen))
                break;

        if ((size_t)(p1 - q) == objsize - checklen)
            return NULL;

        p1 += checklen;
    } else {
        p1 = (char *)q;
    }

    while ((size_t)(p1 - q) < objsize && isspace((unsigned char)p1[0]))
        p1++;

    if ((size_t)(p1 - q) >= objsize)
        return NULL;

    /* We should be at the start of the string, minus 1 */

    p2 = (char *)(q + objsize);
    if (is_object_reference(p1, &p2, &objid)) {
        struct pdf_obj *newobj;
        char *begin;
        STATBUF sb;
        uint32_t objflags;
        int fd, rc;
        int ok = 0;

        newobj = find_obj(pdf, obj, objid);
        if (!(newobj))
            return NULL;

        /* A self-reference would recurse forever */
        if (newobj == obj)
            return NULL;

        /*
         * If pdf_handlename hasn't been called for this object,
         * then parse the object prior to extracting it
         */
        if (!(newobj->statsflags & OBJ_FLAG_PDFNAME_DONE))
            pdf_parseobj(pdf, newobj);

        /*
         * Extract the object. Force pdf_extract_obj() to dump this object,
         * and restore the original flags whether or not extraction worked.
         */
        objflags = newobj->flags;
        newobj->flags |= (1 << OBJ_FORCEDUMP);
        rc = pdf_extract_obj(pdf, newobj, PDF_EXTRACT_OBJ_NONE);
        newobj->flags = objflags;
        if (rc != CL_SUCCESS)
            return NULL;

        if (!(newobj->path))
            return NULL;

        fd = open(newobj->path, O_RDONLY);
        if (fd == -1)
            goto ref_unlink;

        if (FSTAT(fd, &sb))
            goto ref_close;

        if (sb.st_size < 0)
            goto ref_close;

        if (sb.st_size) {
            /*
             * One spare zero byte so the buffer is always NUL-terminated;
             * it may be handed back to the caller as a C string below.
             */
            begin = calloc(1, (size_t)sb.st_size + 1);
            if (!(begin))
                goto ref_close;

            if (read(fd, begin, sb.st_size) != sb.st_size) {
                free(begin);
                goto ref_close;
            }

            switch (begin[0]) {
                case '(':
                case '<':
                    res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL, NULL);
                    free(begin);
                    break;
                default:
                    res = pdf_convert_utf(begin, sb.st_size);
                    if (!(res))
                        res = begin; /* fall back to the raw bytes */
                    else
                        free(begin);
                    break;
            }
        }

        ok = 1;

    ref_close:
        close(fd);
    ref_unlink:
        /* The dumped file is only a temporary; always remove it */
        cli_unlink(newobj->path);
        free(newobj->path);
        newobj->path = NULL;

        if (!ok)
            return NULL;

        if (endchar)
            *endchar = p2;

        return res;
    }

    if (*p1 == '<') {
        /* Hex string: copied verbatim, angle brackets included */
        p2 = p1 + 1;
        while ((size_t)(p2 - q) < objsize && *p2 != '>')
            p2++;

        if ((size_t)(p2 - q) == objsize)
            return NULL;

        res = cli_calloc(1, (p2 - p1) + 2);
        if (!(res))
            return NULL;

        /* Buffer is zeroed and one byte larger, so the copy stays terminated */
        strncpy(res, p1, (p2 - p1) + 1);

        if (endchar)
            *endchar = p2;

        return res;
    }

    /* We should be at the start of a string literal (...) here */
    if (*p1 != '(')
        return NULL;

    /* Make a best effort to find the end of the string and determine if UTF-* */
    p2 = ++p1;
    while (1) {
        int shouldbreak         = 1;
        unsigned int upperlimit = 1;

        while ((size_t)(p2 - q) < objsize && *p2 != ')') {
            if (!likelyutf && (*((unsigned char *)p2) > (unsigned char)0x7f || *p2 == '\0'))
                likelyutf = 1;

            p2++;
        }

        /*
         * ">=" rather than "==": the escape handling below can push p2 more
         * than one byte forward, and *p2 must never be read past the buffer.
         */
        if ((size_t)(p2 - q) >= objsize || *p2 != ')')
            return NULL;

        if (likelyutf)
            upperlimit = 3;

        /*
         * A ')' preceded by a lone backslash is escaped; keep scanning.
         * NOTE(review): p2 is advanced inside this loop while i keeps
         * counting, so with upperlimit == 3 the same backslash can match
         * more than once and p2 may skip several bytes - confirm intent.
         */
        for (i = 0; i <= upperlimit && p2 - i > p1; i++) {
            if (*(p2 - i) == '\\' && *(p2 - i - 1) != '\\') {
                shouldbreak = 0;
                p2++;
            }
        }

        if (shouldbreak) {
            p2--;
            break;
        }
    }

    /* If we're an empty string (), p2 would be at the left paren and p1 would be at the right paren */
    if (p2 < p1)
        return NULL;

    len = (size_t)(p2 - p1) + 1;

    if (likelyutf == 0) {
        /* We're not UTF-*, so just make a copy of the string and return that */
        res = cli_calloc(1, len + 1);
        if (!(res))
            return NULL;

        memcpy(res, p1, len);
        res[len] = '\0';

        if (endchar)
            *endchar = p2;

        return res;
    }

    res = pdf_convert_utf(p1, len);

    if (res && endchar)
        *endchar = p2;

    return res;
}
/*
 * Forward declarations of the cleanup helpers defined further down in this
 * file, so the error path below does not depend on declaration order.
 */
struct pdf_dict;
struct pdf_array;
void pdf_free_dict(struct pdf_dict *dict);
void pdf_free_array(struct pdf_array *array);

/*
 * Parse a PDF dictionary ("<< /Key value ... >>") into a linked list of
 * key/value nodes.
 *
 * pdf, obj: parser state and the object that contains the dictionary
 * objsz:    size in bytes of the object, measured from its start in the map
 * begin:    pointer to the opening "<<" inside the object
 * endchar:  optional out; set to the first '>' of the closing ">>"
 *
 * Values are stored as strings, nested arrays or nested dictionaries.
 * Parsing is best effort: on a malformed or unparseable entry the loop stops
 * and the entries collected so far are returned.
 *
 * Returns a heap-allocated dictionary (release with pdf_free_dict()), or
 * NULL if the arguments are invalid, the dictionary is not properly
 * delimited, or allocation fails.
 */
struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
{
    struct pdf_dict *res       = NULL;
    struct pdf_dict_node *node = NULL;
    const char *objstart;
    char *end;
    unsigned int in_string = 0, ninner = 0;

    /* Sanity checking */
    if (!(pdf) || !(obj) || !(begin))
        return NULL;

    /* Room for at least "<<"; also keeps "objsz - 2" below from wrapping */
    if (objsz < 2)
        return NULL;

    objstart = (const char *)(obj->start + pdf->map);

    if (begin < objstart || (size_t)(begin - objstart) >= objsz - 2)
        return NULL;

    if (begin[0] != '<' || begin[1] != '<')
        return NULL;

    /* Find the end of the dictionary */
    end = begin;
    while ((size_t)(end - objstart) < objsz) {
        if (in_string) {
            if (*end == ')')
                in_string = 0;

            end++;
            continue;
        }

        switch (*end) {
            case '(':
                in_string = 1;
                break;
            case '<':
                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '<')
                    ninner++;
                break;
            case '>':
                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '>')
                    ninner--;
                break;
            case '\\':
                /* Skip the escaped byte */
                end += 2;
                if ((size_t)(end - objstart) >= objsz)
                    return NULL;
                break;
            default:
                break;
        }

        if ((size_t)(end - objstart) <= objsz - 2)
            if (end[0] == '>' && end[1] == '>' && ninner == 0)
                break;

        end++;
    }

    /* More sanity checking */
    if ((size_t)(end - objstart) >= objsz - 1)
        return NULL;

    if (end[0] != '>' || end[1] != '>')
        return NULL;

    res = cli_calloc(1, sizeof(struct pdf_dict));
    if (!(res))
        return NULL;

    /* Loop through each element of the dictionary */
    begin += 2;
    while (begin < end) {
        char *val = NULL, *key = NULL, *p1;
        struct pdf_dict *dict = NULL;
        struct pdf_array *arr = NULL;

        /* Skip any whitespaces */
        while (begin < end && isspace((unsigned char)begin[0]))
            begin++;

        if (begin == end)
            break;

        /* Get the key */
        p1 = begin + 1;
        while (p1 < end && isalpha((unsigned char)p1[0]))
            p1++;

        if (p1 == end)
            break;

        key = cli_calloc((p1 - begin) + 2, 1);
        if (!(key))
            break;

        strncpy(key, begin, p1 - begin);
        key[p1 - begin] = '\0';

        /* Now for the value */
        begin = p1;

        /* Skip any whitespaces */
        while (begin < end && isspace((unsigned char)begin[0]))
            begin++;

        if (begin == end) {
            free(key);
            break;
        }

        switch (begin[0]) {
            case '(':
                /*
                 * The string parser treats `begin` as the start of its
                 * buffer, so give it only the bytes that remain in the
                 * object rather than the full object size.
                 */
                val = pdf_parse_string(pdf, obj, begin, objsz - (size_t)(begin - objstart), NULL, &p1);
                if (val)
                    begin = p1 + 2; /* p1 is the last content byte; skip it and ')' */
                break;
            case '[':
                arr = pdf_parse_array(pdf, obj, objsz, begin, &p1);
                if (arr)
                    begin = p1 + 1; /* p1 is the closing ']' */
                break;
            case '<':
                if ((size_t)(begin - objstart) < objsz - 2 && begin[1] == '<') {
                    dict = pdf_parse_dict(pdf, obj, objsz, begin, &p1);
                    if (dict)
                        begin = p1 + 2; /* p1 is the first '>' of ">>" */
                    break;
                }

                /* Not a dictionary, so a hex string */
                val = pdf_parse_string(pdf, obj, begin, objsz - (size_t)(begin - objstart), NULL, &p1);
                if (val)
                    begin = p1 + 1; /* p1 is the closing '>' */
                break;
            default:
                /* Name, number, keyword or object reference */
                p1 = (begin[0] == '/') ? begin + 1 : begin;
                while (p1 < end) {
                    if (p1[0] == '>' || p1[0] == '/')
                        break;

                    p1++;
                }

                /* If this is "N G R", shrink p1 to just past the 'R' */
                is_object_reference(begin, &p1, NULL);

                val = cli_calloc((p1 - begin) + 2, 1);
                if (!(val))
                    break;

                strncpy(val, begin, p1 - begin);
                val[p1 - begin] = '\0';

                if (p1[0] != '/')
                    begin = p1 + 1;
                else
                    begin = p1;

                break;
        }

        /* Parse error: keep what has been collected so far */
        if (!(val) && !(dict) && !(arr)) {
            free(key);
            break;
        }

        node = cli_calloc(1, sizeof(struct pdf_dict_node));
        if (!(node)) {
            /* Nothing owns the parsed value yet, so release it here */
            free(key);
            free(val);
            if (arr)
                pdf_free_array(arr);
            if (dict)
                pdf_free_dict(dict);
            break;
        }

        if (!(res->nodes)) {
            res->nodes = res->tail = node;
        } else {
            node->prev = res->tail;
            if (res->tail)
                res->tail->next = node;
            res->tail = node;
        }

        node->key = key;
        if ((val)) {
            node->value   = val;
            node->valuesz = strlen(val);
            node->type    = PDF_DICT_STRING;
        } else if ((arr)) {
            node->value   = arr;
            node->valuesz = sizeof(struct pdf_array);
            node->type    = PDF_DICT_ARRAY;
        } else if ((dict)) {
            node->value   = dict;
            node->valuesz = sizeof(struct pdf_dict);
            node->type    = PDF_DICT_DICT;
        }
    }

    if (endchar)
        *endchar = end;

    return res;
}
/*
 * Forward declarations of the cleanup helpers defined further down in this
 * file, so the error path below does not depend on declaration order.
 */
struct pdf_dict;
struct pdf_array;
void pdf_free_dict(struct pdf_dict *dict);
void pdf_free_array(struct pdf_array *array);

/*
 * Parse a PDF array ("[ elem elem ... ]") into a linked list of nodes.
 *
 * pdf, obj: parser state and the object that contains the array
 * objsz:    size in bytes of the object, measured from its start in the map
 * begin:    pointer to the opening '[' inside the object
 * endchar:  optional out; set to the closing ']'
 *
 * Elements are stored as strings, nested dictionaries or nested arrays.
 * Parsing is best effort: on an unparseable element the loop stops and the
 * elements collected so far are returned.
 *
 * Returns a heap-allocated array (release with pdf_free_array()), or NULL
 * if the arguments are invalid, the array is not properly delimited, or
 * allocation fails.
 */
struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
{
    struct pdf_array *res       = NULL;
    struct pdf_array_node *node = NULL;
    const char *objstart;
    char *end;
    int in_string = 0, ninner = 0;

    /* Sanity checking (must happen before pdf/obj are dereferenced) */
    if (!(pdf) || !(obj) || !(begin))
        return NULL;

    objstart = (const char *)(obj->start + pdf->map);

    if (begin < objstart || (size_t)(begin - objstart) >= objsz)
        return NULL;

    if (begin[0] != '[')
        return NULL;

    /* Find the end of the array */
    end = begin;
    while ((size_t)(end - objstart) < objsz) {
        if (in_string) {
            if (*end == ')')
                in_string = 0;

            end++;
            continue;
        }

        switch (*end) {
            case '(':
                in_string = 1;
                break;
            case '[':
                ninner++;
                break;
            case ']':
                ninner--;
                break;
            case '\\':
                /* Skip the escaped byte */
                end += 2;
                if ((size_t)(end - objstart) >= objsz)
                    return NULL;
                break;
            default:
                break;
        }

        if (*end == ']' && ninner == 0)
            break;

        end++;
    }

    /* More sanity checking */
    if ((size_t)(end - objstart) == objsz)
        return NULL;

    if (*end != ']')
        return NULL;

    res = cli_calloc(1, sizeof(struct pdf_array));
    if (!(res))
        return NULL;

    begin++;
    while (begin < end) {
        char *val = NULL, *p1;
        struct pdf_array *arr = NULL;
        struct pdf_dict *dict = NULL;

        while (begin < end && isspace((unsigned char)begin[0]))
            begin++;

        if (begin == end)
            break;

        /*
         * The nested parsers report the position of the closing delimiter
         * (or, for a literal string, of the last content byte). Step past
         * it afterwards so the delimiter is not parsed as a new element.
         */
        switch (begin[0]) {
            case '<':
                if (objsz >= 2 && (size_t)(begin - objstart) < objsz - 2 && begin[1] == '<') {
                    dict = pdf_parse_dict(pdf, obj, objsz, begin, &p1);
                    if (dict)
                        begin = p1 + 2; /* p1 is the first '>' of ">>" */
                    break;
                }

                /* Not a dictionary, so a hex string. */
                val = pdf_parse_string(pdf, obj, begin, objsz - (size_t)(begin - objstart), NULL, &p1);
                if (val)
                    begin = p1 + 1; /* p1 is the closing '>' */
                break;
            case '(':
                /*
                 * The string parser treats `begin` as the start of its
                 * buffer, so give it only the bytes that remain in the
                 * object rather than the full object size.
                 */
                val = pdf_parse_string(pdf, obj, begin, objsz - (size_t)(begin - objstart), NULL, &p1);
                if (val)
                    begin = p1 + 2; /* p1 is the last content byte; skip it and ')' */
                break;
            case '[':
                /* XXX We should have a recursion counter here */
                arr = pdf_parse_array(pdf, obj, objsz, begin, &p1);
                if (arr)
                    begin = p1 + 1; /* p1 is the closing ']' */
                break;
            default:
                /* This should just be a number or the letter R */
                p1 = end;
                if (!is_object_reference(begin, &p1, NULL)) {
                    p1 = begin + 1;
                    while (p1 < end && !isspace((unsigned char)p1[0]))
                        p1++;
                }

                val = cli_calloc((p1 - begin) + 2, 1);
                if (!(val))
                    break;

                strncpy(val, begin, p1 - begin);
                val[p1 - begin] = '\0';

                begin = p1;
                break;
        }

        /* Parse error, just return what we could */
        if (!(val) && !(arr) && !(dict))
            break;

        node = calloc(1, sizeof(struct pdf_array_node));
        if (!(node)) {
            /* Nothing owns the parsed element yet, so release it here */
            free(val);
            if (arr)
                pdf_free_array(arr);
            if (dict)
                pdf_free_dict(dict);
            break;
        }

        if (!(res->nodes)) {
            res->nodes = res->tail = node;
        } else {
            node->prev = res->tail;
            if (res->tail)
                res->tail->next = node;
            res->tail = node;
        }

        if (val != NULL) {
            node->type   = PDF_ARR_STRING;
            node->data   = val;
            node->datasz = strlen(val);
        } else if (dict != NULL) {
            node->type   = PDF_ARR_DICT;
            node->data   = dict;
            node->datasz = sizeof(struct pdf_dict);
        } else {
            node->type   = PDF_ARR_ARRAY;
            node->data   = arr;
            node->datasz = sizeof(struct pdf_array);
        }
    }

    if (endchar)
        *endchar = end;

    return res;
}
/*
 * Release a dictionary built by pdf_parse_dict(): every node's key, its
 * value (string, nested array or nested dictionary), the node itself and
 * finally the dictionary header.
 *
 * dict: dictionary to free; NULL is accepted and ignored, matching
 *       pdf_free_array().
 */
void pdf_free_dict(struct pdf_dict *dict)
{
    struct pdf_dict_node *node, *next;

    if (!(dict))
        return;

    node = dict->nodes;
    while (node != NULL) {
        free(node->key);

        if (node->type == PDF_DICT_STRING)
            free(node->value);
        else if (node->type == PDF_DICT_ARRAY)
            pdf_free_array((struct pdf_array *)(node->value));
        else if (node->type == PDF_DICT_DICT)
            pdf_free_dict((struct pdf_dict *)(node->value));

        /* Read the link before the node is released */
        next = node->next;
        free(node);
        node = next;
    }

    free(dict);
}
/*
 * Release an array built by pdf_parse_array(): each node's payload (nested
 * array, nested dictionary, or plain heap data), the node itself and finally
 * the array header. A NULL array is ignored.
 */
void pdf_free_array(struct pdf_array *array)
{
    struct pdf_array_node *cur, *following;

    if (array == NULL)
        return;

    for (cur = array->nodes; cur != NULL; cur = following) {
        switch (cur->type) {
            case PDF_ARR_ARRAY:
                pdf_free_array((struct pdf_array *)(cur->data));
                break;
            case PDF_ARR_DICT:
                pdf_free_dict((struct pdf_dict *)(cur->data));
                break;
            default:
                /* Anything else is treated as a plain heap buffer */
                free(cur->data);
                break;
        }

        /* Grab the link before the node goes away */
        following = cur->next;
        free(cur);
    }

    free(array);
}
void pdf_print_array(struct pdf_array *array, unsigned long depth)
{
struct pdf_array_node *node;
unsigned long i;
for (i=0, node = array->nodes; node != NULL; node = node->next, i++) {
if (node->type == PDF_ARR_STRING)
cli_errmsg("array[%lu][%lu]: %s\n", depth, i, (char *)(node->data));
else
pdf_print_array((struct pdf_array *)(node->data), depth+1);
}
}
/*
 * Debug helper: log the contents of a parsed dictionary through
 * cli_errmsg(). String entries are printed as "dict[depth][key]: value";
 * array entries get a header line and are then printed at the same depth;
 * nested dictionaries are printed recursively at depth + 1.
 */
void pdf_print_dict(struct pdf_dict *dict, unsigned long depth)
{
    struct pdf_dict_node *entry = dict->nodes;

    while (entry != NULL) {
        switch (entry->type) {
            case PDF_DICT_STRING:
                cli_errmsg("dict[%lu][%s]: %s\n", depth, entry->key, (char *)(entry->value));
                break;
            case PDF_DICT_ARRAY:
                cli_errmsg("dict[%lu][%s]: Array =>\n", depth, entry->key);
                pdf_print_array((struct pdf_array *)(entry->value), depth);
                break;
            case PDF_DICT_DICT:
                pdf_print_dict((struct pdf_dict *)(entry->value), depth + 1);
                break;
            default:
                /* Other node types produce no output */
                break;
        }

        entry = entry->next;
    }
}
Loading…
Cancel
Save