ooxml: adjusted to reuse code used in msxml_parser

remotes/push_mirror/klin/altstr-yara
Kevin Lin 10 years ago
parent ab9611d4c1
commit d8f7468163
  1. 448
      libclamav/ooxml.c

@ -31,7 +31,7 @@
#include "json.h"
#endif
#include "json_api.h"
#include "msxml_parser.h"
#include "ooxml.h"
#if HAVE_LIBXML2
@ -47,362 +47,55 @@
#if HAVE_LIBXML2 && HAVE_JSON
#define OOXML_JSON_RECLEVEL 16
#define OOXML_JSON_RECLEVEL_MAX 5
#define OOXML_JSON_STRLEN_MAX 100
/*
 * Leave the enclosing function early based on a reader status value:
 *   -1 -> log a warning and return CL_EPARSE
 *    0 -> log a debug message and return CL_BREAK
 *   any other value falls through and execution continues.
 *
 * The argument is parenthesized so that an expression argument is compared
 * as a whole. Note that the macro expands to `return` statements and may
 * evaluate its argument twice, so pass a plain variable without side effects.
 */
#define check_state(state)                                                   \
    do {                                                                     \
        if ((state) == -1) {                                                 \
            cli_warnmsg("check_state[ooxml]: CL_EPARSE @ ln%d\n", __LINE__); \
            return CL_EPARSE;                                                \
        }                                                                    \
        else if ((state) == 0) {                                             \
            cli_dbgmsg("check_state[ooxml]: CL_BREAK @ ln%d\n", __LINE__);   \
            return CL_BREAK;                                                 \
        }                                                                    \
    } while(0)
/*
 * Decide whether the first `len` bytes of `value` form a base-10 integer.
 *
 * value: NUL-terminated text to test (NULL is treated as "not an integer").
 * len:   number of bytes the integer must span exactly.
 * val:   receives the parsed value on success; untouched on failure.
 *
 * Returns 1 when strtol() consumed at least one character and stopped exactly
 * at value + len, 0 otherwise. Input from which strtol() consumes nothing
 * (for example an empty string) is rejected instead of being reported as 0.
 */
static int ooxml_is_int(const char *value, size_t len, int32_t *val)
{
    long parsed;
    char *endptr = NULL;

    if (value == NULL || val == NULL) {
        return 0;
    }

    parsed = strtol(value, &endptr, 10);

    /* endptr == value means no conversion was performed at all */
    if (endptr == value || endptr != value + len) {
        return 0;
    }

    /* NOTE(review): only the low 16 bits are kept, so values above 65535 and
     * negative values are altered; preserved as-is, confirm this is intended. */
    *val = (int32_t)(parsed & 0x0000ffff);
    return 1;
}
/*
 * Record a parse-error tag in the "ParseErrors" JSON array of `wrkptr`.
 *
 * wrkptr: JSON object that holds (or will hold) the "ParseErrors" array.
 * errstr: error tag to append.
 *
 * Returns CL_ENULLARG when wrkptr is NULL, CL_EMEM when the array cannot be
 * obtained from cli_jsonarray(), otherwise whatever cli_jsonstr() returns.
 */
static int ooxml_add_parse_error(json_object *wrkptr, const xmlChar *errstr)
{
    json_object *error_list = NULL;

    if (NULL == wrkptr) {
        return CL_ENULLARG;
    }

    error_list = cli_jsonarray(wrkptr, "ParseErrors");
    if (!error_list) {
        return CL_EMEM;
    }

    /* a NULL key appends the string to the array itself */
    return cli_jsonstr(error_list, NULL, errstr);
}
/*
 * Append the text value of an XML node to the JSON array `arrname` of `wrkptr`.
 *
 * The value is stored as a JSON integer when ooxml_is_int() accepts it, as a
 * JSON boolean when it is exactly "true" or "false", and as a JSON string
 * otherwise.
 *
 * wrkptr:     JSON object that owns the target array.
 * arrname:    name of the array inside wrkptr.
 * node_value: text to convert; a NULL value is rejected rather than handed to
 *             the string routines.
 *
 * Returns CL_SUCCESS on success, CL_ENULLARG when wrkptr or node_value is
 * NULL, CL_EMEM when the array or the new JSON value cannot be created.
 */
static int ooxml_parse_value(json_object *wrkptr, const char *arrname, const xmlChar *node_value)
{
    json_object *newobj, *arrobj;
    int32_t val; /* matches the int32_t * expected by ooxml_is_int() */

    if (!wrkptr || !node_value)
        return CL_ENULLARG;

    arrobj = cli_jsonarray(wrkptr, arrname);
    if (arrobj == NULL) {
        return CL_EMEM;
    }

    if (ooxml_is_int((const char *)node_value, xmlStrlen(node_value), &val)) {
        newobj = json_object_new_int(val);
    }
    else if (!xmlStrcmp(node_value, (const xmlChar *)"true")) {
        newobj = json_object_new_boolean(1);
    }
    else if (!xmlStrcmp(node_value, (const xmlChar *)"false")) {
        newobj = json_object_new_boolean(0);
    }
    else {
        newobj = json_object_new_string((const char *)node_value);
    }

    if (NULL == newobj) {
        cli_errmsg("ooxml_parse_value: no memory for json value for [%s]\n", arrname);
        return CL_EMEM;
    }

    json_object_array_add(arrobj, newobj);
    return CL_SUCCESS;
}
/*
 * Lower-case XML element names recognized by ooxml_check_key().
 *
 * ooxml_check_key() compares a node name case-insensitively against each
 * entry and returns the string stored at the same index in ooxml_json_keys,
 * so both tables must keep the same order and length (see num_ooxml_keys).
 *
 * NOTE(review): "properties" is listed twice. The lookup is a linear search
 * that returns the first match, so the second occurrence is never selected.
 */
static const char *ooxml_keys[] = {
"coreproperties",
"title",
"subject",
"creator",
"keywords",
"comments",
"description",
"lastmodifiedby",
"revision",
"created",
"modified",
"category",
"contentstatus",
"properties",
"application",
"appversion",
"characters",
"characterswithspaces",
"company",
"digsig",
"docsecurity",
//"headingpairs",
"hiddenslides",
"hlinks",
"hyperlinkbase",
"hyperlinkschanged",
"lines",
"linksuptodate",
"manager",
"mmclips",
"notes",
"pages",
"paragraphs",
"presentationformat",
"properties",
"scalecrop",
"shareddoc",
"slides",
"template",
//"titlesofparts",
"totaltime",
"words"
};
/*
 * JSON key names returned by ooxml_check_key().
 *
 * Entry i is the JSON name used for the XML element name at index i of the
 * string table ooxml_keys; the two tables must stay index-aligned.
 */
static const char *ooxml_json_keys[] = {
"CoreProperties",
"Title",
"Subject",
"Author",
"Keywords",
"Comments",
"Description",
"LastAuthor",
"Revision",
"Created",
"Modified",
"Category",
"ContentStatus",
"ExtendedProperties",
"Application",
"AppVersion",
"Characters",
"CharactersWithSpaces",
"Company",
"DigSig",
"DocSecurity",
//"HeadingPairs",
"HiddenSlides",
"HLinks",
"HyperlinkBase",
"HyperlinksChanged",
"Lines",
"LinksUpToDate",
"Manager",
"MultimediaClips",
"Notes",
"Pages",
"Paragraphs",
"PresentationFormat",
"Properties",
"ScaleCrop",
"SharedDoc",
"Slides",
"Template",
//"TitlesOfParts",
"TotalTime",
"Words"
/*
 * Table of OOXML property elements handed to cli_msxml_parse_document().
 *
 * Each initializer lists a lower-case XML element name, the JSON key used for
 * it, and a combination of MSXML_* flags. The field names of struct key_entry
 * and the meaning of the flags are declared outside this file (presumably in
 * msxml_parser.h) - consult that header before changing an entry.
 */
static const struct key_entry ooxml_keys[] = {
{ "coreproperties", "CoreProperties", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
{ "title", "Title", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "subject", "Subject", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "creator", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "keywords", "Keywords", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "comments", "Comments", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "description", "Description", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "lastmodifiedby", "LastAuthor", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "revision", "Revision", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "created", "Created", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "modified", "Modified", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "category", "Category", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "contentstatus", "ContentStatus", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "properties", "ExtendedProperties", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
{ "application", "Application", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "appversion", "AppVersion", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "characters", "Characters", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "characterswithspaces", "CharactersWithSpaces", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "company", "Company", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "digsig", "DigSig", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "docsecurity", "DocSecurity", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
//{ "headingpairs", "HeadingPairs", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "hiddenslides", "HiddenSlides", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "hlinks", "HLinks", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "hyperlinkbase", "HyperlinkBase", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "hyperlinkschanged", "HyperlinksChanged", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "lines", "Lines", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "linksuptodate", "LinksUpToDate", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "manager", "Manager", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "mmclips", "MultimediaClips", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "notes", "Notes", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "pages", "Pages", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "paragraphs", "Paragraphs", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "presentationformat", "PresentationFormat", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
//{ "properties", "Properties", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "scalecrop", "ScaleCrop", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
/* NOTE(review): JSON key is plural ("SharedDocs") although the element name is "shareddoc" - confirm the spelling is intended */
{ "shareddoc", "SharedDocs", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "slides", "Slides", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "template", "Template", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
//{ "titleofparts", "TitleOfParts", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "totaltime", "TotalTime", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
{ "words", "Words", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
/* Should NOT Exist */
{ "bindata", "BinaryData", MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT }
};
/*
 * Number of table entries scanned by ooxml_check_key().
 * Hard-coded: it must be adjusted by hand whenever an entry is added, removed
 * or un-commented in the key tables (40 active entries; 42 if the two
 * commented-out entries are restored).
 */
static size_t num_ooxml_keys = 40; //42
/*
 * Translate an XML element name into its JSON key name.
 *
 * key:    element name to look up (compared case-insensitively).
 * keylen: length of key in bytes.
 *
 * Returns the entry of ooxml_json_keys that shares its index with the first
 * matching entry of ooxml_keys, or NULL when the name is too long or unknown.
 */
static const char *ooxml_check_key(const char* key, size_t keylen)
{
    size_t idx;

    /* names that could not fit the JSON string limit are rejected outright */
    if (keylen >= OOXML_JSON_STRLEN_MAX) {
        cli_dbgmsg("ooxml_check_key: key name too long\n");
        return NULL;
    }

    for (idx = 0; idx < num_ooxml_keys; idx++) {
        const char *candidate = ooxml_keys[idx];

        /* lengths must agree before the case-insensitive comparison counts */
        if (strlen(candidate) != keylen)
            continue;

        if (strncasecmp(key, candidate, keylen) == 0)
            return ooxml_json_keys[idx];
    }

    return NULL;
}
/*
 * Recursively convert the XML element the reader is positioned on into a
 * JSON object attached to `wrkptr`.
 *
 * ctx:    scan context; used here only for cli_json_timeout_cycle_check().
 * reader: libxml2 text reader; must be positioned on an element node.
 * wrkptr: JSON object under which this element's object is created.
 * rlvl:   current recursion depth (the top-level call passes 0).
 * root:   JSON object of the depth-0 element; at rlvl 0 it is replaced by the
 *         object created for this element.
 *
 * Returns CL_SUCCESS when the element was converted or skipped, CL_EFORMAT for
 * structural problems (not an element, unknown or mismatched end tag),
 * CL_EMEM when a JSON object cannot be created, CL_ETIMEOUT when the timeout
 * check fails, and CL_EPARSE / CL_BREAK when a reader call reports -1 / 0
 * (see check_state). Errors from nested calls are returned unchanged.
 */
static int ooxml_parse_element(cli_ctx *ctx, xmlTextReaderPtr reader, json_object *wrkptr, int rlvl, json_object *root)
{
const char *element_tag = NULL, *end_tag = NULL;
const xmlChar *node_name = NULL, *node_value = NULL;
json_object *thisjobj = NULL;
int node_type, ret = CL_SUCCESS, endtag = 0, toval = 0, state = 1;
cli_dbgmsg("in ooxml_parse_element @ layer %d\n", rlvl);
/* check recursion level */
if (rlvl >= OOXML_JSON_RECLEVEL_MAX) {
cli_dbgmsg("ooxml_parse_element: reached ooxml json recursion limit\n");
/* flag the limit on the depth-0 object so it is visible in the output */
cli_jsonbool(root, "HitRecursiveLimit", 1);
/* skip it */
state = xmlTextReaderNext(reader);
check_state(state);
return CL_SUCCESS;
}
/* acquire element type */
node_type = xmlTextReaderNodeType(reader);
if (node_type == -1)
return CL_EPARSE;
if (node_type != XML_READER_TYPE_ELEMENT) {
cli_dbgmsg("ooxml_parse_element: first node typed %d, not %d\n", node_type, XML_READER_TYPE_ELEMENT);
return CL_EFORMAT; /* first type is not an element */
}
node_name = xmlTextReaderConstLocalName(reader);
if (!node_name) {
cli_dbgmsg("ooxml_parse_element: element tag node nameless\n");
return CL_EPARSE; /* no name, nameless */
}
/* map the XML name to its JSON key; unknown elements are not an error */
element_tag = ooxml_check_key((const char *)node_name, xmlStrlen(node_name));
if (!element_tag) {
cli_dbgmsg("ooxml_parse_element: invalid element tag [%s]\n", node_name);
/* skip it */
state = xmlTextReaderNext(reader);
check_state(state);
return CL_SUCCESS;
}
/* generate json object */
thisjobj = cli_jsonobj(wrkptr, element_tag);
if (!thisjobj) {
return CL_EMEM;
}
cli_dbgmsg("ooxml_parse_element: generated json object [%s]\n", element_tag);
/* the outermost element becomes the root passed down to nested calls */
if (rlvl == 0)
root = thisjobj;
/* handle attributes */
state = xmlTextReaderHasAttributes(reader);
if (state == 1) {
json_object *attributes;
attributes = cli_jsonobj(thisjobj, "Attributes");
if (!attributes) {
/* NOTE(review): the same cli_jsonobj failure above returns CL_EMEM; confirm CL_EPARSE is intended here */
return CL_EPARSE;
}
cli_dbgmsg("ooxml_parse_element: retrieved json object [Attributes]\n");
/* copy every attribute that has both a name and a value as a JSON string */
while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
const xmlChar *name, *value;
name = xmlTextReaderConstLocalName(reader);
value = xmlTextReaderConstValue(reader);
if (name == NULL || value == NULL) continue;
cli_dbgmsg("%s: %s\n", name, value);
cli_jsonstr(attributes, name, (const char *)value);
}
}
else if (state == -1)
return CL_EPARSE;
/* NOTE(review): after the attribute loop the reader may still be positioned on an attribute;
 * confirm xmlTextReaderIsEmptyElement reports correctly without moving back to the element */
state = xmlTextReaderIsEmptyElement(reader);
if (state == 1) {
/* empty element: nothing to parse inside, move past it */
state = xmlTextReaderNext(reader);
check_state(state);
return CL_SUCCESS;
}
else if (state == -1)
return CL_EPARSE;
/* advance to first content node */
state = xmlTextReaderRead(reader);
check_state(state);
/* parse until the end element tag */
while (!endtag) {
if (cli_json_timeout_cycle_check(ctx, &toval) != CL_SUCCESS) {
return CL_ETIMEOUT;
}
node_type = xmlTextReaderNodeType(reader);
if (node_type == -1)
return CL_EPARSE;
switch (node_type) {
case XML_READER_TYPE_ELEMENT:
/* child element: recurse one level deeper, attaching to this element's object */
ret = ooxml_parse_element(ctx, reader, thisjobj, rlvl+1, root);
if (ret != CL_SUCCESS) {
return ret;
}
break;
case XML_READER_TYPE_END_ELEMENT:
cli_dbgmsg("in ooxml_parse_element @ layer %d closed\n", rlvl);
node_name = xmlTextReaderConstLocalName(reader);
if (!node_name) {
cli_dbgmsg("ooxml_parse_element: element end tag node nameless\n");
return CL_EPARSE; /* no name, nameless */
}
end_tag = ooxml_check_key((const char *)node_name, xmlStrlen(node_name));
if (!end_tag) {
cli_dbgmsg("ooxml_parse_element: invalid element end tag [%s]\n", node_name);
return CL_EFORMAT; /* unrecognized element tag */
}
/* the mapped JSON names are compared, and only over strlen(element_tag) bytes (prefix match) */
if (strncmp(element_tag, end_tag, strlen(element_tag))) {
cli_dbgmsg("ooxml_parse_element: element tag does not match end tag\n");
return CL_EFORMAT;
}
/* advance to next element tag */
state = xmlTextReaderRead(reader);
check_state(state);
endtag = 1;
break;
case XML_READER_TYPE_TEXT:
/* text content is appended to this element's "Value" array */
node_value = xmlTextReaderConstValue(reader);
ret = ooxml_parse_value(thisjobj, "Value", node_value);
if (ret != CL_SUCCESS)
return ret;
cli_dbgmsg("ooxml_parse_element: added json value [%s: %s]\n", element_tag, node_value);
/* advance to next element tag */
state = xmlTextReaderRead(reader);
check_state(state);
break;
default:
#if OOXML_DEBUG
node_name = xmlTextReaderConstLocalName(reader);
node_value = xmlTextReaderConstValue(reader);
cli_dbgmsg("ooxml_parse_element: unhandled xml node %s [%d]: %s\n", node_name, node_type, node_value);
#endif
/* NOTE(review): any other node type ends this call (return, not break) before the
 * element's end tag has been consumed - confirm this is the intended behavior */
state = xmlTextReaderNext(reader);
check_state(state);
return CL_SUCCESS;
}
}
return CL_SUCCESS;
}
/* Entry count of the ooxml_keys table, derived from its size; passed to
 * cli_msxml_parse_document() together with the table. */
static size_t num_ooxml_keys = sizeof(ooxml_keys) / sizeof(struct key_entry);
static int ooxml_updatelimits(int fd, cli_ctx *ctx)
{
@ -433,14 +126,7 @@ static int ooxml_parse_document(int fd, cli_ctx *ctx)
return CL_SUCCESS; // internal error from libxml2
}
/* move reader to first element */
if (xmlTextReaderRead(reader) != 1) {
xmlTextReaderClose(reader);
xmlFreeTextReader(reader);
return CL_SUCCESS; /* libxml2 failed */
}
ret = ooxml_parse_element(ctx, reader, ctx->wrkproperty, 0, NULL);
ret = cli_msxml_parse_document(ctx, reader, ooxml_keys, num_ooxml_keys, 1);
if (ret != CL_SUCCESS && ret != CL_ETIMEOUT && ret != CL_BREAK)
cli_warnmsg("ooxml_parse_document: encountered issue in parsing properties document\n");
@ -457,9 +143,9 @@ static int ooxml_core_cb(int fd, cli_ctx *ctx)
cli_dbgmsg("in ooxml_core_cb\n");
ret = ooxml_parse_document(fd, ctx);
if (ret == CL_EPARSE)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_XMLPARSER");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_XMLPARSER");
else if (ret == CL_EFORMAT)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_MALFORMED");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_MALFORMED");
return ret;
}
@ -471,9 +157,9 @@ static int ooxml_extn_cb(int fd, cli_ctx *ctx)
cli_dbgmsg("in ooxml_extn_cb\n");
ret = ooxml_parse_document(fd, ctx);
if (ret == CL_EPARSE)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_XMLPARSER");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_XMLPARSER");
else if (ret == CL_EFORMAT)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_MALFORMED");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_MALFORMED");
return ret;
}
@ -501,7 +187,7 @@ static int ooxml_content_cb(int fd, cli_ctx *ctx)
reader = xmlReaderForFd(fd, "[Content_Types].xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
if (reader == NULL) {
cli_dbgmsg("ooxml_content_cb: xmlReaderForFd error for ""[Content_Types].xml""\n");
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_FD");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_FD");
ctx->scansize = sav_scansize;
ctx->scannedfiles = sav_scannedfiles;
@ -610,37 +296,37 @@ static int ooxml_content_cb(int fd, cli_ctx *ctx)
if (core) {
cli_jsonint(ctx->wrkproperty, "CorePropertiesFileCount", core);
if (core > 1)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CORE_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CORE_PROPFILES");
}
else if (!mcore)
cli_dbgmsg("cli_process_ooxml: file does not contain core properties file\n");
if (mcore) {
cli_jsonint(ctx->wrkproperty, "CorePropertiesMissingFileCount", mcore);
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CORE_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CORE_PROPFILES");
}
if (extn) {
cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesFileCount", extn);
if (extn > 1)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_EXTN_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_EXTN_PROPFILES");
}
else if (!mextn)
cli_dbgmsg("cli_process_ooxml: file does not contain extended properties file\n");
if (mextn) {
cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesMissingFileCount", mextn);
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_EXTN_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_EXTN_PROPFILES");
}
if (cust) {
cli_jsonint(ctx->wrkproperty, "CustomPropertiesFileCount", cust);
if (cust > 1)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CUSTOM_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CUSTOM_PROPFILES");
}
else if (!mcust)
cli_dbgmsg("cli_process_ooxml: file does not contain custom properties file\n");
if (mcust) {
cli_jsonint(ctx->wrkproperty, "CustomPropertiesMissingFileCount", mcust);
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CUST_PROPFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CUST_PROPFILES");
}
if (dsig) {
@ -704,25 +390,25 @@ int cli_process_ooxml(cli_ctx *ctx)
/* find "[Content_Types].xml" */
tmp = unzip_search_single(ctx, "[Content_Types].xml", 18, &loff);
if (tmp == CL_ETIMEOUT) {
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
return CL_ETIMEOUT;
}
else if (tmp != CL_VIRUS) {
cli_dbgmsg("cli_process_ooxml: failed to find ""[Content_Types].xml""!\n");
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_NO_CONTENT_TYPES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_NO_CONTENT_TYPES");
return CL_EFORMAT;
}
cli_dbgmsg("cli_process_ooxml: found ""[Content_Types].xml"" @ %x\n", loff);
tmp = unzip_single_internal(ctx, loff, ooxml_content_cb);
if (tmp == CL_ETIMEOUT)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
else if (tmp == CL_EMEM)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_OUTOFMEM");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_OUTOFMEM");
else if (tmp == CL_EMAXSIZE)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXSIZE");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXSIZE");
else if (tmp == CL_EMAXFILES)
ooxml_add_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXFILES");
cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXFILES");
return tmp;
#else

Loading…
Cancel
Save