clamav/libclamav/ooxml.c

/*
 * OOXML JSON Internals
 *
 * Copyright (C) 2014-2020 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * Authors: Kevin Lin
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 2 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include "clamav.h"
#include "filetypes.h"
#include "others.h"
#include "unzip.h"
#if HAVE_JSON
#include "json.h"
#endif
#include "json_api.h"
#include "msxml_parser.h"
#include "ooxml.h"

#if HAVE_LIBXML2
#include <libxml/xmlreader.h>
#endif

#if HAVE_LIBXML2 && HAVE_JSON

// clang-format off

/*** OOXML MSDOC ***/
/*
 * Key table handed to cli_msxml_parse_document() by ooxml_parse_document()
 * for the core-properties and extended-properties parts.
 *
 * Each entry is { <name>, <JSON-style name>, <MSXML_* flags> }.
 * NOTE(review): the first column looks like the lower-cased XML element name
 * and the second the key emitted in the JSON output -- confirm against
 * struct key_entry in msxml_parser.h.
 *
 * The two MSXML_JSON_ROOT entries ("coreproperties", "properties") carry the
 * MSXML_JSON_ATTRIB flag; every other regular entry is
 * MSXML_JSON_WRKPTR | MSXML_JSON_VALUE.
 */
static const struct key_entry ooxml_keys[] = {
    { "coreproperties",     "CoreProperties",         MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
    { "title",              "Title",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "subject",            "Subject",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "creator",            "Author",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "keywords",           "Keywords",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "comments",           "Comments",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "description",        "Description",            MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "lastmodifiedby",     "LastAuthor",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "revision",           "Revision",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "created",            "Created",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "modified",           "Modified",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "category",           "Category",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "contentstatus",      "ContentStatus",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "properties",         "ExtendedProperties",     MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
    { "application",        "Application",            MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "appversion",         "AppVersion",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "characters",         "Characters",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "characterswithspaces", "CharactersWithSpaces", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "company",            "Company",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "digsig",             "DigSig",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "docsecurity",        "DocSecurity",            MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    //{ "headingpairs",       "HeadingPairs",           MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "hiddenslides",       "HiddenSlides",           MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "hlinks",             "HLinks",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "hyperlinkbase",      "HyperlinkBase",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "hyperlinkschanged",  "HyperlinksChanged",      MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "lines",              "Lines",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "linksuptodate",      "LinksUpToDate",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "manager",            "Manager",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "mmclips",            "MultimediaClips",        MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "notes",              "Notes",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "pages",              "Pages",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "paragraphs",         "Paragraphs",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "presentationformat", "PresentationFormat",     MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    //{ "properties",         "Properties",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "scalecrop",          "ScaleCrop",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "shareddoc",          "SharedDocs",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "slides",             "Slides",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "template",           "Template",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    //{ "titleofparts",       "TitleOfParts",           MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "totaltime",          "TotalTime",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "words",              "Words",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },

    /* Should NOT Exist */
    /* This is the only entry flagged MSXML_SCAN_B64 (with MSXML_JSON_COUNT). */
    { "bindata",            "BinaryData",             MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT }
};
/* Number of entries in ooxml_keys, passed alongside the table to the parser. */
static size_t num_ooxml_keys = sizeof(ooxml_keys) / sizeof(struct key_entry);

/*** OOXML HWP ***/
/*
 * Key table handed to cli_msxml_parse_document() by ooxml_hwp_cb(), which
 * cli_process_ooxml() runs on "version.xml" and "Contents/content.hpf" of
 * CL_TYPE_OOXML_HWP documents.
 *
 * Entry layout is { <name>, <JSON-style name>, <MSXML_* flags> }.
 * NOTE(review): column meanings are inferred from the values; confirm against
 * struct key_entry in msxml_parser.h.
 */
static const struct key_entry ooxml_hwp_keys[] = {
    { "hcfversion",         "HCFVersion",             MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
    { "package",            "Properties",             MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
    { "metadata",           "Metadata",               MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    { "title",              "Title",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    { "language",           "Language",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
    /* "meta" and "item" are the only entries flagged COUNT | MULTI. */
    { "meta",               "MetaFields",             MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB | MSXML_JSON_VALUE | MSXML_JSON_COUNT | MSXML_JSON_MULTI },
    { "item",               "Contents",               MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB | MSXML_JSON_COUNT | MSXML_JSON_MULTI }
};
/* Number of entries in ooxml_hwp_keys, passed alongside the table to the parser. */
static size_t num_ooxml_hwp_keys = sizeof(ooxml_hwp_keys) / sizeof(struct key_entry);

// clang-format on

/*
 * Charge the size of the file behind `fd` against the scan limits of `ctx`.
 *
 * Returns CL_ESTAT (after logging an error) when the descriptor cannot be
 * stat'ed; otherwise returns whatever cli_updatelimits() reports for the
 * file's st_size.
 */
static cl_error_t ooxml_updatelimits(int fd, cli_ctx *ctx)
{
    STATBUF st;

    if (FSTAT(fd, &st) != -1)
        return cli_updatelimits(ctx, st.st_size);

    cli_errmsg("ooxml_updatelimits: Can't fstat descriptor %d\n", fd);
    return CL_ESTAT;
}

/*
 * Parse one properties part (already extracted to `fd`) with the ooxml_keys
 * table and record the results through cli_msxml_parse_document().
 *
 * Returns:
 *   - the ooxml_updatelimits() result if it is not CL_CLEAN (nothing parsed);
 *   - CL_SUCCESS if libxml2 cannot create a reader for the descriptor;
 *   - otherwise the result of cli_msxml_parse_document().
 */
static cl_error_t ooxml_parse_document(int fd, cli_ctx *ctx)
{
    xmlTextReaderPtr xml;
    cl_error_t status;

    cli_dbgmsg("in ooxml_parse_document\n");

    /* account for this part in the engine limits before touching it */
    status = ooxml_updatelimits(fd, ctx);
    if (status != CL_CLEAN)
        return status;

    xml = xmlReaderForFd(fd, "properties.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
    if (!xml) {
        /* libxml2-internal failure: deliberately not reported as a scan error */
        cli_dbgmsg("ooxml_parse_document: xmlReaderForFd error\n");
        return CL_SUCCESS;
    }

    status = cli_msxml_parse_document(ctx, xml, ooxml_keys, num_ooxml_keys, MSXML_FLAG_JSON, NULL);

    /* success, timeout and break are expected outcomes; warn on anything else */
    if (!(status == CL_SUCCESS || status == CL_ETIMEOUT || status == CL_BREAK))
        cli_warnmsg("ooxml_parse_document: encountered issue in parsing properties document\n");

    xmlTextReaderClose(xml);
    xmlFreeTextReader(xml);

    return status;
}

/*
 * Callback run on the extracted core-properties part.
 *
 * Delegates to ooxml_parse_document() and, for CL_EPARSE / CL_EFORMAT,
 * records a core-specific parse error on ctx->wrkproperty. The parse result
 * is returned unchanged. `filepath` and `name` are unused.
 */
static cl_error_t ooxml_core_cb(int fd, const char *filepath, cli_ctx *ctx, const char *name)
{
    cl_error_t status;

    UNUSEDPARAM(filepath);
    UNUSEDPARAM(name);

    cli_dbgmsg("in ooxml_core_cb\n");

    status = ooxml_parse_document(fd, ctx);
    switch (status) {
        case CL_EPARSE:
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_XMLPARSER");
            break;
        case CL_EFORMAT:
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_CORE_MALFORMED");
            break;
        default:
            /* every other status is passed through without annotation */
            break;
    }

    return status;
}

/*
 * Callback run on the extracted extended-properties part.
 *
 * Delegates to ooxml_parse_document() and, for CL_EPARSE / CL_EFORMAT,
 * records an extended-properties-specific parse error on ctx->wrkproperty.
 * The parse result is returned unchanged. `filepath` and `name` are unused.
 */
static cl_error_t ooxml_extn_cb(int fd, const char *filepath, cli_ctx *ctx, const char *name)
{
    cl_error_t status;

    UNUSEDPARAM(filepath);
    UNUSEDPARAM(name);

    cli_dbgmsg("in ooxml_extn_cb\n");

    status = ooxml_parse_document(fd, ctx);
    switch (status) {
        case CL_EPARSE:
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_XMLPARSER");
            break;
        case CL_EFORMAT:
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EXTN_MALFORMED");
            break;
        default:
            /* every other status is passed through without annotation */
            break;
    }

    return status;
}

static cl_error_t ooxml_content_cb(int fd, const char *filepath, cli_ctx *ctx, const char *name)
{
    cl_error_t ret = CL_SUCCESS;
    int tmp, toval = 0, state;
    int core = 0, extn = 0, cust = 0, dsig = 0;
    int mcore = 0, mextn = 0, mcust = 0;
    const xmlChar *localname, *value, *CT, *PN;
    xmlTextReaderPtr reader = NULL;
    uint32_t loff;

    UNUSEDPARAM(filepath);
    UNUSEDPARAM(name);

    unsigned long sav_scansize    = ctx->scansize;
    unsigned int sav_scannedfiles = ctx->scannedfiles;

    cli_dbgmsg("in ooxml_content_cb\n");

    /* perform engine limit checks in temporary tracking session */
    ret = ooxml_updatelimits(fd, ctx);
    if (ret != CL_CLEAN)
        return ret;

    /* apply a reader to the document */
    reader = xmlReaderForFd(fd, "[Content_Types].xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
    if (reader == NULL) {
        cli_dbgmsg("ooxml_content_cb: xmlReaderForFd error for "
                   "[Content_Types].xml"
                   "\n");
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_FD");

        ctx->scansize     = sav_scansize;
        ctx->scannedfiles = sav_scannedfiles;
        return CL_SUCCESS; // libxml2 failed!
    }

    /* locate core-properties, extended-properties, and custom-properties (optional) */
    while ((state = xmlTextReaderRead(reader)) == 1) {
        if (cli_json_timeout_cycle_check(ctx, &toval) != CL_SUCCESS) {
            ret = CL_ETIMEOUT;
            goto ooxml_content_exit;
        }

        localname = xmlTextReaderConstLocalName(reader);
        if (localname == NULL) continue;

        if (strcmp((const char *)localname, "Override")) continue;

        if (xmlTextReaderHasAttributes(reader) != 1) continue;

        CT = PN = NULL;
        while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
            localname  = xmlTextReaderConstLocalName(reader);
            value = xmlTextReaderConstValue(reader);
            if (localname == NULL || value == NULL) continue;

            if (!xmlStrcmp(localname, (const xmlChar *)"ContentType")) {
                CT = value;
            } else if (!xmlStrcmp(localname, (const xmlChar *)"PartName")) {
                PN = value;
            }

            cli_dbgmsg("%s: %s\n", localname, value);
        }

        if (!CT && !PN) continue;

        if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-package.core-properties+xml")) {
            /* default: /docProps/core.xml*/
            tmp = unzip_search_single(ctx, (const char *)(PN + 1), xmlStrlen(PN) - 1, &loff);
            if (tmp == CL_ETIMEOUT) {
                ret = tmp;
            } else if (tmp != CL_VIRUS) {
                cli_dbgmsg("cli_process_ooxml: failed to find core properties file \"%s\"!\n", PN);
                mcore++;
            } else {
                cli_dbgmsg("ooxml_content_cb: found core properties file \"%s\" @ %x\n", PN, loff);
                if (!core) {
                    tmp = unzip_single_internal(ctx, loff, ooxml_core_cb);
                    if (tmp == CL_ETIMEOUT || tmp == CL_EMEM) {
                        ret = tmp;
                    }
                }
                core++;
            }
        } else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-officedocument.extended-properties+xml")) {
            /* default: /docProps/app.xml */
            tmp = unzip_search_single(ctx, (const char *)(PN + 1), xmlStrlen(PN) - 1, &loff);
            if (tmp == CL_ETIMEOUT) {
                ret = tmp;
            } else if (tmp != CL_VIRUS) {
                cli_dbgmsg("cli_process_ooxml: failed to find extended properties file \"%s\"!\n", PN);
                mextn++;
            } else {
                cli_dbgmsg("ooxml_content_cb: found extended properties file \"%s\" @ %x\n", PN, loff);
                if (!extn) {
                    tmp = unzip_single_internal(ctx, loff, ooxml_extn_cb);
                    if (tmp == CL_ETIMEOUT || tmp == CL_EMEM) {
                        ret = tmp;
                    }
                }
                extn++;
            }
        } else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-officedocument.custom-properties+xml")) {
            /* default: /docProps/custom.xml */
            tmp = unzip_search_single(ctx, (const char *)(PN + 1), xmlStrlen(PN) - 1, &loff);
            if (tmp == CL_ETIMEOUT) {
                ret = tmp;
            } else if (tmp != CL_VIRUS) {
                cli_dbgmsg("cli_process_ooxml: failed to find custom properties file \"%s\"!\n", PN);
                mcust++;
            } else {
                cli_dbgmsg("ooxml_content_cb: found custom properties file \"%s\" @ %x\n", PN, loff);
                /* custom properties are not parsed */
                cust++;
            }
        } else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-package.digital-signature-xmlsignature+xml")) {
            dsig++;
        }

        if (ret != CL_SUCCESS)
            goto ooxml_content_exit;
    }

ooxml_content_exit:
    if (core) {
        cli_jsonint(ctx->wrkproperty, "CorePropertiesFileCount", core);
        if (core > 1)
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CORE_PROPFILES");
    } else if (!mcore)
        cli_dbgmsg("cli_process_ooxml: file does not contain core properties file\n");
    if (mcore) {
        cli_jsonint(ctx->wrkproperty, "CorePropertiesMissingFileCount", mcore);
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CORE_PROPFILES");
    }

    if (extn) {
        cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesFileCount", extn);
        if (extn > 1)
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_EXTN_PROPFILES");
    } else if (!mextn)
        cli_dbgmsg("cli_process_ooxml: file does not contain extended properties file\n");
    if (mextn) {
        cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesMissingFileCount", mextn);
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_EXTN_PROPFILES");
    }

    if (cust) {
        cli_jsonint(ctx->wrkproperty, "CustomPropertiesFileCount", cust);
        if (cust > 1)
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MULTIPLE_CUSTOM_PROPFILES");
    } else if (!mcust)
        cli_dbgmsg("cli_process_ooxml: file does not contain custom properties file\n");
    if (mcust) {
        cli_jsonint(ctx->wrkproperty, "CustomPropertiesMissingFileCount", mcust);
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_MISSING_CUST_PROPFILES");
    }

    if (dsig) {
        cli_jsonint(ctx->wrkproperty, "DigitalSignaturesCount", dsig);
    }

    /* restore the engine tracking limits; resets session limit tracking */
    ctx->scansize     = sav_scansize;
    ctx->scannedfiles = sav_scannedfiles;

    xmlTextReaderClose(reader);
    xmlFreeTextReader(reader);
    return ret;
}

/*
 * Callback run on an extracted HWP part ("version.xml" or
 * "Contents/content.hpf"); parses it with the ooxml_hwp_keys table.
 *
 * Returns:
 *   - the ooxml_updatelimits() result if it is not CL_CLEAN (nothing parsed);
 *   - CL_SUCCESS if libxml2 cannot create a reader for the descriptor;
 *   - otherwise the result of cli_msxml_parse_document().
 * `filepath` and `name` are unused.
 */
static cl_error_t ooxml_hwp_cb(int fd, const char *filepath, cli_ctx *ctx, const char *name)
{
    xmlTextReaderPtr xml;
    cl_error_t status;

    UNUSEDPARAM(filepath);
    UNUSEDPARAM(name);

    cli_dbgmsg("in ooxml_hwp_cb\n");

    /* account for this part in the engine limits before touching it */
    status = ooxml_updatelimits(fd, ctx);
    if (status != CL_CLEAN)
        return status;

    xml = xmlReaderForFd(fd, "ooxml_hwp.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
    if (!xml) {
        /* libxml2-internal failure: deliberately not reported as a scan error */
        cli_dbgmsg("ooxml_hwp_cb: xmlReaderForFd error\n");
        return CL_SUCCESS;
    }

    status = cli_msxml_parse_document(ctx, xml, ooxml_hwp_keys, num_ooxml_hwp_keys, MSXML_FLAG_JSON, NULL);

    /* success, timeout and break are expected outcomes; warn on anything else */
    if (!(status == CL_SUCCESS || status == CL_ETIMEOUT || status == CL_BREAK))
        cli_warnmsg("ooxml_hwp_cb: encountered issue in parsing properties document\n");

    xmlTextReaderClose(xml);
    xmlFreeTextReader(xml);

    return status;
}

#endif /* HAVE_LIBXML2 && HAVE_JSON */

/*
 * Guess which OOXML flavour a zip archive is by searching it for well-known
 * member names.
 *
 * Four names are registered, in this order: "xl/", "ppt/", "word/" and
 * "Contents/content.hpf". When unzip_search() reports a hit (CL_VIRUS),
 * requests.found is the index of the matching request and is mapped to
 * CL_TYPE_OOXML_XL, _PPT, _WORD or _HWP respectively.
 *
 * Returns CL_TYPE_ANY when a request cannot be registered, when nothing
 * matches, or when the reported index is out of range.
 */
cli_file_t cli_ooxml_filetype(cli_ctx *ctx, fmap_t *map)
{
    struct zip_requests requests;
    cl_error_t ret;

    memset(&requests, 0, sizeof(struct zip_requests));

    if ((ret = unzip_search_add(&requests, "xl/", 3)) != CL_SUCCESS) {
        return CL_TYPE_ANY;
    }
    if ((ret = unzip_search_add(&requests, "ppt/", 4)) != CL_SUCCESS) {
        return CL_TYPE_ANY;
    }
    if ((ret = unzip_search_add(&requests, "word/", 5)) != CL_SUCCESS) {
        return CL_TYPE_ANY;
    }
    /*
     * "Contents/content.hpf" is 20 characters long. The length must not
     * exceed the literal, and it now matches the length cli_process_ooxml()
     * passes for the same name.
     */
    if ((ret = unzip_search_add(&requests, "Contents/content.hpf", 20)) != CL_SUCCESS) {
        return CL_TYPE_ANY;
    }

    if ((ret = unzip_search(ctx, map, &requests)) == CL_VIRUS) {
        /* index into the registration order above */
        switch (requests.found) {
            case 0:
                return CL_TYPE_OOXML_XL;
            case 1:
                return CL_TYPE_OOXML_PPT;
            case 2:
                return CL_TYPE_OOXML_WORD;
            case 3:
                return CL_TYPE_OOXML_HWP;
            default:
                return CL_TYPE_ANY;
        }
    }

    return CL_TYPE_ANY;
}

/*
 * Extract document properties from an OOXML container into the JSON
 * properties object (ctx->wrkproperty).
 *
 * With libxml2 and JSON support compiled in:
 *   - type == CL_TYPE_OOXML_HWP: "version.xml" and then, if that succeeded,
 *     "Contents/content.hpf" are located and parsed with ooxml_hwp_cb;
 *   - any other type: "[Content_Types].xml" is located and handed to
 *     ooxml_content_cb.
 *
 * Returns:
 *   - CL_ENULLARG if ctx is NULL;
 *   - CL_ETIMEOUT if a member search times out;
 *   - CL_EFORMAT if a required member cannot be found;
 *   - otherwise the result of the last unzip_single_internal() call.
 * For CL_ETIMEOUT, CL_EMEM, CL_EMAXSIZE and CL_EMAXFILES a matching
 * OOXML_ERROR_* marker is recorded on ctx->wrkproperty.
 *
 * Without libxml2 or JSON support the function only logs what is missing and
 * returns CL_SUCCESS.
 */
cl_error_t cli_process_ooxml(cli_ctx *ctx, int type)
{
#if HAVE_LIBXML2 && HAVE_JSON
    uint32_t loff  = 0;
    cl_error_t ret = CL_SUCCESS;

    cli_dbgmsg("in cli_process_ooxml\n");
    if (!ctx) {
        return CL_ENULLARG;
    }

    if (type == CL_TYPE_OOXML_HWP) {
        /* two files: version.xml and Contents/content.hpf */
        ret = unzip_search_single(ctx, "version.xml", 11, &loff);
        if (ret == CL_ETIMEOUT) {
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
            return CL_ETIMEOUT;
        } else if (ret != CL_VIRUS) {
            /* unzip_search_single() signals a hit with CL_VIRUS */
            cli_dbgmsg("cli_process_ooxml: failed to find "
                       "version.xml"
                       "!\n");
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_NO_HWP_VERSION");
            return CL_EFORMAT;
        }
        ret = unzip_single_internal(ctx, loff, ooxml_hwp_cb);

        /* the content file is only processed when version.xml parsed cleanly */
        if (ret == CL_SUCCESS) {
            ret = unzip_search_single(ctx, "Contents/content.hpf", 20, &loff);
            if (ret == CL_ETIMEOUT) {
                cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
                return CL_ETIMEOUT;
            } else if (ret != CL_VIRUS) {
                cli_dbgmsg("cli_process_ooxml: failed to find "
                           "Contents/content.hpf"
                           "!\n");
                cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_NO_HWP_CONTENT");
                return CL_EFORMAT;
            }
            ret = unzip_single_internal(ctx, loff, ooxml_hwp_cb);
        }
    } else {
        /* find "[Content_Types].xml" */
        ret = unzip_search_single(ctx, "[Content_Types].xml", 19, &loff);
        if (ret == CL_ETIMEOUT) {
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
            return CL_ETIMEOUT;
        } else if (ret != CL_VIRUS) {
            cli_dbgmsg("cli_process_ooxml: failed to find "
                       "[Content_Types].xml"
                       "!\n");
            cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_NO_CONTENT_TYPES");
            return CL_EFORMAT;
        }
        cli_dbgmsg("cli_process_ooxml: found "
                   "[Content_Types].xml"
                   " @ %x\n",
                   loff);

        ret = unzip_single_internal(ctx, loff, ooxml_content_cb);
    }

    /* translate the resource-related failures into JSON parse-error markers */
    if (ret == CL_ETIMEOUT)
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_TIMEOUT");
    else if (ret == CL_EMEM)
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_OUTOFMEM");
    else if (ret == CL_EMAXSIZE)
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXSIZE");
    else if (ret == CL_EMAXFILES)
        cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_EMAXFILES");

    return ret;
#else
    /* neither argument is used when property extraction is compiled out */
    UNUSEDPARAM(ctx);
    UNUSEDPARAM(type);
    cli_dbgmsg("in cli_process_ooxml\n");
#if !HAVE_LIBXML2
    cli_dbgmsg("cli_process_ooxml: libxml2 needs to enabled!\n");
#endif
#if !HAVE_JSON
    cli_dbgmsg("cli_process_ooxml: libjson needs to enabled!\n");
#endif
    return CL_SUCCESS;
#endif
}