clamav/libclamav/filetypes.c

/*
 *  Copyright (C) 2007-2008 Sourcefire, Inc.
 *
 *  Authors: Tomasz Kojm
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef	HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "clamav.h"
#include "filetypes.h"
#include "others.h"
#include "readdb.h"
#include "matcher-ac.h"
#include "str.h"
#include "textdet.h"
#include "default.h"

#include "htmlnorm.h"
#include "entconv.h"
#include "mpool.h"

/*
 * Table pairing each file type name, spelled as text, with its cli_file_t
 * code. cli_ftcode() searches it linearly with strcmp(), so the order of
 * the entries does not affect lookups. The last entry has a NULL name and
 * marks the end of the table.
 */
static const struct ftmap_s {
    const char *name;	/* textual type name, e.g. "CL_TYPE_ZIP" */
    cli_file_t code;	/* code returned by cli_ftcode() for that name */
} ftmap[] = {
    { "CL_TYPE_TEXT_ASCII",	CL_TYPE_TEXT_ASCII	},
    { "CL_TYPE_TEXT_UTF8",	CL_TYPE_TEXT_UTF8	},
    { "CL_TYPE_TEXT_UTF16LE",	CL_TYPE_TEXT_UTF16LE	},
    { "CL_TYPE_TEXT_UTF16BE",	CL_TYPE_TEXT_UTF16BE	},
    { "CL_TYPE_BINARY_DATA",	CL_TYPE_BINARY_DATA	},
    { "CL_TYPE_IGNORED",	CL_TYPE_IGNORED		},
    { "CL_TYPE_ANY",		0			}, /* for ft-sigs */
    { "CL_TYPE_MSEXE",		CL_TYPE_MSEXE		},
    { "CL_TYPE_ELF",		CL_TYPE_ELF		},
    { "CL_TYPE_MACHO",		CL_TYPE_MACHO		},
    { "CL_TYPE_MACHO_UNIBIN",	CL_TYPE_MACHO_UNIBIN	},
    { "CL_TYPE_POSIX_TAR",	CL_TYPE_POSIX_TAR	},
    { "CL_TYPE_OLD_TAR",	CL_TYPE_OLD_TAR		},
    { "CL_TYPE_CPIO_OLD",	CL_TYPE_CPIO_OLD	},
    { "CL_TYPE_CPIO_ODC",	CL_TYPE_CPIO_ODC	},
    { "CL_TYPE_CPIO_NEWC",	CL_TYPE_CPIO_NEWC	},
    { "CL_TYPE_CPIO_CRC",	CL_TYPE_CPIO_CRC	},
    { "CL_TYPE_GZ",		CL_TYPE_GZ		},
    { "CL_TYPE_ZIP",		CL_TYPE_ZIP		},
    { "CL_TYPE_BZ",		CL_TYPE_BZ		},
    { "CL_TYPE_RAR",		CL_TYPE_RAR		},
    { "CL_TYPE_ARJ",		CL_TYPE_ARJ		},
    { "CL_TYPE_MSSZDD",		CL_TYPE_MSSZDD		},
    { "CL_TYPE_MSOLE2",		CL_TYPE_MSOLE2		},
    { "CL_TYPE_MSCAB",		CL_TYPE_MSCAB		},
    { "CL_TYPE_MSCHM",		CL_TYPE_MSCHM		},
    { "CL_TYPE_SIS",		CL_TYPE_SIS		},
    { "CL_TYPE_SCRENC",		CL_TYPE_SCRENC		},
    { "CL_TYPE_GRAPHICS",	CL_TYPE_GRAPHICS	},
    { "CL_TYPE_RIFF",		CL_TYPE_RIFF		},
    { "CL_TYPE_BINHEX",		CL_TYPE_BINHEX		},
    { "CL_TYPE_TNEF",		CL_TYPE_TNEF		},
    { "CL_TYPE_CRYPTFF",	CL_TYPE_CRYPTFF		},
    { "CL_TYPE_PDF",		CL_TYPE_PDF		},
    { "CL_TYPE_UUENCODED",	CL_TYPE_UUENCODED	},
    { "CL_TYPE_HTML_UTF16",	CL_TYPE_HTML_UTF16	},
    { "CL_TYPE_SCRIPT",         CL_TYPE_SCRIPT          },
    { "CL_TYPE_RTF",		CL_TYPE_RTF		},
    { "CL_TYPE_HTML",		CL_TYPE_HTML		},
    { "CL_TYPE_MAIL",		CL_TYPE_MAIL		},
    { "CL_TYPE_SFX",		CL_TYPE_SFX		},
    { "CL_TYPE_ZIPSFX",		CL_TYPE_ZIPSFX		},
    { "CL_TYPE_RARSFX",		CL_TYPE_RARSFX		},
    { "CL_TYPE_CABSFX",		CL_TYPE_CABSFX		},
    { "CL_TYPE_ARJSFX",		CL_TYPE_ARJSFX		},
    { "CL_TYPE_NULSFT",		CL_TYPE_NULSFT		},
    { "CL_TYPE_AUTOIT",		CL_TYPE_AUTOIT		},
    { "CL_TYPE_ISHIELD_MSI",	CL_TYPE_ISHIELD_MSI	},
    { "CL_TYPE_7Z",		CL_TYPE_7Z		},
    /* end-of-table sentinel: the lookup loop stops on the NULL name, so
     * the code stored here is never handed back by cli_ftcode() */
    { NULL,			CL_TYPE_IGNORED		}
};

/*
 * Translate a textual file type name (e.g. "CL_TYPE_ZIP") into its
 * cli_file_t code.
 *
 * The name is compared with strcmp() against every entry of ftmap[], so the
 * match is exact and case-sensitive. Returns the code of the first matching
 * entry, or CL_TYPE_ERROR when the name is not present in the table.
 */
cli_file_t cli_ftcode(const char *name)
{
	unsigned int idx = 0;

    /* ftmap[] is closed by an entry whose name is NULL */
    while(ftmap[idx].name != NULL) {
	if(strcmp(ftmap[idx].name, name) == 0)
	    return ftmap[idx].code;
	idx++;
    }

    return CL_TYPE_ERROR;
}

/*
 * Release every node of the engine's file type list (engine->ftypes).
 *
 * For each node the magic bytes, the type name and finally the node itself
 * are handed to mpool_free() together with engine->mempool. The list head
 * stored in the engine is not modified by this function.
 */
void cli_ftfree(const struct cl_engine *engine)
{
	struct cli_ftype *node, *successor;

    for(node = engine->ftypes; node; node = successor) {
	/* remember the next node before this one is released */
	successor = node->next;
	mpool_free(engine->mempool, node->magic);
	mpool_free(engine->mempool, node->tname);
	mpool_free(engine->mempool, node);
    }
}

cli_file_t cli_filetype(const unsigned char *buf, size_t buflen, const struct cl_engine *engine)
{
	struct cli_ftype *ftype = engine->ftypes;


    while(ftype) {
	if(ftype->offset + ftype->length <= buflen) {
	    if(!memcmp(buf + ftype->offset, ftype->magic, ftype->length)) {
		cli_dbgmsg("Recognized %s file\n", ftype->tname);
		return ftype->type;
	    }
	}
	ftype = ftype->next;
    }

    return cli_texttype(buf, buflen);
}

/* Forward declaration; the definition is not in this part of the file.
 * cli_filetype2() treats a return value of 1 as an old fashioned tar
 * archive and 2 as a POSIX tar archive; any other value is ignored there. */
int is_tar(unsigned char *buf, unsigned int nbytes);

/*
 * Determine the file type of the data readable from descriptor desc.
 *
 * Up to MAGIC_BUFFER_SIZE bytes are read from desc and classified with
 * cli_filetype(). When that result lies in the range
 * CL_TYPE_TEXT_ASCII..CL_TYPE_BINARY_DATA it is refined in up to three
 * passes, each using the matcher in engine->root[0] with AC_SCAN_FT:
 *   1. scan the raw buffer; a result >= CL_TYPENO replaces the type;
 *   2. otherwise scan the output of cli_utf16toascii(); an HTML hit gives
 *      CL_TYPE_HTML_UTF16;
 *   3. otherwise, if PHISHING_CONF_ENTCONV is set in engine->dconf and
 *      encoding_detect_bom() reports an encoding, scan the output of
 *      encoding_normalize_toascii(); an HTML hit gives CL_TYPE_HTML.
 * Finally, data still classed as CL_TYPE_BINARY_DATA is tested by is_tar().
 *
 * Returns the detected type, or CL_TYPE_ERROR when engine is NULL or
 * cli_readn() returns -1. If engine->root[0] is NULL or cli_ac_initdata()
 * returns non-zero, the type determined up to that point is returned as is.
 *
 * NOTE(review): this function reads from desc and does not reposition it;
 * callers presumably take care of the file offset - confirm.
 */
cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
{
	/* one byte more than is ever read, so the 0 written at buff[bread]
	 * below always fits */
	unsigned char buff[MAGIC_BUFFER_SIZE + 1], *decoded;
	int bread, sret;
	cli_file_t ret = CL_TYPE_BINARY_DATA;
	struct cli_matcher *root;
	struct cli_ac_data mdata;


    if(!engine) {
	cli_errmsg("cli_filetype2: engine == NULL\n");
	return CL_TYPE_ERROR;
    }

    memset(buff, 0, sizeof(buff));
    bread = cli_readn(desc, buff, MAGIC_BUFFER_SIZE);
    if(bread == -1)
	return CL_TYPE_ERROR;
    buff[bread] = 0;

    /* first guess: magic bytes, falling back to text detection */
    ret = cli_filetype(buff, bread, engine);

    if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
	/* HTML files may contain special characters and could be
	 * misidentified as BINARY_DATA by cli_filetype()
	 */
	root = engine->root[0];
	if(!root)
	    return ret;

	/* non-zero from cli_ac_initdata(): stop refining, keep the guess */
	if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))
	    return ret;

	/* pass 1: scan the raw bytes (AC_SCAN_FT presumably selects file
	 * type signature matching - confirm in matcher-ac) */
	sret = cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, engine->root[0], &mdata, 0, ret, NULL, AC_SCAN_FT, NULL);

	cli_ac_freedata(&mdata);

	/* a scan result at or above CL_TYPENO is taken as a file type code */
	if(sret >= CL_TYPENO) {
	    ret = sret;
	} else {
	    /* mdata was freed above, so it is set up again for pass 2 */
	    if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))
		return ret;

	    /* pass 2: retry on the buffer converted by cli_utf16toascii();
	     * the result is measured with strlen() and released with free() */
	    decoded = (unsigned char *) cli_utf16toascii((char *) buff, bread);
	    if(decoded) {
		sret = cli_ac_scanbuff(decoded, strlen((char *) decoded), NULL, NULL, NULL,  engine->root[0], &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL);
		free(decoded);
		if(sret == CL_TYPE_HTML)
		    ret = CL_TYPE_HTML_UTF16;
	    }
	    cli_ac_freedata(&mdata);

	    /* pass 3: only when entity conversion is enabled in dconf and
	     * pass 2 did not already find UTF-16 HTML */
	    if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
		    const char* encoding;

		    /* Check whether the encoding can be autodetected.
		     * If it cannot, do not try to detect the HTML signature
		     * again: that was just attempted above and failed. */
		    if((encoding = encoding_detect_bom(buff, bread))) {
			    /* output buffer is twice the size of the input buffer */
			    unsigned char decodedbuff[sizeof(buff)*2];
			    m_area_t in_area, out_area;

			    in_area.buffer = (unsigned char *) buff;
			    in_area.length = bread;
			    in_area.offset = 0;
			    out_area.buffer = decodedbuff;
			    out_area.length = sizeof(decodedbuff);
			    out_area.offset = 0;

			    /* In htmlnorm we simply skip over \0 chars, which allows HTML in any
			     * unicode encoding to be parsed (multibyte characters will not be
			     * handled exactly, but that is not a problem).
			     * However, when detecting whether a file is HTML or not, an exact
			     * conversion is needed: just eliminating zeros and matching would
			     * introduce false positives. */
			    if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
				    if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))
					    return ret;

				    /* NOTE(review): this repeats the out_area.length > 0 test of
				     * the enclosing condition; nothing in between touches out_area */
				    if(out_area.length > 0) {
					    sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, engine->root[0], &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
					    if(sret == CL_TYPE_HTML) {
						    cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
						    /* htmlnorm is able to handle any unicode now, since it skips null chars */
						    ret = CL_TYPE_HTML;
					    }
				    }

				    cli_ac_freedata(&mdata);
			    }
		    }
	    }
	}
    }

    /* still plain binary data: check whether it is a tar archive; any
     * is_tar() result other than 1 or 2 leaves ret unchanged */
    if(ret == CL_TYPE_BINARY_DATA) {
	switch(is_tar(buff, bread)) {
	    case 1:
		ret = CL_TYPE_OLD_TAR;
		cli_dbgmsg("Recognized old fashioned tar file\n");
		break;
	    case 2:
		ret = CL_TYPE_POSIX_TAR;
		cli_dbgmsg("Recognized POSIX tar file\n");
		break;
	}
    }

    return ret;
}
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`/*`
update copyrights and stick more files to GPLv2; move and add more credits to the AUTHORS file; add COPYING.BSD git-svn: trunk@3749 17 years ago			`* Copyright (C) 2007-2008 Sourcefire, Inc.`
improved filetype detection code git-svn: trunk@3421 18 years ago			`*`
update copyrights and stick more files to GPLv2; move and add more credits to the AUTHORS file; add COPYING.BSD git-svn: trunk@3749 17 years ago			`* Authors: Tomasz Kojm`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`*`
			`* This program is free software; you can redistribute it and/or modify`
update some copyrights and stick to GPL v2 git-svn: trunk@3003 18 years ago			`* it under the terms of the GNU General Public License version 2 as`
			`* published by the Free Software Foundation.`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
update GPL headers with new address for FSF git-svn: trunk@1901 19 years ago			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,`
			`* MA 02110-1301, USA.`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`*/`

			`#if HAVE_CONFIG_H`
			`#include "clamav-config.h"`
			`#endif`

			`#include <stdio.h>`
			`#include <string.h>`
			`#include <stdlib.h>`
Fix implicit function declaration git-svn: trunk@2569 19 years ago			`#include <sys/types.h>`
pattern matcher accuracy improvements git-svn: trunk@2505 19 years ago			`#ifdef HAVE_UNISTD_H`
			`#include <unistd.h>`
			`#endif`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
			`#include "clamav.h"`
			`#include "filetypes.h"`
Use new patter matching algorithm. Cleanup. git-svn: trunk@674 21 years ago			`#include "others.h"`
			`#include "readdb.h"`
replace AC_MIN_LENGTH with cli_matcher->ac_depth git-svn: trunk@1751 20 years ago			`#include "matcher-ac.h"`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`#include "str.h"`
re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`#include "textdet.h"`
libclamav: add default.h git-svn: trunk@4578 17 years ago			`#include "default.h"`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
add encoding and entity normalizer from Edwin (bb#145) git-svn: trunk@2577 19 years ago			`#include "htmlnorm.h"`
			`#include "entconv.h"`
a faster and more compact allocator git-svn-id: file:///var/lib/svn/clamav-devel/branches/mpool@4286 77e5149b-7576-45b1-b177-96237e5ba77b 17 years ago			`#include "mpool.h"`
add encoding and entity normalizer from Edwin (bb#145) git-svn: trunk@2577 19 years ago
improved filetype detection code git-svn: trunk@3421 18 years ago			`static const struct ftmap_s {`
			`const char *name;`
			`cli_file_t code;`
			`} ftmap[] = {`
re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`{ "CL_TYPE_TEXT_ASCII", CL_TYPE_TEXT_ASCII },`
			`{ "CL_TYPE_TEXT_UTF8", CL_TYPE_TEXT_UTF8 },`
			`{ "CL_TYPE_TEXT_UTF16LE", CL_TYPE_TEXT_UTF16LE },`
			`{ "CL_TYPE_TEXT_UTF16BE", CL_TYPE_TEXT_UTF16BE },`
			`{ "CL_TYPE_BINARY_DATA", CL_TYPE_BINARY_DATA },`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{ "CL_TYPE_IGNORED", CL_TYPE_IGNORED },`
filetype detection improvements git-svn: trunk@3662 18 years ago			`{ "CL_TYPE_ANY", 0 }, /* for ft-sigs */`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{ "CL_TYPE_MSEXE", CL_TYPE_MSEXE },`
			`{ "CL_TYPE_ELF", CL_TYPE_ELF },`
missing stuff (-a !@(&!(@&) 16 years ago			`{ "CL_TYPE_MACHO", CL_TYPE_MACHO },`
libclamav: add support for Universal Binaries (archives with Mach-O files for different architectures, bb#1592) 16 years ago			`{ "CL_TYPE_MACHO_UNIBIN", CL_TYPE_MACHO_UNIBIN },`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{ "CL_TYPE_POSIX_TAR", CL_TYPE_POSIX_TAR },`
			`{ "CL_TYPE_OLD_TAR", CL_TYPE_OLD_TAR },`
libclamav: add support for cpio archives (bb#1649) - missing changes 16 years ago			`{ "CL_TYPE_CPIO_OLD", CL_TYPE_CPIO_OLD },`
			`{ "CL_TYPE_CPIO_ODC", CL_TYPE_CPIO_ODC },`
			`{ "CL_TYPE_CPIO_NEWC", CL_TYPE_CPIO_NEWC },`
			`{ "CL_TYPE_CPIO_CRC", CL_TYPE_CPIO_CRC },`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{ "CL_TYPE_GZ", CL_TYPE_GZ },`
			`{ "CL_TYPE_ZIP", CL_TYPE_ZIP },`
			`{ "CL_TYPE_BZ", CL_TYPE_BZ },`
			`{ "CL_TYPE_RAR", CL_TYPE_RAR },`
			`{ "CL_TYPE_ARJ", CL_TYPE_ARJ },`
			`{ "CL_TYPE_MSSZDD", CL_TYPE_MSSZDD },`
			`{ "CL_TYPE_MSOLE2", CL_TYPE_MSOLE2 },`
			`{ "CL_TYPE_MSCAB", CL_TYPE_MSCAB },`
			`{ "CL_TYPE_MSCHM", CL_TYPE_MSCHM },`
			`{ "CL_TYPE_SIS", CL_TYPE_SIS },`
			`{ "CL_TYPE_SCRENC", CL_TYPE_SCRENC },`
			`{ "CL_TYPE_GRAPHICS", CL_TYPE_GRAPHICS },`
			`{ "CL_TYPE_RIFF", CL_TYPE_RIFF },`
			`{ "CL_TYPE_BINHEX", CL_TYPE_BINHEX },`
			`{ "CL_TYPE_TNEF", CL_TYPE_TNEF },`
			`{ "CL_TYPE_CRYPTFF", CL_TYPE_CRYPTFF },`
			`{ "CL_TYPE_PDF", CL_TYPE_PDF },`
			`{ "CL_TYPE_UUENCODED", CL_TYPE_UUENCODED },`
			`{ "CL_TYPE_HTML_UTF16", CL_TYPE_HTML_UTF16 },`
support for generic text normalizer (CL_TYPE_SCRIPT) git-svn: trunk@3584 18 years ago			`{ "CL_TYPE_SCRIPT", CL_TYPE_SCRIPT },`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{ "CL_TYPE_RTF", CL_TYPE_RTF },`
			`{ "CL_TYPE_HTML", CL_TYPE_HTML },`
			`{ "CL_TYPE_MAIL", CL_TYPE_MAIL },`
			`{ "CL_TYPE_SFX", CL_TYPE_SFX },`
			`{ "CL_TYPE_ZIPSFX", CL_TYPE_ZIPSFX },`
			`{ "CL_TYPE_RARSFX", CL_TYPE_RARSFX },`
			`{ "CL_TYPE_CABSFX", CL_TYPE_CABSFX },`
			`{ "CL_TYPE_ARJSFX", CL_TYPE_ARJSFX },`
			`{ "CL_TYPE_NULSFT", CL_TYPE_NULSFT },`
			`{ "CL_TYPE_AUTOIT", CL_TYPE_AUTOIT },`
ISHIELD support: - preliminary ishield-msi ftype sport 16 years ago			`{ "CL_TYPE_ISHIELD_MSI", CL_TYPE_ISHIELD_MSI },`
7z support 16 years ago			`{ "CL_TYPE_7Z", CL_TYPE_7Z },`
re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`{ NULL, CL_TYPE_IGNORED }`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`};`

improved filetype detection code git-svn: trunk@3421 18 years ago			`cli_file_t cli_ftcode(const char *name)`
			`{`
			`unsigned int i;`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
improved filetype detection code git-svn: trunk@3421 18 years ago			`for(i = 0; ftmap[i].name; i++)`
			`if(!strcmp(ftmap[i].name, name))`
			`return ftmap[i].code;`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
improved filetype detection code git-svn: trunk@3421 18 years ago			`return CL_TYPE_ERROR;`
			`}`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
some leaks fixed git-svn-id: file:///var/lib/svn/clamav-devel/branches/mpool@4280 77e5149b-7576-45b1-b177-96237e5ba77b 17 years ago			`void cli_ftfree(const struct cl_engine *engine)`
improved filetype detection code git-svn: trunk@3421 18 years ago			`{`
some leaks fixed git-svn-id: file:///var/lib/svn/clamav-devel/branches/mpool@4280 77e5149b-7576-45b1-b177-96237e5ba77b 17 years ago			`struct cli_ftype ftypes=engine->ftypes, pt;`
improved filetype detection code git-svn: trunk@3421 18 years ago
			`while(ftypes) {`
			`pt = ftypes;`
			`ftypes = ftypes->next;`
libclamav: use LibTomMath by Tom St Denis instead of libgmp for multiple precision integer arithmetic (bb#1366) git-svn: trunk@4650 17 years ago			`mpool_free(engine->mempool, pt->magic);`
			`mpool_free(engine->mempool, pt->tname);`
			`mpool_free(engine->mempool, pt);`
improved filetype detection code git-svn: trunk@3421 18 years ago			`}`
			`}`
try to detect international text data git-svn: trunk@1448 20 years ago
improved filetype detection code git-svn: trunk@3421 18 years ago			`cli_file_t cli_filetype(const unsigned char buf, size_t buflen, const struct cl_engine engine)`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`{`
improved filetype detection code git-svn: trunk@3421 18 years ago			`struct cli_ftype *ftype = engine->ftypes;`
only enable signature file type recognition for text files git-svn: trunk@788 21 years ago
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago
improved filetype detection code git-svn: trunk@3421 18 years ago			`while(ftype) {`
			`if(ftype->offset + ftype->length <= buflen) {`
			`if(!memcmp(buf + ftype->offset, ftype->magic, ftype->length)) {`
			`cli_dbgmsg("Recognized %s file\n", ftype->tname);`
			`return ftype->type;`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`}`
			`}`
improved filetype detection code git-svn: trunk@3421 18 years ago			`ftype = ftype->next;`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`}`

re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`return cli_texttype(buf, buflen);`
new method of file type detection; HTML normalisation git-svn: trunk@648 21 years ago			`}`

minor code cleanup git-svn: trunk@2031 19 years ago			`int is_tar(unsigned char *buf, unsigned int nbytes);`
add support for old fashioned tar archives git-svn: trunk@1421 20 years ago
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)`
add support for old fashioned tar archives git-svn: trunk@1421 20 years ago			`{`
improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`unsigned char buff[MAGIC_BUFFER_SIZE + 1], *decoded;`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`int bread, sret;`
re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`cli_file_t ret = CL_TYPE_BINARY_DATA;`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`struct cli_matcher *root;`
pattern matcher accuracy improvements git-svn: trunk@2505 19 years ago			`struct cli_ac_data mdata;`
add support for old fashioned tar archives git-svn: trunk@1421 20 years ago

re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`if(!engine) {`
			`cli_errmsg("cli_filetype2: engine == NULL\n");`
			`return CL_TYPE_ERROR;`
			`}`

improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`memset(buff, 0, sizeof(buff));`
			`bread = cli_readn(desc, buff, MAGIC_BUFFER_SIZE);`
improve I/O error handling in cli_filetype2 (bb#818) git-svn: trunk@3605 18 years ago			`if(bread == -1)`
			`return CL_TYPE_ERROR;`
improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`buff[bread] = 0;`
improve I/O error handling in cli_filetype2 (bb#818) git-svn: trunk@3605 18 years ago
improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`ret = cli_filetype(buff, bread, engine);`
add support for old fashioned tar archives git-svn: trunk@1421 20 years ago
re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {`
			`/* HTML files may contain special characters and could be`
			`* misidentified as BINARY_DATA by cli_filetype()`
			`*/`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`root = engine->root[0];`
			`if(!root)`
			`return ret;`

libclamav: add default.h git-svn: trunk@4578 17 years ago			`if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`return ret;`

libclamav: improve handling of signature offsets 16 years ago			`sret = cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, engine->root[0], &mdata, 0, ret, NULL, AC_SCAN_FT, NULL);`
pattern matcher accuracy improvements git-svn: trunk@2505 19 years ago
			`cli_ac_freedata(&mdata);`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago
			`if(sret >= CL_TYPENO) {`
			`ret = sret;`
			`} else {`
libclamav: add default.h git-svn: trunk@4578 17 years ago			`if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))`
pattern matcher accuracy improvements git-svn: trunk@2505 19 years ago			`return ret;`

improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`decoded = (unsigned char ) cli_utf16toascii((char ) buff, bread);`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`if(decoded) {`
libclamav: improve handling of signature offsets 16 years ago			`sret = cli_ac_scanbuff(decoded, strlen((char *) decoded), NULL, NULL, NULL, engine->root[0], &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL);`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`free(decoded);`
			`if(sret == CL_TYPE_HTML)`
			`ret = CL_TYPE_HTML_UTF16;`
			`}`
pattern matcher accuracy improvements git-svn: trunk@2505 19 years ago			`cli_ac_freedata(&mdata);`
add encoding and entity normalizer from Edwin (bb#145) git-svn: trunk@2577 19 years ago
prepare for enabling phishing code in non-experimental builds git-svn: trunk@3042 18 years ago			`if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {`
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`const char* encoding;`

			`/* check if we can autodetect this encoding.`
			`* If we can't don't try to detect HTML sig, since`
			`* we just tried that above, and failed */`
improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`if((encoding = encoding_detect_bom(buff, bread))) {`
			`unsigned char decodedbuff[sizeof(buff)*2];`
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`m_area_t in_area, out_area;`

improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`in_area.buffer = (unsigned char *) buff;`
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`in_area.length = bread;`
			`in_area.offset = 0;`
			`out_area.buffer = decodedbuff;`
			`out_area.length = sizeof(decodedbuff);`
			`out_area.offset = 0;`

			`/* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode`
			`* (multibyte characters will not be exactly handled, but that is not a problem).`
			`* However when detecting whether a file is HTML or not, we need exact conversion.`
			`* (just eliminating zeros and matching would introduce false positives */`
			`if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {`
libclamav: add default.h git-svn: trunk@4578 17 years ago			`if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, CLI_DEFAULT_AC_TRACKLEN))`
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`return ret;`

			`if(out_area.length > 0) {`
libclamav: improve handling of signature offsets 16 years ago			`sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, engine->root[0], &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */`
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`if(sret == CL_TYPE_HTML) {`
			`cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");`
			`/* htmlnorm is able to handle any unicode now, since it skips null chars */`
			`ret = CL_TYPE_HTML;`
			`}`
AC_TRY_LINK already adds a main(), remove duplicate main() entconv improvements to improve security and performance Part I for (bb #686, #386) TODO: * optimize entity_norm * create testfiles for unicode encoding variants * create a regression test * check for memory leaks git-svn: trunk@3511 18 years ago			`}`
add encoding and entity normalizer from Edwin (bb#145) git-svn: trunk@2577 19 years ago
use entconv to detect UTF-16BE, and UCS-4 variants use only cli_readline() we don't need exact conversion drop unused functions, simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() git-svn: trunk@3571 18 years ago			`cli_ac_freedata(&mdata);`
			`}`
AC_TRY_LINK already adds a main(), remove duplicate main() entconv improvements to improve security and performance Part I for (bb #686, #386) TODO: * optimize entity_norm * create testfiles for unicode encoding variants * create a regression test * check for memory leaks git-svn: trunk@3511 18 years ago			`}`
add encoding and entity normalizer from Edwin (bb#145) git-svn: trunk@2577 19 years ago			`}`
add support for UTF16 encoded HTML files git-svn: trunk@2430 19 years ago			`}`
			`}`

re-enable text detection (ASCII, UTF8, UTF16) git-svn: trunk@3486 18 years ago			`if(ret == CL_TYPE_BINARY_DATA) {`
improve handling of PDF, CAB, RTF, OLE2 and HTML files (sync with branch/0.93) git-svn: trunk@3862 17 years ago			`switch(is_tar(buff, bread)) {`
			`case 1:`
			`ret = CL_TYPE_OLD_TAR;`
			`cli_dbgmsg("Recognized old fashioned tar file\n");`
			`break;`
			`case 2:`
			`ret = CL_TYPE_POSIX_TAR;`
			`cli_dbgmsg("Recognized POSIX tar file\n");`
			`break;`
add support for old fashioned tar archives git-svn: trunk@1421 20 years ago			`}`
			`}`

			`return ret;`
			`}`